blob: 4df9fd8f06ae1f7c5e9b84cc7e6b2f6a4f8f1577 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Eric Smitha9f7d622008-02-17 19:46:49 +000045#include "formatter_unicode.h"
46
Guido van Rossumd57fd912000-03-10 22:53:23 +000047#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
Christian Heimes5b970ad2008-02-06 13:33:44 +000056#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Christian Heimes5b970ad2008-02-06 13:33:44 +000064 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Anthony Baxterac6bd462006-04-13 02:06:09 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Guido van Rossumd57fd912000-03-10 22:53:23 +000097/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000098static PyUnicodeObject *free_list;
99static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000101/* The empty Unicode object is shared to improve performance. */
102static PyUnicodeObject *unicode_empty;
103
104/* Single character Unicode strings in the Latin-1 range are being
105 shared as well. */
106static PyUnicodeObject *unicode_latin1[256];
107
Fred Drakee4315f52000-05-09 19:53:39 +0000108/* Default encoding to use and assume when NULL is passed as encoding
109 parameter; it is initialized by _PyUnicode_Init().
110
111 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000112 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000113
114*/
Fred Drakee4315f52000-05-09 19:53:39 +0000115static char unicode_default_encoding[100];
116
/* Fast detection of the most frequent whitespace characters.
   Lookup table for the ASCII range: entry c is 1 when code point c is
   flagged as whitespace here and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
       0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
147
/* Same for linebreaks: entry c is 1 when ASCII code point c is flagged
   as a line break here and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
       0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000179 return 0x10FFFF;
180#else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
/* Integer type that holds a bloom mask; only the low 32 bits are used. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shift is done on an unsigned long so that bit 31 can be
   tested without overflowing a signed int, and `mask` is parenthesized
   so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when `ch` is a line break: characters below 128 are looked up
   in ascii_linebreak, everything else is pre-filtered through
   bloom_linebreak before calling Py_UNICODE_ISLINEBREAK.
   Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000205Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000206{
207 /* calculate simple bloom-style bitmask for a given unicode string */
208
209 long mask;
210 Py_ssize_t i;
211
212 mask = 0;
213 for (i = 0; i < len; i++)
214 mask |= (1 << (ptr[i] & 0x1F));
215
216 return mask;
217}
218
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000219Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000220{
221 Py_ssize_t i;
222
223 for (i = 0; i < setlen; i++)
224 if (set[i] == chr)
225 return 1;
226
Fredrik Lundh77633512006-05-23 19:47:35 +0000227 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228}
229
/* Non-zero when `chr` is one of the `setlen` characters in `set`; `mask`
   is consulted first via BLOOM() so that unicode_member() is only called
   for characters whose bit is set.  The whole expansion is parenthesized
   so the macro can be negated or combined with other operators.
   Note that `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233/* --- Unicode Object ----------------------------------------------------- */
234
235static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000236int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000240
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245 /* Resizing shared object (unicode_empty or single character
246 objects) in-place is not allowed. Use PyUnicode_Resize()
247 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000248
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000249 if (unicode == unicode_empty ||
250 (unicode->length == 1 &&
251 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 return -1;
256 }
257
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000258 /* We allocate one more byte to make sure the string is Ux0000 terminated.
259 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000260 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000261 it contains). */
262
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000264 unicode->str = PyObject_REALLOC(unicode->str,
265 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000267 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 PyErr_NoMemory();
269 return -1;
270 }
271 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000272 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000274 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 if (unicode->defenc) {
277 Py_DECREF(unicode->defenc);
278 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000281
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 return 0;
283}
284
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

292
293static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000294PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295{
296 register PyUnicodeObject *unicode;
297
Andrew Dalkee0df7622006-05-27 11:04:36 +0000298 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 if (length == 0 && unicode_empty != NULL) {
300 Py_INCREF(unicode_empty);
301 return unicode_empty;
302 }
303
304 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000305 if (free_list) {
306 unicode = free_list;
307 free_list = *(PyUnicodeObject **)unicode;
308 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000310 /* Keep-Alive optimization: we only upsize the buffer,
311 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000312 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000313 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000314 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000318 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000319 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
320 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 }
322 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 }
324 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000325 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000326 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 if (unicode == NULL)
328 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000329 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000333 if (!unicode->str) {
334 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000335 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000337 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000338 * the caller fails before initializing str -- unicode_resize()
339 * reads str[0], and the Keep-Alive optimization can keep memory
340 * allocated for str alive across a call to unicode_dealloc(unicode).
341 * We don't want unicode_resize to read uninitialized memory in
342 * that case.
343 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000344 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000346 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350
351 onError:
352 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000353 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355}
356
357static
Guido van Rossum9475a232001-10-05 20:51:39 +0000358void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000360 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000361 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000362 /* Keep-Alive optimization */
363 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000364 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 unicode->str = NULL;
366 unicode->length = 0;
367 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000368 if (unicode->defenc) {
369 Py_DECREF(unicode->defenc);
370 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000371 }
372 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000373 *(PyUnicodeObject **)unicode = free_list;
374 free_list = unicode;
375 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 }
377 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000378 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000380 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382}
383
Martin v. Löwis18e16552006-02-15 17:27:45 +0000384int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000385{
386 register PyUnicodeObject *v;
387
388 /* Argument checks */
389 if (unicode == NULL) {
390 PyErr_BadInternalCall();
391 return -1;
392 }
393 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000394 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 PyErr_BadInternalCall();
396 return -1;
397 }
398
399 /* Resizing unicode_empty and single character objects is not
400 possible since these are being shared. We simply return a fresh
401 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000402 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 (v == unicode_empty || v->length == 1)) {
404 PyUnicodeObject *w = _PyUnicode_New(length);
405 if (w == NULL)
406 return -1;
407 Py_UNICODE_COPY(w->str, v->str,
408 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000409 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000410 *unicode = (PyObject *)w;
411 return 0;
412 }
413
414 /* Note that we don't have to modify *unicode for unshared Unicode
415 objects, since we can modify them in-place. */
416 return unicode_resize(v, length);
417}
418
419/* Internal API for use in unicodeobject.c only ! */
420#define _PyUnicode_Resize(unicodevar, length) \
421 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000424 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425{
426 PyUnicodeObject *unicode;
427
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000428 /* If the Unicode data is known at construction time, we can apply
429 some optimizations which share commonly used objects. */
430 if (u != NULL) {
431
432 /* Optimization for empty strings */
433 if (size == 0 && unicode_empty != NULL) {
434 Py_INCREF(unicode_empty);
435 return (PyObject *)unicode_empty;
436 }
437
438 /* Single character Unicode objects in the Latin-1 range are
439 shared when using this constructor */
440 if (size == 1 && *u < 256) {
441 unicode = unicode_latin1[*u];
442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 if (!unicode)
445 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000446 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 unicode_latin1[*u] = unicode;
448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
Tim Petersced69f82003-09-16 20:30:58 +0000453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 unicode = _PyUnicode_New(size);
455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461
462 return (PyObject *)unicode;
463}
464
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000465PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
466{
467 PyUnicodeObject *unicode;
468 /* If the Unicode data is known at construction time, we can apply
469 some optimizations which share commonly used objects.
470 Also, this means the input must be UTF-8, so fall back to the
471 UTF-8 decoder at the end. */
472 if (u != NULL) {
473
474 /* Optimization for empty strings */
475 if (size == 0 && unicode_empty != NULL) {
476 Py_INCREF(unicode_empty);
477 return (PyObject *)unicode_empty;
478 }
479
480 /* Single characters are shared when using this constructor.
481 Restrict to ASCII, since the input must be UTF-8. */
482 if (size == 1 && Py_CHARMASK(*u) < 128) {
483 unicode = unicode_latin1[Py_CHARMASK(*u)];
484 if (!unicode) {
485 unicode = _PyUnicode_New(1);
486 if (!unicode)
487 return NULL;
488 unicode->str[0] = Py_CHARMASK(*u);
489 unicode_latin1[Py_CHARMASK(*u)] = unicode;
490 }
491 Py_INCREF(unicode);
492 return (PyObject *)unicode;
493 }
494
495 return PyUnicode_DecodeUTF8(u, size, NULL);
496 }
497
498 unicode = _PyUnicode_New(size);
499 if (!unicode)
500 return NULL;
501
502 return (PyObject *)unicode;
503}
504
505PyObject *PyUnicode_FromString(const char *u)
506{
507 size_t size = strlen(u);
508 if (size > PY_SSIZE_T_MAX) {
509 PyErr_SetString(PyExc_OverflowError, "input too long");
510 return NULL;
511 }
512
513 return PyUnicode_FromStringAndSize(u, size);
514}
515
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516#ifdef HAVE_WCHAR_H
517
518PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000519 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520{
521 PyUnicodeObject *unicode;
522
523 if (w == NULL) {
524 PyErr_BadInternalCall();
525 return NULL;
526 }
527
528 unicode = _PyUnicode_New(size);
529 if (!unicode)
530 return NULL;
531
532 /* Copy the wchar_t data into the new object */
533#ifdef HAVE_USABLE_WCHAR_T
534 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000535#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000536 {
537 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000538 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000539 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000540 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541 *u++ = *w++;
542 }
543#endif
544
545 return (PyObject *)unicode;
546}
547
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000548static void
549makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
550{
551 *fmt++ = '%';
552 if (width) {
553 if (zeropad)
554 *fmt++ = '0';
555 fmt += sprintf(fmt, "%d", width);
556 }
557 if (precision)
558 fmt += sprintf(fmt, ".%d", precision);
559 if (longflag)
560 *fmt++ = 'l';
561 else if (size_tflag) {
562 char *f = PY_FORMAT_SIZE_T;
563 while (*f)
564 *fmt++ = *f++;
565 }
566 *fmt++ = c;
567 *fmt = '\0';
568}
569
570#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
571
572PyObject *
573PyUnicode_FromFormatV(const char *format, va_list vargs)
574{
575 va_list count;
576 Py_ssize_t callcount = 0;
577 PyObject **callresults = NULL;
578 PyObject **callresult = NULL;
579 Py_ssize_t n = 0;
580 int width = 0;
581 int precision = 0;
582 int zeropad;
583 const char* f;
584 Py_UNICODE *s;
585 PyObject *string;
586 /* used by sprintf */
587 char buffer[21];
588 /* use abuffer instead of buffer, if we need more space
589 * (which can happen if there's a format specifier with width). */
590 char *abuffer = NULL;
591 char *realbuffer;
592 Py_ssize_t abuffersize = 0;
593 char fmt[60]; /* should be enough for %0width.precisionld */
594 const char *copy;
595
596#ifdef VA_LIST_IS_ARRAY
597 Py_MEMCPY(count, vargs, sizeof(va_list));
598#else
599#ifdef __va_copy
600 __va_copy(count, vargs);
601#else
602 count = vargs;
603#endif
604#endif
605 /* step 1: count the number of %S/%R format specifications
606 * (we call PyObject_Str()/PyObject_Repr() for these objects
607 * once during step 3 and put the result in an array) */
608 for (f = format; *f; f++) {
609 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
610 ++callcount;
611 }
612 /* step 2: allocate memory for the results of
613 * PyObject_Str()/PyObject_Repr() calls */
614 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000615 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000616 if (!callresults) {
617 PyErr_NoMemory();
618 return NULL;
619 }
620 callresult = callresults;
621 }
622 /* step 3: figure out how large a buffer we need */
623 for (f = format; *f; f++) {
624 if (*f == '%') {
625 const char* p = f;
626 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000627 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000628 width = (width*10) + *f++ - '0';
Neal Norwitzade57d02008-03-23 06:19:57 +0000629 while (*++f && *f != '%' && !isalpha((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000630 ;
631
632 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
633 * they don't affect the amount of space we reserve.
634 */
635 if ((*f == 'l' || *f == 'z') &&
636 (f[1] == 'd' || f[1] == 'u'))
637 ++f;
638
639 switch (*f) {
640 case 'c':
641 (void)va_arg(count, int);
642 /* fall through... */
643 case '%':
644 n++;
645 break;
646 case 'd': case 'u': case 'i': case 'x':
647 (void) va_arg(count, int);
648 /* 20 bytes is enough to hold a 64-bit
649 integer. Decimal takes the most space.
650 This isn't enough for octal.
651 If a width is specified we need more
652 (which we allocate later). */
653 if (width < 20)
654 width = 20;
655 n += width;
656 if (abuffersize < width)
657 abuffersize = width;
658 break;
659 case 's':
660 {
661 /* UTF-8 */
662 unsigned char*s;
663 s = va_arg(count, unsigned char*);
664 while (*s) {
665 if (*s < 128) {
666 n++; s++;
667 } else if (*s < 0xc0) {
668 /* invalid UTF-8 */
669 n++; s++;
670 } else if (*s < 0xc0) {
671 n++;
672 s++; if(!*s)break;
673 s++;
674 } else if (*s < 0xe0) {
675 n++;
676 s++; if(!*s)break;
677 s++; if(!*s)break;
678 s++;
679 } else {
680 #ifdef Py_UNICODE_WIDE
681 n++;
682 #else
683 n+=2;
684 #endif
685 s++; if(!*s)break;
686 s++; if(!*s)break;
687 s++; if(!*s)break;
688 s++;
689 }
690 }
691 break;
692 }
693 case 'U':
694 {
695 PyObject *obj = va_arg(count, PyObject *);
696 assert(obj && PyUnicode_Check(obj));
697 n += PyUnicode_GET_SIZE(obj);
698 break;
699 }
700 case 'V':
701 {
702 PyObject *obj = va_arg(count, PyObject *);
703 const char *str = va_arg(count, const char *);
704 assert(obj || str);
705 assert(!obj || PyUnicode_Check(obj));
706 if (obj)
707 n += PyUnicode_GET_SIZE(obj);
708 else
709 n += strlen(str);
710 break;
711 }
712 case 'S':
713 {
714 PyObject *obj = va_arg(count, PyObject *);
715 PyObject *str;
716 assert(obj);
717 str = PyObject_Str(obj);
718 if (!str)
719 goto fail;
720 n += PyUnicode_GET_SIZE(str);
721 /* Remember the str and switch to the next slot */
722 *callresult++ = str;
723 break;
724 }
725 case 'R':
726 {
727 PyObject *obj = va_arg(count, PyObject *);
728 PyObject *repr;
729 assert(obj);
730 repr = PyObject_Repr(obj);
731 if (!repr)
732 goto fail;
733 n += PyUnicode_GET_SIZE(repr);
734 /* Remember the repr and switch to the next slot */
735 *callresult++ = repr;
736 break;
737 }
738 case 'p':
739 (void) va_arg(count, int);
740 /* maximum 64-bit pointer representation:
741 * 0xffffffffffffffff
742 * so 19 characters is enough.
743 * XXX I count 18 -- what's the extra for?
744 */
745 n += 19;
746 break;
747 default:
748 /* if we stumble upon an unknown
749 formatting code, copy the rest of
750 the format string to the output
751 string. (we cannot just skip the
752 code, since there's no way to know
753 what's in the argument list) */
754 n += strlen(p);
755 goto expand;
756 }
757 } else
758 n++;
759 }
760 expand:
761 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000762 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000763 if (!abuffer) {
764 PyErr_NoMemory();
765 goto fail;
766 }
767 realbuffer = abuffer;
768 }
769 else
770 realbuffer = buffer;
771 /* step 4: fill the buffer */
772 /* Since we've analyzed how much space we need for the worst case,
773 we don't have to resize the string.
774 There can be no errors beyond this point. */
775 string = PyUnicode_FromUnicode(NULL, n);
776 if (!string)
777 goto fail;
778
779 s = PyUnicode_AS_UNICODE(string);
780 callresult = callresults;
781
782 for (f = format; *f; f++) {
783 if (*f == '%') {
784 const char* p = f++;
785 int longflag = 0;
786 int size_tflag = 0;
787 zeropad = (*f == '0');
788 /* parse the width.precision part */
789 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000790 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000791 width = (width*10) + *f++ - '0';
792 precision = 0;
793 if (*f == '.') {
794 f++;
Neal Norwitzade57d02008-03-23 06:19:57 +0000795 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000796 precision = (precision*10) + *f++ - '0';
797 }
798 /* handle the long flag, but only for %ld and %lu.
799 others can be added when necessary. */
800 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
801 longflag = 1;
802 ++f;
803 }
804 /* handle the size_t flag. */
805 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
806 size_tflag = 1;
807 ++f;
808 }
809
810 switch (*f) {
811 case 'c':
812 *s++ = va_arg(vargs, int);
813 break;
814 case 'd':
815 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
816 if (longflag)
817 sprintf(realbuffer, fmt, va_arg(vargs, long));
818 else if (size_tflag)
819 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
820 else
821 sprintf(realbuffer, fmt, va_arg(vargs, int));
822 appendstring(realbuffer);
823 break;
824 case 'u':
825 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
826 if (longflag)
827 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
828 else if (size_tflag)
829 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
830 else
831 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
832 appendstring(realbuffer);
833 break;
834 case 'i':
835 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
836 sprintf(realbuffer, fmt, va_arg(vargs, int));
837 appendstring(realbuffer);
838 break;
839 case 'x':
840 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
841 sprintf(realbuffer, fmt, va_arg(vargs, int));
842 appendstring(realbuffer);
843 break;
844 case 's':
845 {
846 /* Parameter must be UTF-8 encoded.
847 In case of encoding errors, use
848 the replacement character. */
849 PyObject *u;
850 p = va_arg(vargs, char*);
851 u = PyUnicode_DecodeUTF8(p, strlen(p),
852 "replace");
853 if (!u)
854 goto fail;
855 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
856 PyUnicode_GET_SIZE(u));
857 s += PyUnicode_GET_SIZE(u);
858 Py_DECREF(u);
859 break;
860 }
861 case 'U':
862 {
863 PyObject *obj = va_arg(vargs, PyObject *);
864 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
865 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
866 s += size;
867 break;
868 }
869 case 'V':
870 {
871 PyObject *obj = va_arg(vargs, PyObject *);
872 const char *str = va_arg(vargs, const char *);
873 if (obj) {
874 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
875 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
876 s += size;
877 } else {
878 appendstring(str);
879 }
880 break;
881 }
882 case 'S':
883 case 'R':
884 {
885 Py_UNICODE *ucopy;
886 Py_ssize_t usize;
887 Py_ssize_t upos;
888 /* unused, since we already have the result */
889 (void) va_arg(vargs, PyObject *);
890 ucopy = PyUnicode_AS_UNICODE(*callresult);
891 usize = PyUnicode_GET_SIZE(*callresult);
892 for (upos = 0; upos<usize;)
893 *s++ = ucopy[upos++];
894 /* We're done with the unicode()/repr() => forget it */
895 Py_DECREF(*callresult);
896 /* switch to next unicode()/repr() result */
897 ++callresult;
898 break;
899 }
900 case 'p':
901 sprintf(buffer, "%p", va_arg(vargs, void*));
902 /* %p is ill-defined: ensure leading 0x. */
903 if (buffer[1] == 'X')
904 buffer[1] = 'x';
905 else if (buffer[1] != 'x') {
906 memmove(buffer+2, buffer, strlen(buffer)+1);
907 buffer[0] = '0';
908 buffer[1] = 'x';
909 }
910 appendstring(buffer);
911 break;
912 case '%':
913 *s++ = '%';
914 break;
915 default:
916 appendstring(p);
917 goto end;
918 }
919 } else
920 *s++ = *f;
921 }
922
923 end:
924 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000925 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000926 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000927 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000928 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
929 return string;
930 fail:
931 if (callresults) {
932 PyObject **callresult2 = callresults;
933 while (callresult2 < callresult) {
934 Py_DECREF(*callresult2);
935 ++callresult2;
936 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000937 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000938 }
939 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000940 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000941 return NULL;
942}
943
944#undef appendstring
945
946PyObject *
947PyUnicode_FromFormat(const char *format, ...)
948{
949 PyObject* ret;
950 va_list vargs;
951
952#ifdef HAVE_STDARG_PROTOTYPES
953 va_start(vargs, format);
954#else
955 va_start(vargs);
956#endif
957 ret = PyUnicode_FromFormatV(format, vargs);
958 va_end(vargs);
959 return ret;
960}
961
Martin v. Löwis18e16552006-02-15 17:27:45 +0000962Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
963 wchar_t *w,
964 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965{
966 if (unicode == NULL) {
967 PyErr_BadInternalCall();
968 return -1;
969 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000970
971 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000972 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000973 size = PyUnicode_GET_SIZE(unicode) + 1;
974
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975#ifdef HAVE_USABLE_WCHAR_T
976 memcpy(w, unicode->str, size * sizeof(wchar_t));
977#else
978 {
979 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000980 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000981 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000982 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000983 *w++ = *u++;
984 }
985#endif
986
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000987 if (size > PyUnicode_GET_SIZE(unicode))
988 return PyUnicode_GET_SIZE(unicode);
989 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990 return size;
991}
992
993#endif
994
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000995PyObject *PyUnicode_FromOrdinal(int ordinal)
996{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000997 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000998
999#ifdef Py_UNICODE_WIDE
1000 if (ordinal < 0 || ordinal > 0x10ffff) {
1001 PyErr_SetString(PyExc_ValueError,
1002 "unichr() arg not in range(0x110000) "
1003 "(wide Python build)");
1004 return NULL;
1005 }
1006#else
1007 if (ordinal < 0 || ordinal > 0xffff) {
1008 PyErr_SetString(PyExc_ValueError,
1009 "unichr() arg not in range(0x10000) "
1010 "(narrow Python build)");
1011 return NULL;
1012 }
1013#endif
1014
Hye-Shik Chang40574832004-04-06 07:24:51 +00001015 s[0] = (Py_UNICODE)ordinal;
1016 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001017}
1018
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019PyObject *PyUnicode_FromObject(register PyObject *obj)
1020{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001021 /* XXX Perhaps we should make this API an alias of
1022 PyObject_Unicode() instead ?! */
1023 if (PyUnicode_CheckExact(obj)) {
1024 Py_INCREF(obj);
1025 return obj;
1026 }
1027 if (PyUnicode_Check(obj)) {
1028 /* For a Unicode subtype that's not a Unicode object,
1029 return a true Unicode object with the same data. */
1030 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1031 PyUnicode_GET_SIZE(obj));
1032 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001033 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1034}
1035
1036PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1037 const char *encoding,
1038 const char *errors)
1039{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001040 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001041 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001043
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044 if (obj == NULL) {
1045 PyErr_BadInternalCall();
1046 return NULL;
1047 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001048
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001049#if 0
1050 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001051 that no encodings is given and then redirect to
1052 PyObject_Unicode() which then applies the additional logic for
1053 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001054
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001055 NOTE: This API should really only be used for object which
1056 represent *encoded* Unicode !
1057
1058 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001059 if (PyUnicode_Check(obj)) {
1060 if (encoding) {
1061 PyErr_SetString(PyExc_TypeError,
1062 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001064 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001065 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001066 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001067#else
1068 if (PyUnicode_Check(obj)) {
1069 PyErr_SetString(PyExc_TypeError,
1070 "decoding Unicode is not supported");
1071 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001072 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001073#endif
1074
1075 /* Coerce object */
1076 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001077 s = PyString_AS_STRING(obj);
1078 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001079 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001080 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1081 /* Overwrite the error message with something more useful in
1082 case of a TypeError. */
1083 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001084 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001085 "coercing to Unicode: need string or buffer, "
1086 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001087 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001088 goto onError;
1089 }
Tim Petersced69f82003-09-16 20:30:58 +00001090
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001091 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 if (len == 0) {
1093 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001094 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 }
Tim Petersced69f82003-09-16 20:30:58 +00001096 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001097 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001098
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001099 return v;
1100
1101 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103}
1104
1105PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001106 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107 const char *encoding,
1108 const char *errors)
1109{
1110 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001111
1112 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001113 encoding = PyUnicode_GetDefaultEncoding();
1114
1115 /* Shortcuts for common default encodings */
1116 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001118 else if (strcmp(encoding, "latin-1") == 0)
1119 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001120#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1121 else if (strcmp(encoding, "mbcs") == 0)
1122 return PyUnicode_DecodeMBCS(s, size, errors);
1123#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001124 else if (strcmp(encoding, "ascii") == 0)
1125 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126
1127 /* Decode via the codec registry */
1128 buffer = PyBuffer_FromMemory((void *)s, size);
1129 if (buffer == NULL)
1130 goto onError;
1131 unicode = PyCodec_Decode(buffer, encoding, errors);
1132 if (unicode == NULL)
1133 goto onError;
1134 if (!PyUnicode_Check(unicode)) {
1135 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001136 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001137 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 Py_DECREF(unicode);
1139 goto onError;
1140 }
1141 Py_DECREF(buffer);
1142 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001143
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 onError:
1145 Py_XDECREF(buffer);
1146 return NULL;
1147}
1148
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001149PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1150 const char *encoding,
1151 const char *errors)
1152{
1153 PyObject *v;
1154
1155 if (!PyUnicode_Check(unicode)) {
1156 PyErr_BadArgument();
1157 goto onError;
1158 }
1159
1160 if (encoding == NULL)
1161 encoding = PyUnicode_GetDefaultEncoding();
1162
1163 /* Decode via the codec registry */
1164 v = PyCodec_Decode(unicode, encoding, errors);
1165 if (v == NULL)
1166 goto onError;
1167 return v;
1168
1169 onError:
1170 return NULL;
1171}
1172
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001174 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 const char *encoding,
1176 const char *errors)
1177{
1178 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 unicode = PyUnicode_FromUnicode(s, size);
1181 if (unicode == NULL)
1182 return NULL;
1183 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1184 Py_DECREF(unicode);
1185 return v;
1186}
1187
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001188PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1189 const char *encoding,
1190 const char *errors)
1191{
1192 PyObject *v;
1193
1194 if (!PyUnicode_Check(unicode)) {
1195 PyErr_BadArgument();
1196 goto onError;
1197 }
1198
1199 if (encoding == NULL)
1200 encoding = PyUnicode_GetDefaultEncoding();
1201
1202 /* Encode via the codec registry */
1203 v = PyCodec_Encode(unicode, encoding, errors);
1204 if (v == NULL)
1205 goto onError;
1206 return v;
1207
1208 onError:
1209 return NULL;
1210}
1211
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1213 const char *encoding,
1214 const char *errors)
1215{
1216 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 if (!PyUnicode_Check(unicode)) {
1219 PyErr_BadArgument();
1220 goto onError;
1221 }
Fred Drakee4315f52000-05-09 19:53:39 +00001222
Tim Petersced69f82003-09-16 20:30:58 +00001223 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001224 encoding = PyUnicode_GetDefaultEncoding();
1225
1226 /* Shortcuts for common default encodings */
1227 if (errors == NULL) {
1228 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001229 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001230 else if (strcmp(encoding, "latin-1") == 0)
1231 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001232#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1233 else if (strcmp(encoding, "mbcs") == 0)
1234 return PyUnicode_AsMBCSString(unicode);
1235#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001236 else if (strcmp(encoding, "ascii") == 0)
1237 return PyUnicode_AsASCIIString(unicode);
1238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239
1240 /* Encode via the codec registry */
1241 v = PyCodec_Encode(unicode, encoding, errors);
1242 if (v == NULL)
1243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 if (!PyString_Check(v)) {
1245 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001246 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001247 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 Py_DECREF(v);
1249 goto onError;
1250 }
1251 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001252
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 onError:
1254 return NULL;
1255}
1256
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001257PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1258 const char *errors)
1259{
1260 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1261
1262 if (v)
1263 return v;
1264 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1265 if (v && errors == NULL)
1266 ((PyUnicodeObject *)unicode)->defenc = v;
1267 return v;
1268}
1269
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1271{
1272 if (!PyUnicode_Check(unicode)) {
1273 PyErr_BadArgument();
1274 goto onError;
1275 }
1276 return PyUnicode_AS_UNICODE(unicode);
1277
1278 onError:
1279 return NULL;
1280}
1281
Martin v. Löwis18e16552006-02-15 17:27:45 +00001282Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283{
1284 if (!PyUnicode_Check(unicode)) {
1285 PyErr_BadArgument();
1286 goto onError;
1287 }
1288 return PyUnicode_GET_SIZE(unicode);
1289
1290 onError:
1291 return -1;
1292}
1293
Thomas Wouters78890102000-07-22 19:25:51 +00001294const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001295{
1296 return unicode_default_encoding;
1297}
1298
1299int PyUnicode_SetDefaultEncoding(const char *encoding)
1300{
1301 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001302
Fred Drakee4315f52000-05-09 19:53:39 +00001303 /* Make sure the encoding is valid. As side effect, this also
1304 loads the encoding into the codec registry cache. */
1305 v = _PyCodec_Lookup(encoding);
1306 if (v == NULL)
1307 goto onError;
1308 Py_DECREF(v);
1309 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001310 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001311 sizeof(unicode_default_encoding));
1312 return 0;
1313
1314 onError:
1315 return -1;
1316}
1317
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318/* error handling callback helper:
1319 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001320 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001321 and adjust various state variables.
1322 return 0 on success, -1 on error
1323*/
1324
1325static
1326int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1327 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001328 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1329 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001330 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001332 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333
1334 PyObject *restuple = NULL;
1335 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001336 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1337 Py_ssize_t requiredsize;
1338 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001340 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341 int res = -1;
1342
1343 if (*errorHandler == NULL) {
1344 *errorHandler = PyCodec_LookupError(errors);
1345 if (*errorHandler == NULL)
1346 goto onError;
1347 }
1348
1349 if (*exceptionObject == NULL) {
1350 *exceptionObject = PyUnicodeDecodeError_Create(
1351 encoding, input, insize, *startinpos, *endinpos, reason);
1352 if (*exceptionObject == NULL)
1353 goto onError;
1354 }
1355 else {
1356 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1357 goto onError;
1358 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1359 goto onError;
1360 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1361 goto onError;
1362 }
1363
1364 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1365 if (restuple == NULL)
1366 goto onError;
1367 if (!PyTuple_Check(restuple)) {
1368 PyErr_Format(PyExc_TypeError, &argparse[4]);
1369 goto onError;
1370 }
1371 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1372 goto onError;
1373 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001374 newpos = insize+newpos;
1375 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001376 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001377 goto onError;
1378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001379
1380 /* need more space? (at least enough for what we
1381 have+the replacement+the rest of the string (starting
1382 at the new input position), so we won't have to check space
1383 when there are no errors in the rest of the string) */
1384 repptr = PyUnicode_AS_UNICODE(repunicode);
1385 repsize = PyUnicode_GET_SIZE(repunicode);
1386 requiredsize = *outpos + repsize + insize-newpos;
1387 if (requiredsize > outsize) {
1388 if (requiredsize<2*outsize)
1389 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001390 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391 goto onError;
1392 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1393 }
1394 *endinpos = newpos;
1395 *inptr = input + newpos;
1396 Py_UNICODE_COPY(*outptr, repptr, repsize);
1397 *outptr += repsize;
1398 *outpos += repsize;
1399 /* we made it! */
1400 res = 0;
1401
1402 onError:
1403 Py_XDECREF(restuple);
1404 return res;
1405}
1406
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001407/* --- UTF-7 Codec -------------------------------------------------------- */
1408
1409/* see RFC2152 for details */
1410
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character value.  The table is only ever read, hence
   const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00-0x0F: control characters; TAB, LF and CR count as whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F:  SP ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1429
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True when character `c` cannot be written directly in UTF-7: it is
   outside ASCII, is always special (class 1), or belongs to the optional
   whitespace (class 2) / Set O (class 3) groups and the matching flag is
   set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of `n`. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when `c` is one of the 64 base64 digits.  Written as explicit
   ASCII range tests rather than isalnum(): callers pass Py_UNICODE
   values, and isalnum() is undefined for arguments that do not fit in an
   unsigned char and is locale-dependent for those that do. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit `c`; only meaningful when B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` for as long as at least 6 of the
   `bits` pending bits in `ch` remain; `out` and `bits` are updated. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units through `out` for as long as at least 16 of the
   `bits` pending bits in `ch` remain.  Expects `errmsg` and a
   `utf7Error` label in the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001472
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001473PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001474 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001475 const char *errors)
1476{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001477 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1478}
1479
1480PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1481 Py_ssize_t size,
1482 const char *errors,
1483 Py_ssize_t *consumed)
1484{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001485 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001486 Py_ssize_t startinpos;
1487 Py_ssize_t endinpos;
1488 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001489 const char *e;
1490 PyUnicodeObject *unicode;
1491 Py_UNICODE *p;
1492 const char *errmsg = "";
1493 int inShift = 0;
1494 unsigned int bitsleft = 0;
1495 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 int surrogate = 0;
1497 PyObject *errorHandler = NULL;
1498 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001499
1500 unicode = _PyUnicode_New(size);
1501 if (!unicode)
1502 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001503 if (size == 0) {
1504 if (consumed)
1505 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001506 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001507 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508
1509 p = unicode->str;
1510 e = s + size;
1511
1512 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001513 Py_UNICODE ch;
1514 restart:
1515 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001516
1517 if (inShift) {
1518 if ((ch == '-') || !B64CHAR(ch)) {
1519 inShift = 0;
1520 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001521
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1523 if (bitsleft >= 6) {
1524 /* The shift sequence has a partial character in it. If
1525 bitsleft < 6 then we could just classify it as padding
1526 but that is not the case here */
1527
1528 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001529 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001530 }
1531 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001532 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001533 here so indicate the potential of a misencoded character. */
1534
1535 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1536 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1537 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001538 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001539 }
1540
1541 if (ch == '-') {
1542 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001543 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544 inShift = 1;
1545 }
1546 } else if (SPECIAL(ch,0,0)) {
1547 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001548 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 } else {
1550 *p++ = ch;
1551 }
1552 } else {
1553 charsleft = (charsleft << 6) | UB64(ch);
1554 bitsleft += 6;
1555 s++;
1556 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1557 }
1558 }
1559 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001560 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561 s++;
1562 if (s < e && *s == '-') {
1563 s++;
1564 *p++ = '+';
1565 } else
1566 {
1567 inShift = 1;
1568 bitsleft = 0;
1569 }
1570 }
1571 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001572 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001573 errmsg = "unexpected special character";
1574 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001575 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 }
1577 else {
1578 *p++ = ch;
1579 s++;
1580 }
1581 continue;
1582 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001583 outpos = p-PyUnicode_AS_UNICODE(unicode);
1584 endinpos = s-starts;
1585 if (unicode_decode_call_errorhandler(
1586 errors, &errorHandler,
1587 "utf7", errmsg,
1588 starts, size, &startinpos, &endinpos, &exc, &s,
1589 (PyObject **)&unicode, &outpos, &p))
1590 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001591 }
1592
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001593 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001594 outpos = p-PyUnicode_AS_UNICODE(unicode);
1595 endinpos = size;
1596 if (unicode_decode_call_errorhandler(
1597 errors, &errorHandler,
1598 "utf7", "unterminated shift sequence",
1599 starts, size, &startinpos, &endinpos, &exc, &s,
1600 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (s < e)
1603 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001605 if (consumed) {
1606 if(inShift)
1607 *consumed = startinpos;
1608 else
1609 *consumed = s-starts;
1610 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001612 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613 goto onError;
1614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 Py_XDECREF(errorHandler);
1616 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001617 return (PyObject *)unicode;
1618
1619onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_XDECREF(errorHandler);
1621 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 Py_DECREF(unicode);
1623 return NULL;
1624}
1625
1626
1627PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001628 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629 int encodeSetO,
1630 int encodeWhiteSpace,
1631 const char *errors)
1632{
1633 PyObject *v;
1634 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001635 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001637 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 unsigned int bitsleft = 0;
1639 unsigned long charsleft = 0;
1640 char * out;
1641 char * start;
1642
1643 if (size == 0)
1644 return PyString_FromStringAndSize(NULL, 0);
1645
1646 v = PyString_FromStringAndSize(NULL, cbAllocated);
1647 if (v == NULL)
1648 return NULL;
1649
1650 start = out = PyString_AS_STRING(v);
1651 for (;i < size; ++i) {
1652 Py_UNICODE ch = s[i];
1653
1654 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001655 if (ch == '+') {
1656 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 *out++ = '-';
1658 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1659 charsleft = ch;
1660 bitsleft = 16;
1661 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001662 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001664 } else {
1665 *out++ = (char) ch;
1666 }
1667 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1669 *out++ = B64(charsleft << (6-bitsleft));
1670 charsleft = 0;
1671 bitsleft = 0;
1672 /* Characters not in the BASE64 set implicitly unshift the sequence
1673 so no '-' is required, except if the character is itself a '-' */
1674 if (B64CHAR(ch) || ch == '-') {
1675 *out++ = '-';
1676 }
1677 inShift = 0;
1678 *out++ = (char) ch;
1679 } else {
1680 bitsleft += 16;
1681 charsleft = (charsleft << 16) | ch;
1682 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1683
1684 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001685 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001686 or '-' then the shift sequence will be terminated implicitly and we
1687 don't have to insert a '-'. */
1688
1689 if (bitsleft == 0) {
1690 if (i + 1 < size) {
1691 Py_UNICODE ch2 = s[i+1];
1692
1693 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001694
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 } else if (B64CHAR(ch2) || ch2 == '-') {
1696 *out++ = '-';
1697 inShift = 0;
1698 } else {
1699 inShift = 0;
1700 }
1701
1702 }
1703 else {
1704 *out++ = '-';
1705 inShift = 0;
1706 }
1707 }
Tim Petersced69f82003-09-16 20:30:58 +00001708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001710 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001711 if (bitsleft) {
1712 *out++= B64(charsleft << (6-bitsleft) );
1713 *out++ = '-';
1714 }
1715
Tim Peters5de98422002-04-27 18:44:32 +00001716 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001717 return v;
1718}
1719
1720#undef SPECIAL
1721#undef B64
1722#undef B64CHAR
1723#undef UB64
1724#undef ENCODE
1725#undef DECODE
1726
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727/* --- UTF-8 Codec -------------------------------------------------------- */
1728
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means illegal
   prefix.  See RFC 2279 for details.  The table is only ever read, so it
   is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1750
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001752 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 const char *errors)
1754{
Walter Dörwald69652032004-09-07 20:24:22 +00001755 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1756}
1757
1758PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001759 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001760 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001761 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001762{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001763 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001765 Py_ssize_t startinpos;
1766 Py_ssize_t endinpos;
1767 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 const char *e;
1769 PyUnicodeObject *unicode;
1770 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001772 PyObject *errorHandler = NULL;
1773 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
1775 /* Note: size will always be longer than the resulting Unicode
1776 character count */
1777 unicode = _PyUnicode_New(size);
1778 if (!unicode)
1779 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001780 if (size == 0) {
1781 if (consumed)
1782 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785
1786 /* Unpack UTF-8 encoded data */
1787 p = unicode->str;
1788 e = s + size;
1789
1790 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001791 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792
1793 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001794 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 s++;
1796 continue;
1797 }
1798
1799 n = utf8_code_length[ch];
1800
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001802 if (consumed)
1803 break;
1804 else {
1805 errmsg = "unexpected end of data";
1806 startinpos = s-starts;
1807 endinpos = size;
1808 goto utf8Error;
1809 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811
1812 switch (n) {
1813
1814 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001815 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 startinpos = s-starts;
1817 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001818 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
1820 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 startinpos = s-starts;
1823 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825
1826 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001827 if ((s[1] & 0xc0) != 0x80) {
1828 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829 startinpos = s-starts;
1830 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 goto utf8Error;
1832 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 startinpos = s-starts;
1836 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 errmsg = "illegal encoding";
1838 goto utf8Error;
1839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001841 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 break;
1843
1844 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001845 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 (s[2] & 0xc0) != 0x80) {
1847 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 startinpos = s-starts;
1849 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 goto utf8Error;
1851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001853 if (ch < 0x0800) {
1854 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001855 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001856
1857 XXX For wide builds (UCS-4) we should probably try
1858 to recombine the surrogates into a single code
1859 unit.
1860 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001861 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 startinpos = s-starts;
1863 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001864 goto utf8Error;
1865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001867 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001868 break;
1869
1870 case 4:
1871 if ((s[1] & 0xc0) != 0x80 ||
1872 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001873 (s[3] & 0xc0) != 0x80) {
1874 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001875 startinpos = s-starts;
1876 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001877 goto utf8Error;
1878 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001879 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1880 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1881 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001882 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001883 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001884 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001885 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001886 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001887 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 startinpos = s-starts;
1889 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001890 goto utf8Error;
1891 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001892#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001893 *p++ = (Py_UNICODE)ch;
1894#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001895 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001896
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001897 /* translate from 10000..10FFFF to 0..FFFF */
1898 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001899
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001900 /* high surrogate = top 10 bits added to D800 */
1901 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001902
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001903 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001904 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906 break;
1907
1908 default:
1909 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001910 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001911 startinpos = s-starts;
1912 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001913 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001914 }
1915 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001917
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001918 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 outpos = p-PyUnicode_AS_UNICODE(unicode);
1920 if (unicode_decode_call_errorhandler(
1921 errors, &errorHandler,
1922 "utf8", errmsg,
1923 starts, size, &startinpos, &endinpos, &exc, &s,
1924 (PyObject **)&unicode, &outpos, &p))
1925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 }
Walter Dörwald69652032004-09-07 20:24:22 +00001927 if (consumed)
1928 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929
1930 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001931 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 goto onError;
1933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 Py_XDECREF(errorHandler);
1935 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 return (PyObject *)unicode;
1937
1938onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001939 Py_XDECREF(errorHandler);
1940 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 Py_DECREF(unicode);
1942 return NULL;
1943}
1944
Tim Peters602f7402002-04-27 18:03:26 +00001945/* Allocation strategy: if the string is short, convert into a stack buffer
1946 and allocate exactly as much space needed at the end. Else allocate the
1947 maximum possible needed (4 result bytes per Unicode character), and return
1948 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001949*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001950PyObject *
1951PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001952 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001953 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954{
Tim Peters602f7402002-04-27 18:03:26 +00001955#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001956
Martin v. Löwis18e16552006-02-15 17:27:45 +00001957 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001958 PyObject *v; /* result string object */
1959 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001960 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001961 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001962 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001963
Tim Peters602f7402002-04-27 18:03:26 +00001964 assert(s != NULL);
1965 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966
Tim Peters602f7402002-04-27 18:03:26 +00001967 if (size <= MAX_SHORT_UNICHARS) {
1968 /* Write into the stack buffer; nallocated can't overflow.
1969 * At the end, we'll allocate exactly as much heap space as it
1970 * turns out we need.
1971 */
1972 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1973 v = NULL; /* will allocate after we're done */
1974 p = stackbuf;
1975 }
1976 else {
1977 /* Overallocate on the heap, and give the excess back at the end. */
1978 nallocated = size * 4;
1979 if (nallocated / 4 != size) /* overflow! */
1980 return PyErr_NoMemory();
1981 v = PyString_FromStringAndSize(NULL, nallocated);
1982 if (v == NULL)
1983 return NULL;
1984 p = PyString_AS_STRING(v);
1985 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001986
Tim Peters602f7402002-04-27 18:03:26 +00001987 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001988 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001989
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001990 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001991 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001993
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001995 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001996 *p++ = (char)(0xc0 | (ch >> 6));
1997 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001998 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001999 else {
Tim Peters602f7402002-04-27 18:03:26 +00002000 /* Encode UCS2 Unicode ordinals */
2001 if (ch < 0x10000) {
2002 /* Special case: check for high surrogate */
2003 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2004 Py_UCS4 ch2 = s[i];
2005 /* Check for low surrogate and combine the two to
2006 form a UCS4 value */
2007 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002008 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002009 i++;
2010 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 }
Tim Peters602f7402002-04-27 18:03:26 +00002012 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002013 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002014 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002015 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2016 *p++ = (char)(0x80 | (ch & 0x3f));
2017 continue;
2018 }
2019encodeUCS4:
2020 /* Encode UCS4 Unicode ordinals */
2021 *p++ = (char)(0xf0 | (ch >> 18));
2022 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2023 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2024 *p++ = (char)(0x80 | (ch & 0x3f));
2025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002027
Tim Peters602f7402002-04-27 18:03:26 +00002028 if (v == NULL) {
2029 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002030 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002031 assert(nneeded <= nallocated);
2032 v = PyString_FromStringAndSize(stackbuf, nneeded);
2033 }
2034 else {
2035 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002036 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002037 assert(nneeded <= nallocated);
2038 _PyString_Resize(&v, nneeded);
2039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002041
Tim Peters602f7402002-04-27 18:03:26 +00002042#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043}
2044
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2046{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 if (!PyUnicode_Check(unicode)) {
2048 PyErr_BadArgument();
2049 return NULL;
2050 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002051 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2052 PyUnicode_GET_SIZE(unicode),
2053 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054}
2055
Walter Dörwald6e390802007-08-17 16:41:28 +00002056/* --- UTF-32 Codec ------------------------------------------------------- */
2057
2058PyObject *
2059PyUnicode_DecodeUTF32(const char *s,
2060 Py_ssize_t size,
2061 const char *errors,
2062 int *byteorder)
2063{
2064 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2065}
2066
2067PyObject *
2068PyUnicode_DecodeUTF32Stateful(const char *s,
2069 Py_ssize_t size,
2070 const char *errors,
2071 int *byteorder,
2072 Py_ssize_t *consumed)
2073{
2074 const char *starts = s;
2075 Py_ssize_t startinpos;
2076 Py_ssize_t endinpos;
2077 Py_ssize_t outpos;
2078 PyUnicodeObject *unicode;
2079 Py_UNICODE *p;
2080#ifndef Py_UNICODE_WIDE
2081 int i, pairs;
2082#else
2083 const int pairs = 0;
2084#endif
2085 const unsigned char *q, *e;
2086 int bo = 0; /* assume native ordering by default */
2087 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002088 /* Offsets from q for retrieving bytes in the right order. */
2089#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2090 int iorder[] = {0, 1, 2, 3};
2091#else
2092 int iorder[] = {3, 2, 1, 0};
2093#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002094 PyObject *errorHandler = NULL;
2095 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002096 /* On narrow builds we split characters outside the BMP into two
2097 codepoints => count how much extra space we need. */
2098#ifndef Py_UNICODE_WIDE
2099 for (i = pairs = 0; i < size/4; i++)
2100 if (((Py_UCS4 *)s)[i] >= 0x10000)
2101 pairs++;
2102#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002103
2104 /* This might be one to much, because of a BOM */
2105 unicode = _PyUnicode_New((size+3)/4+pairs);
2106 if (!unicode)
2107 return NULL;
2108 if (size == 0)
2109 return (PyObject *)unicode;
2110
2111 /* Unpack UTF-32 encoded data */
2112 p = unicode->str;
2113 q = (unsigned char *)s;
2114 e = q + size;
2115
2116 if (byteorder)
2117 bo = *byteorder;
2118
2119 /* Check for BOM marks (U+FEFF) in the input and adjust current
2120 byte order setting accordingly. In native mode, the leading BOM
2121 mark is skipped, in all other modes, it is copied to the output
2122 stream as-is (giving a ZWNBSP character). */
2123 if (bo == 0) {
2124 if (size >= 4) {
2125 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2126 (q[iorder[1]] << 8) | q[iorder[0]];
2127#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2128 if (bom == 0x0000FEFF) {
2129 q += 4;
2130 bo = -1;
2131 }
2132 else if (bom == 0xFFFE0000) {
2133 q += 4;
2134 bo = 1;
2135 }
2136#else
2137 if (bom == 0x0000FEFF) {
2138 q += 4;
2139 bo = 1;
2140 }
2141 else if (bom == 0xFFFE0000) {
2142 q += 4;
2143 bo = -1;
2144 }
2145#endif
2146 }
2147 }
2148
2149 if (bo == -1) {
2150 /* force LE */
2151 iorder[0] = 0;
2152 iorder[1] = 1;
2153 iorder[2] = 2;
2154 iorder[3] = 3;
2155 }
2156 else if (bo == 1) {
2157 /* force BE */
2158 iorder[0] = 3;
2159 iorder[1] = 2;
2160 iorder[2] = 1;
2161 iorder[3] = 0;
2162 }
2163
2164 while (q < e) {
2165 Py_UCS4 ch;
2166 /* remaining bytes at the end? (size should be divisible by 4) */
2167 if (e-q<4) {
2168 if (consumed)
2169 break;
2170 errmsg = "truncated data";
2171 startinpos = ((const char *)q)-starts;
2172 endinpos = ((const char *)e)-starts;
2173 goto utf32Error;
2174 /* The remaining input chars are ignored if the callback
2175 chooses to skip the input */
2176 }
2177 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2178 (q[iorder[1]] << 8) | q[iorder[0]];
2179
2180 if (ch >= 0x110000)
2181 {
2182 errmsg = "codepoint not in range(0x110000)";
2183 startinpos = ((const char *)q)-starts;
2184 endinpos = startinpos+4;
2185 goto utf32Error;
2186 }
2187#ifndef Py_UNICODE_WIDE
2188 if (ch >= 0x10000)
2189 {
2190 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2191 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2192 }
2193 else
2194#endif
2195 *p++ = ch;
2196 q += 4;
2197 continue;
2198 utf32Error:
2199 outpos = p-PyUnicode_AS_UNICODE(unicode);
2200 if (unicode_decode_call_errorhandler(
2201 errors, &errorHandler,
2202 "utf32", errmsg,
2203 starts, size, &startinpos, &endinpos, &exc, &s,
2204 (PyObject **)&unicode, &outpos, &p))
2205 goto onError;
2206 }
2207
2208 if (byteorder)
2209 *byteorder = bo;
2210
2211 if (consumed)
2212 *consumed = (const char *)q-starts;
2213
2214 /* Adjust length */
2215 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2216 goto onError;
2217
2218 Py_XDECREF(errorHandler);
2219 Py_XDECREF(exc);
2220 return (PyObject *)unicode;
2221
2222onError:
2223 Py_DECREF(unicode);
2224 Py_XDECREF(errorHandler);
2225 Py_XDECREF(exc);
2226 return NULL;
2227}
2228
2229PyObject *
2230PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2231 Py_ssize_t size,
2232 const char *errors,
2233 int byteorder)
2234{
2235 PyObject *v;
2236 unsigned char *p;
2237#ifndef Py_UNICODE_WIDE
2238 int i, pairs;
2239#else
2240 const int pairs = 0;
2241#endif
2242 /* Offsets from p for storing byte pairs in the right order. */
2243#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2244 int iorder[] = {0, 1, 2, 3};
2245#else
2246 int iorder[] = {3, 2, 1, 0};
2247#endif
2248
2249#define STORECHAR(CH) \
2250 do { \
2251 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2252 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2253 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2254 p[iorder[0]] = (CH) & 0xff; \
2255 p += 4; \
2256 } while(0)
2257
2258 /* In narrow builds we can output surrogate pairs as one codepoint,
2259 so we need less space. */
2260#ifndef Py_UNICODE_WIDE
2261 for (i = pairs = 0; i < size-1; i++)
2262 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2263 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2264 pairs++;
2265#endif
2266 v = PyString_FromStringAndSize(NULL,
2267 4 * (size - pairs + (byteorder == 0)));
2268 if (v == NULL)
2269 return NULL;
2270
2271 p = (unsigned char *)PyString_AS_STRING(v);
2272 if (byteorder == 0)
2273 STORECHAR(0xFEFF);
2274 if (size == 0)
2275 return v;
2276
2277 if (byteorder == -1) {
2278 /* force LE */
2279 iorder[0] = 0;
2280 iorder[1] = 1;
2281 iorder[2] = 2;
2282 iorder[3] = 3;
2283 }
2284 else if (byteorder == 1) {
2285 /* force BE */
2286 iorder[0] = 3;
2287 iorder[1] = 2;
2288 iorder[2] = 1;
2289 iorder[3] = 0;
2290 }
2291
2292 while (size-- > 0) {
2293 Py_UCS4 ch = *s++;
2294#ifndef Py_UNICODE_WIDE
2295 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2296 Py_UCS4 ch2 = *s;
2297 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2298 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2299 s++;
2300 size--;
2301 }
2302 }
2303#endif
2304 STORECHAR(ch);
2305 }
2306 return v;
2307#undef STORECHAR
2308}
2309
2310PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2311{
2312 if (!PyUnicode_Check(unicode)) {
2313 PyErr_BadArgument();
2314 return NULL;
2315 }
2316 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2317 PyUnicode_GET_SIZE(unicode),
2318 NULL,
2319 0);
2320}
2321
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322/* --- UTF-16 Codec ------------------------------------------------------- */
2323
Tim Peters772747b2001-08-09 22:21:55 +00002324PyObject *
2325PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002326 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002327 const char *errors,
2328 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329{
Walter Dörwald69652032004-09-07 20:24:22 +00002330 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2331}
2332
2333PyObject *
2334PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002335 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002336 const char *errors,
2337 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002338 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002339{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002340 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002341 Py_ssize_t startinpos;
2342 Py_ssize_t endinpos;
2343 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 PyUnicodeObject *unicode;
2345 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002346 const unsigned char *q, *e;
2347 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002348 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002349 /* Offsets from q for retrieving byte pairs in the right order. */
2350#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2351 int ihi = 1, ilo = 0;
2352#else
2353 int ihi = 0, ilo = 1;
2354#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002355 PyObject *errorHandler = NULL;
2356 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357
2358 /* Note: size will always be longer than the resulting Unicode
2359 character count */
2360 unicode = _PyUnicode_New(size);
2361 if (!unicode)
2362 return NULL;
2363 if (size == 0)
2364 return (PyObject *)unicode;
2365
2366 /* Unpack UTF-16 encoded data */
2367 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002368 q = (unsigned char *)s;
2369 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370
2371 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002372 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002374 /* Check for BOM marks (U+FEFF) in the input and adjust current
2375 byte order setting accordingly. In native mode, the leading BOM
2376 mark is skipped, in all other modes, it is copied to the output
2377 stream as-is (giving a ZWNBSP character). */
2378 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002379 if (size >= 2) {
2380 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002381#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002382 if (bom == 0xFEFF) {
2383 q += 2;
2384 bo = -1;
2385 }
2386 else if (bom == 0xFFFE) {
2387 q += 2;
2388 bo = 1;
2389 }
Tim Petersced69f82003-09-16 20:30:58 +00002390#else
Walter Dörwald69652032004-09-07 20:24:22 +00002391 if (bom == 0xFEFF) {
2392 q += 2;
2393 bo = 1;
2394 }
2395 else if (bom == 0xFFFE) {
2396 q += 2;
2397 bo = -1;
2398 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002399#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002400 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402
Tim Peters772747b2001-08-09 22:21:55 +00002403 if (bo == -1) {
2404 /* force LE */
2405 ihi = 1;
2406 ilo = 0;
2407 }
2408 else if (bo == 1) {
2409 /* force BE */
2410 ihi = 0;
2411 ilo = 1;
2412 }
2413
2414 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002415 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002416 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002417 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002418 if (consumed)
2419 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002420 errmsg = "truncated data";
2421 startinpos = ((const char *)q)-starts;
2422 endinpos = ((const char *)e)-starts;
2423 goto utf16Error;
2424 /* The remaining input chars are ignored if the callback
2425 chooses to skip the input */
2426 }
2427 ch = (q[ihi] << 8) | q[ilo];
2428
Tim Peters772747b2001-08-09 22:21:55 +00002429 q += 2;
2430
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 if (ch < 0xD800 || ch > 0xDFFF) {
2432 *p++ = ch;
2433 continue;
2434 }
2435
2436 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002437 if (q >= e) {
2438 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002439 startinpos = (((const char *)q)-2)-starts;
2440 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002441 goto utf16Error;
2442 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002443 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002444 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2445 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002446 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002447#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002448 *p++ = ch;
2449 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002450#else
2451 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002452#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002453 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002454 }
2455 else {
2456 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 startinpos = (((const char *)q)-4)-starts;
2458 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002459 goto utf16Error;
2460 }
2461
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002463 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 startinpos = (((const char *)q)-2)-starts;
2465 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002466 /* Fall through to report the error */
2467
2468 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002469 outpos = p-PyUnicode_AS_UNICODE(unicode);
2470 if (unicode_decode_call_errorhandler(
2471 errors, &errorHandler,
2472 "utf16", errmsg,
2473 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2474 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002475 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 }
2477
2478 if (byteorder)
2479 *byteorder = bo;
2480
Walter Dörwald69652032004-09-07 20:24:22 +00002481 if (consumed)
2482 *consumed = (const char *)q-starts;
2483
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002485 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 goto onError;
2487
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002488 Py_XDECREF(errorHandler);
2489 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 return (PyObject *)unicode;
2491
2492onError:
2493 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494 Py_XDECREF(errorHandler);
2495 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 return NULL;
2497}
2498
Tim Peters772747b2001-08-09 22:21:55 +00002499PyObject *
2500PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002502 const char *errors,
2503 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504{
2505 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002506 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002507#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002508 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002509#else
2510 const int pairs = 0;
2511#endif
Tim Peters772747b2001-08-09 22:21:55 +00002512 /* Offsets from p for storing byte pairs in the right order. */
2513#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2514 int ihi = 1, ilo = 0;
2515#else
2516 int ihi = 0, ilo = 1;
2517#endif
2518
2519#define STORECHAR(CH) \
2520 do { \
2521 p[ihi] = ((CH) >> 8) & 0xff; \
2522 p[ilo] = (CH) & 0xff; \
2523 p += 2; \
2524 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002526#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002527 for (i = pairs = 0; i < size; i++)
2528 if (s[i] >= 0x10000)
2529 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002530#endif
Tim Petersced69f82003-09-16 20:30:58 +00002531 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002532 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 if (v == NULL)
2534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535
Tim Peters772747b2001-08-09 22:21:55 +00002536 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002538 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002539 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002540 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002541
2542 if (byteorder == -1) {
2543 /* force LE */
2544 ihi = 1;
2545 ilo = 0;
2546 }
2547 else if (byteorder == 1) {
2548 /* force BE */
2549 ihi = 0;
2550 ilo = 1;
2551 }
2552
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002553 while (size-- > 0) {
2554 Py_UNICODE ch = *s++;
2555 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002556#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002557 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002558 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2559 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002561#endif
Tim Peters772747b2001-08-09 22:21:55 +00002562 STORECHAR(ch);
2563 if (ch2)
2564 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002567#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568}
2569
2570PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2571{
2572 if (!PyUnicode_Check(unicode)) {
2573 PyErr_BadArgument();
2574 return NULL;
2575 }
2576 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2577 PyUnicode_GET_SIZE(unicode),
2578 NULL,
2579 0);
2580}
2581
2582/* --- Unicode Escape Codec ----------------------------------------------- */
2583
Fredrik Lundh06d12682001-01-24 07:59:11 +00002584static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002585
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002587 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588 const char *errors)
2589{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002591 Py_ssize_t startinpos;
2592 Py_ssize_t endinpos;
2593 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002598 char* message;
2599 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 PyObject *errorHandler = NULL;
2601 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002602
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 /* Escaped strings will always be longer than the resulting
2604 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002605 length after conversion to the true value.
2606 (but if the error callback returns a long replacement string
2607 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 v = _PyUnicode_New(size);
2609 if (v == NULL)
2610 goto onError;
2611 if (size == 0)
2612 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 while (s < end) {
2618 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002619 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002620 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621
2622 /* Non-escape characters are interpreted as Unicode ordinals */
2623 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002624 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 continue;
2626 }
2627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002628 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 /* \ - Escapes */
2630 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002631 c = *s++;
2632 if (s > end)
2633 c = '\0'; /* Invalid after \ */
2634 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635
2636 /* \x escapes */
2637 case '\n': break;
2638 case '\\': *p++ = '\\'; break;
2639 case '\'': *p++ = '\''; break;
2640 case '\"': *p++ = '\"'; break;
2641 case 'b': *p++ = '\b'; break;
2642 case 'f': *p++ = '\014'; break; /* FF */
2643 case 't': *p++ = '\t'; break;
2644 case 'n': *p++ = '\n'; break;
2645 case 'r': *p++ = '\r'; break;
2646 case 'v': *p++ = '\013'; break; /* VT */
2647 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2648
2649 /* \OOO (octal) escapes */
2650 case '0': case '1': case '2': case '3':
2651 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002652 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002653 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002654 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002655 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002656 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002658 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 break;
2660
Fredrik Lundhccc74732001-02-18 22:13:49 +00002661 /* hex escapes */
2662 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002664 digits = 2;
2665 message = "truncated \\xXX escape";
2666 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667
Fredrik Lundhccc74732001-02-18 22:13:49 +00002668 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002670 digits = 4;
2671 message = "truncated \\uXXXX escape";
2672 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673
Fredrik Lundhccc74732001-02-18 22:13:49 +00002674 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002675 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002676 digits = 8;
2677 message = "truncated \\UXXXXXXXX escape";
2678 hexescape:
2679 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002680 outpos = p-PyUnicode_AS_UNICODE(v);
2681 if (s+digits>end) {
2682 endinpos = size;
2683 if (unicode_decode_call_errorhandler(
2684 errors, &errorHandler,
2685 "unicodeescape", "end of string in escape sequence",
2686 starts, size, &startinpos, &endinpos, &exc, &s,
2687 (PyObject **)&v, &outpos, &p))
2688 goto onError;
2689 goto nextByte;
2690 }
2691 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002692 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002693 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 endinpos = (s+i+1)-starts;
2695 if (unicode_decode_call_errorhandler(
2696 errors, &errorHandler,
2697 "unicodeescape", message,
2698 starts, size, &startinpos, &endinpos, &exc, &s,
2699 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002700 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002702 }
2703 chr = (chr<<4) & ~0xF;
2704 if (c >= '0' && c <= '9')
2705 chr += c - '0';
2706 else if (c >= 'a' && c <= 'f')
2707 chr += 10 + c - 'a';
2708 else
2709 chr += 10 + c - 'A';
2710 }
2711 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002712 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002713 /* _decoding_error will have already written into the
2714 target buffer. */
2715 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002716 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002717 /* when we get here, chr is a 32-bit unicode character */
2718 if (chr <= 0xffff)
2719 /* UCS-2 character */
2720 *p++ = (Py_UNICODE) chr;
2721 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002722 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002723 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002724#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002725 *p++ = chr;
2726#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002727 chr -= 0x10000L;
2728 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002729 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002730#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002731 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 endinpos = s-starts;
2733 outpos = p-PyUnicode_AS_UNICODE(v);
2734 if (unicode_decode_call_errorhandler(
2735 errors, &errorHandler,
2736 "unicodeescape", "illegal Unicode character",
2737 starts, size, &startinpos, &endinpos, &exc, &s,
2738 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002739 goto onError;
2740 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002741 break;
2742
2743 /* \N{name} */
2744 case 'N':
2745 message = "malformed \\N character escape";
2746 if (ucnhash_CAPI == NULL) {
2747 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002748 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002749 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002750 if (m == NULL)
2751 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002752 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002754 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002755 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002756 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002757 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002758 if (ucnhash_CAPI == NULL)
2759 goto ucnhashError;
2760 }
2761 if (*s == '{') {
2762 const char *start = s+1;
2763 /* look for the closing brace */
2764 while (*s != '}' && s < end)
2765 s++;
2766 if (s > start && s < end && *s == '}') {
2767 /* found a name. look it up in the unicode database */
2768 message = "unknown Unicode character name";
2769 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002770 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002771 goto store;
2772 }
2773 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774 endinpos = s-starts;
2775 outpos = p-PyUnicode_AS_UNICODE(v);
2776 if (unicode_decode_call_errorhandler(
2777 errors, &errorHandler,
2778 "unicodeescape", message,
2779 starts, size, &startinpos, &endinpos, &exc, &s,
2780 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002782 break;
2783
2784 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002785 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 message = "\\ at end of string";
2787 s--;
2788 endinpos = s-starts;
2789 outpos = p-PyUnicode_AS_UNICODE(v);
2790 if (unicode_decode_call_errorhandler(
2791 errors, &errorHandler,
2792 "unicodeescape", message,
2793 starts, size, &startinpos, &endinpos, &exc, &s,
2794 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002795 goto onError;
2796 }
2797 else {
2798 *p++ = '\\';
2799 *p++ = (unsigned char)s[-1];
2800 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002801 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 nextByte:
2804 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002806 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002811
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002813 PyErr_SetString(
2814 PyExc_UnicodeError,
2815 "\\N escapes not supported (can't load unicodedata module)"
2816 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002817 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 Py_XDECREF(errorHandler);
2819 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002820 return NULL;
2821
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 Py_XDECREF(errorHandler);
2825 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 return NULL;
2827}
2828
2829/* Return a Unicode-Escape string version of the Unicode object.
2830
2831 If quotes is true, the string is enclosed in u"" or u'' quotes as
2832 appropriate.
2833
2834*/
2835
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002836Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002837 Py_ssize_t size,
2838 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002839{
2840 /* like wcschr, but doesn't stop at NULL characters */
2841
2842 while (size-- > 0) {
2843 if (*s == ch)
2844 return s;
2845 s++;
2846 }
2847
2848 return NULL;
2849}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851static
2852PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 int quotes)
2855{
2856 PyObject *repr;
2857 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002859 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860
Neal Norwitz17753ec2006-08-21 22:21:19 +00002861 /* XXX(nnorwitz): rather than over-allocating, it would be
2862 better to choose a different scheme. Perhaps scan the
2863 first N-chars of the string and allocate based on that size.
2864 */
2865 /* Initial allocation is based on the longest-possible unichr
2866 escape.
2867
2868 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2869 unichr, so in this case it's the longest unichr escape. In
2870 narrow (UTF-16) builds this is five chars per source unichr
2871 since there are two unichrs in the surrogate pair, so in narrow
2872 (UTF-16) builds it's not the longest unichr escape.
2873
2874 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2875 so in the narrow (UTF-16) build case it's the longest unichr
2876 escape.
2877 */
2878
2879 repr = PyString_FromStringAndSize(NULL,
2880 2
2881#ifdef Py_UNICODE_WIDE
2882 + 10*size
2883#else
2884 + 6*size
2885#endif
2886 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 if (repr == NULL)
2888 return NULL;
2889
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002890 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891
2892 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002894 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 !findchar(s, size, '"')) ? '"' : '\'';
2896 }
2897 while (size-- > 0) {
2898 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002899
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002900 /* Escape quotes and backslashes */
2901 if ((quotes &&
2902 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 *p++ = '\\';
2904 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002905 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002906 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002907
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002908#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002909 /* Map 21-bit characters to '\U00xxxxxx' */
2910 else if (ch >= 0x10000) {
2911 *p++ = '\\';
2912 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002913 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2914 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2915 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2916 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2917 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2918 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2919 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002920 *p++ = hexdigit[ch & 0x0000000F];
2921 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002922 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002923#else
2924 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002925 else if (ch >= 0xD800 && ch < 0xDC00) {
2926 Py_UNICODE ch2;
2927 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002928
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002929 ch2 = *s++;
2930 size--;
2931 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2932 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2933 *p++ = '\\';
2934 *p++ = 'U';
2935 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2936 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2937 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2938 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2939 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2940 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2941 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2942 *p++ = hexdigit[ucs & 0x0000000F];
2943 continue;
2944 }
2945 /* Fall through: isolated surrogates are copied as-is */
2946 s--;
2947 size++;
2948 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002949#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002950
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002952 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 *p++ = '\\';
2954 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002955 *p++ = hexdigit[(ch >> 12) & 0x000F];
2956 *p++ = hexdigit[(ch >> 8) & 0x000F];
2957 *p++ = hexdigit[(ch >> 4) & 0x000F];
2958 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002960
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002961 /* Map special whitespace to '\t', \n', '\r' */
2962 else if (ch == '\t') {
2963 *p++ = '\\';
2964 *p++ = 't';
2965 }
2966 else if (ch == '\n') {
2967 *p++ = '\\';
2968 *p++ = 'n';
2969 }
2970 else if (ch == '\r') {
2971 *p++ = '\\';
2972 *p++ = 'r';
2973 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002974
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002975 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002976 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002978 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002979 *p++ = hexdigit[(ch >> 4) & 0x000F];
2980 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002981 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002982
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 /* Copy everything else as-is */
2984 else
2985 *p++ = (char) ch;
2986 }
2987 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002988 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989
2990 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002991 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 return repr;
2993}
2994
2995PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002996 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997{
2998 return unicodeescape_string(s, size, 0);
2999}
3000
3001PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3002{
3003 if (!PyUnicode_Check(unicode)) {
3004 PyErr_BadArgument();
3005 return NULL;
3006 }
3007 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3008 PyUnicode_GET_SIZE(unicode));
3009}
3010
3011/* --- Raw Unicode Escape Codec ------------------------------------------- */
3012
3013PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003014 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 const char *errors)
3016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 Py_ssize_t startinpos;
3019 Py_ssize_t endinpos;
3020 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 const char *end;
3024 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 PyObject *errorHandler = NULL;
3026 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003027
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 /* Escaped strings will always be longer than the resulting
3029 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003030 length after conversion to the true value. (But decoding error
3031 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 v = _PyUnicode_New(size);
3033 if (v == NULL)
3034 goto onError;
3035 if (size == 0)
3036 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 end = s + size;
3039 while (s < end) {
3040 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003041 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003043 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044
3045 /* Non-escape characters are interpreted as Unicode ordinals */
3046 if (*s != '\\') {
3047 *p++ = (unsigned char)*s++;
3048 continue;
3049 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051
3052 /* \u-escapes are only interpreted iff the number of leading
3053 backslashes if odd */
3054 bs = s;
3055 for (;s < end;) {
3056 if (*s != '\\')
3057 break;
3058 *p++ = (unsigned char)*s++;
3059 }
3060 if (((s - bs) & 1) == 0 ||
3061 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 continue;
3064 }
3065 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003066 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 s++;
3068
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003069 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003071 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 endinpos = s-starts;
3075 if (unicode_decode_call_errorhandler(
3076 errors, &errorHandler,
3077 "rawunicodeescape", "truncated \\uXXXX",
3078 starts, size, &startinpos, &endinpos, &exc, &s,
3079 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 }
3083 x = (x<<4) & ~0xF;
3084 if (c >= '0' && c <= '9')
3085 x += c - '0';
3086 else if (c >= 'a' && c <= 'f')
3087 x += 10 + c - 'a';
3088 else
3089 x += 10 + c - 'A';
3090 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003091 if (x <= 0xffff)
3092 /* UCS-2 character */
3093 *p++ = (Py_UNICODE) x;
3094 else if (x <= 0x10ffff) {
3095 /* UCS-4 character. Either store directly, or as
3096 surrogate pair. */
3097#ifdef Py_UNICODE_WIDE
3098 *p++ = (Py_UNIC0DE) x;
3099#else
3100 x -= 0x10000L;
3101 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3102 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3103#endif
3104 } else {
3105 endinpos = s-starts;
3106 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003107 if (unicode_decode_call_errorhandler(
3108 errors, &errorHandler,
3109 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3110 starts, size, &startinpos, &endinpos, &exc, &s,
3111 (PyObject **)&v, &outpos, &p))
3112 goto onError;
3113 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 nextByte:
3115 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003117 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 Py_XDECREF(errorHandler);
3120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003122
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 onError:
3124 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003125 Py_XDECREF(errorHandler);
3126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 return NULL;
3128}
3129
3130PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003131 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132{
3133 PyObject *repr;
3134 char *p;
3135 char *q;
3136
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003137 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003139#ifdef Py_UNICODE_WIDE
3140 repr = PyString_FromStringAndSize(NULL, 10 * size);
3141#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003143#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 if (repr == NULL)
3145 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003146 if (size == 0)
3147 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148
3149 p = q = PyString_AS_STRING(repr);
3150 while (size-- > 0) {
3151 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003152#ifdef Py_UNICODE_WIDE
3153 /* Map 32-bit characters to '\Uxxxxxxxx' */
3154 if (ch >= 0x10000) {
3155 *p++ = '\\';
3156 *p++ = 'U';
3157 *p++ = hexdigit[(ch >> 28) & 0xf];
3158 *p++ = hexdigit[(ch >> 24) & 0xf];
3159 *p++ = hexdigit[(ch >> 20) & 0xf];
3160 *p++ = hexdigit[(ch >> 16) & 0xf];
3161 *p++ = hexdigit[(ch >> 12) & 0xf];
3162 *p++ = hexdigit[(ch >> 8) & 0xf];
3163 *p++ = hexdigit[(ch >> 4) & 0xf];
3164 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003165 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003166 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003167#else
3168 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3169 if (ch >= 0xD800 && ch < 0xDC00) {
3170 Py_UNICODE ch2;
3171 Py_UCS4 ucs;
3172
3173 ch2 = *s++;
3174 size--;
3175 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3176 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3177 *p++ = '\\';
3178 *p++ = 'U';
3179 *p++ = hexdigit[(ucs >> 28) & 0xf];
3180 *p++ = hexdigit[(ucs >> 24) & 0xf];
3181 *p++ = hexdigit[(ucs >> 20) & 0xf];
3182 *p++ = hexdigit[(ucs >> 16) & 0xf];
3183 *p++ = hexdigit[(ucs >> 12) & 0xf];
3184 *p++ = hexdigit[(ucs >> 8) & 0xf];
3185 *p++ = hexdigit[(ucs >> 4) & 0xf];
3186 *p++ = hexdigit[ucs & 0xf];
3187 continue;
3188 }
3189 /* Fall through: isolated surrogates are copied as-is */
3190 s--;
3191 size++;
3192 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003193#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 /* Map 16-bit characters to '\uxxxx' */
3195 if (ch >= 256) {
3196 *p++ = '\\';
3197 *p++ = 'u';
3198 *p++ = hexdigit[(ch >> 12) & 0xf];
3199 *p++ = hexdigit[(ch >> 8) & 0xf];
3200 *p++ = hexdigit[(ch >> 4) & 0xf];
3201 *p++ = hexdigit[ch & 15];
3202 }
3203 /* Copy everything else as-is */
3204 else
3205 *p++ = (char) ch;
3206 }
3207 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003208 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 return repr;
3210}
3211
3212PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3213{
3214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_BadArgument();
3216 return NULL;
3217 }
3218 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3219 PyUnicode_GET_SIZE(unicode));
3220}
3221
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003222/* --- Unicode Internal Codec ------------------------------------------- */
3223
3224PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003225 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003226 const char *errors)
3227{
3228 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003229 Py_ssize_t startinpos;
3230 Py_ssize_t endinpos;
3231 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003232 PyUnicodeObject *v;
3233 Py_UNICODE *p;
3234 const char *end;
3235 const char *reason;
3236 PyObject *errorHandler = NULL;
3237 PyObject *exc = NULL;
3238
Neal Norwitzd43069c2006-01-08 01:12:10 +00003239#ifdef Py_UNICODE_WIDE
3240 Py_UNICODE unimax = PyUnicode_GetMax();
3241#endif
3242
Armin Rigo7ccbca92006-10-04 12:17:45 +00003243 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003244 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3245 if (v == NULL)
3246 goto onError;
3247 if (PyUnicode_GetSize((PyObject *)v) == 0)
3248 return (PyObject *)v;
3249 p = PyUnicode_AS_UNICODE(v);
3250 end = s + size;
3251
3252 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003253 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003254 /* We have to sanity check the raw data, otherwise doom looms for
3255 some malformed UCS-4 data. */
3256 if (
3257 #ifdef Py_UNICODE_WIDE
3258 *p > unimax || *p < 0 ||
3259 #endif
3260 end-s < Py_UNICODE_SIZE
3261 )
3262 {
3263 startinpos = s - starts;
3264 if (end-s < Py_UNICODE_SIZE) {
3265 endinpos = end-starts;
3266 reason = "truncated input";
3267 }
3268 else {
3269 endinpos = s - starts + Py_UNICODE_SIZE;
3270 reason = "illegal code point (> 0x10FFFF)";
3271 }
3272 outpos = p - PyUnicode_AS_UNICODE(v);
3273 if (unicode_decode_call_errorhandler(
3274 errors, &errorHandler,
3275 "unicode_internal", reason,
3276 starts, size, &startinpos, &endinpos, &exc, &s,
3277 (PyObject **)&v, &outpos, &p)) {
3278 goto onError;
3279 }
3280 }
3281 else {
3282 p++;
3283 s += Py_UNICODE_SIZE;
3284 }
3285 }
3286
Martin v. Löwis412fb672006-04-13 06:34:32 +00003287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003288 goto onError;
3289 Py_XDECREF(errorHandler);
3290 Py_XDECREF(exc);
3291 return (PyObject *)v;
3292
3293 onError:
3294 Py_XDECREF(v);
3295 Py_XDECREF(errorHandler);
3296 Py_XDECREF(exc);
3297 return NULL;
3298}
3299
/* --- Latin-1 Codec ------------------------------------------------------ */
3301
3302PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003303 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 const char *errors)
3305{
3306 PyUnicodeObject *v;
3307 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003308
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003310 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003311 Py_UNICODE r = *(unsigned char*)s;
3312 return PyUnicode_FromUnicode(&r, 1);
3313 }
3314
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 v = _PyUnicode_New(size);
3316 if (v == NULL)
3317 goto onError;
3318 if (size == 0)
3319 return (PyObject *)v;
3320 p = PyUnicode_AS_UNICODE(v);
3321 while (size-- > 0)
3322 *p++ = (unsigned char)*s++;
3323 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003324
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 onError:
3326 Py_XDECREF(v);
3327 return NULL;
3328}
3329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330/* create or adjust a UnicodeEncodeError */
3331static void make_encode_exception(PyObject **exceptionObject,
3332 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 const Py_UNICODE *unicode, Py_ssize_t size,
3334 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 if (*exceptionObject == NULL) {
3338 *exceptionObject = PyUnicodeEncodeError_Create(
3339 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 }
3341 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3343 goto onError;
3344 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3345 goto onError;
3346 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3347 goto onError;
3348 return;
3349 onError:
3350 Py_DECREF(*exceptionObject);
3351 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 }
3353}
3354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355/* raises a UnicodeEncodeError */
3356static void raise_encode_exception(PyObject **exceptionObject,
3357 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003358 const Py_UNICODE *unicode, Py_ssize_t size,
3359 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 const char *reason)
3361{
3362 make_encode_exception(exceptionObject,
3363 encoding, unicode, size, startpos, endpos, reason);
3364 if (*exceptionObject != NULL)
3365 PyCodec_StrictErrors(*exceptionObject);
3366}
3367
3368/* error handling callback helper:
3369 build arguments, call the callback and check the arguments,
3370 put the result into newpos and return the replacement string, which
3371 has to be freed by the caller */
3372static PyObject *unicode_encode_call_errorhandler(const char *errors,
3373 PyObject **errorHandler,
3374 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3376 Py_ssize_t startpos, Py_ssize_t endpos,
3377 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003379 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003380
3381 PyObject *restuple;
3382 PyObject *resunicode;
3383
3384 if (*errorHandler == NULL) {
3385 *errorHandler = PyCodec_LookupError(errors);
3386 if (*errorHandler == NULL)
3387 return NULL;
3388 }
3389
3390 make_encode_exception(exceptionObject,
3391 encoding, unicode, size, startpos, endpos, reason);
3392 if (*exceptionObject == NULL)
3393 return NULL;
3394
3395 restuple = PyObject_CallFunctionObjArgs(
3396 *errorHandler, *exceptionObject, NULL);
3397 if (restuple == NULL)
3398 return NULL;
3399 if (!PyTuple_Check(restuple)) {
3400 PyErr_Format(PyExc_TypeError, &argparse[4]);
3401 Py_DECREF(restuple);
3402 return NULL;
3403 }
3404 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3405 &resunicode, newpos)) {
3406 Py_DECREF(restuple);
3407 return NULL;
3408 }
3409 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003410 *newpos = size+*newpos;
3411 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003412 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003413 Py_DECREF(restuple);
3414 return NULL;
3415 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 Py_INCREF(resunicode);
3417 Py_DECREF(restuple);
3418 return resunicode;
3419}
3420
3421static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 const char *errors,
3424 int limit)
3425{
3426 /* output object */
3427 PyObject *res;
3428 /* pointers to the beginning and end+1 of input */
3429 const Py_UNICODE *startp = p;
3430 const Py_UNICODE *endp = p + size;
3431 /* pointer to the beginning of the unencodable characters */
3432 /* const Py_UNICODE *badp = NULL; */
3433 /* pointer into the output */
3434 char *str;
3435 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003436 Py_ssize_t respos = 0;
3437 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003438 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3439 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 PyObject *errorHandler = NULL;
3441 PyObject *exc = NULL;
3442 /* the following variable is used for caching string comparisons
3443 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3444 int known_errorHandler = -1;
3445
3446 /* allocate enough for a simple encoding without
3447 replacements, if we need more, we'll resize */
3448 res = PyString_FromStringAndSize(NULL, size);
3449 if (res == NULL)
3450 goto onError;
3451 if (size == 0)
3452 return res;
3453 str = PyString_AS_STRING(res);
3454 ressize = size;
3455
3456 while (p<endp) {
3457 Py_UNICODE c = *p;
3458
3459 /* can we encode this? */
3460 if (c<limit) {
3461 /* no overflow check, because we know that the space is enough */
3462 *str++ = (char)c;
3463 ++p;
3464 }
3465 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003466 Py_ssize_t unicodepos = p-startp;
3467 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003469 Py_ssize_t repsize;
3470 Py_ssize_t newpos;
3471 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 Py_UNICODE *uni2;
3473 /* startpos for collecting unencodable chars */
3474 const Py_UNICODE *collstart = p;
3475 const Py_UNICODE *collend = p;
3476 /* find all unecodable characters */
3477 while ((collend < endp) && ((*collend)>=limit))
3478 ++collend;
3479 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3480 if (known_errorHandler==-1) {
3481 if ((errors==NULL) || (!strcmp(errors, "strict")))
3482 known_errorHandler = 1;
3483 else if (!strcmp(errors, "replace"))
3484 known_errorHandler = 2;
3485 else if (!strcmp(errors, "ignore"))
3486 known_errorHandler = 3;
3487 else if (!strcmp(errors, "xmlcharrefreplace"))
3488 known_errorHandler = 4;
3489 else
3490 known_errorHandler = 0;
3491 }
3492 switch (known_errorHandler) {
3493 case 1: /* strict */
3494 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3495 goto onError;
3496 case 2: /* replace */
3497 while (collstart++<collend)
3498 *str++ = '?'; /* fall through */
3499 case 3: /* ignore */
3500 p = collend;
3501 break;
3502 case 4: /* xmlcharrefreplace */
3503 respos = str-PyString_AS_STRING(res);
3504 /* determine replacement size (temporarily (mis)uses p) */
3505 for (p = collstart, repsize = 0; p < collend; ++p) {
3506 if (*p<10)
3507 repsize += 2+1+1;
3508 else if (*p<100)
3509 repsize += 2+2+1;
3510 else if (*p<1000)
3511 repsize += 2+3+1;
3512 else if (*p<10000)
3513 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003514#ifndef Py_UNICODE_WIDE
3515 else
3516 repsize += 2+5+1;
3517#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 else if (*p<100000)
3519 repsize += 2+5+1;
3520 else if (*p<1000000)
3521 repsize += 2+6+1;
3522 else
3523 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003524#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 }
3526 requiredsize = respos+repsize+(endp-collend);
3527 if (requiredsize > ressize) {
3528 if (requiredsize<2*ressize)
3529 requiredsize = 2*ressize;
3530 if (_PyString_Resize(&res, requiredsize))
3531 goto onError;
3532 str = PyString_AS_STRING(res) + respos;
3533 ressize = requiredsize;
3534 }
3535 /* generate replacement (temporarily (mis)uses p) */
3536 for (p = collstart; p < collend; ++p) {
3537 str += sprintf(str, "&#%d;", (int)*p);
3538 }
3539 p = collend;
3540 break;
3541 default:
3542 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3543 encoding, reason, startp, size, &exc,
3544 collstart-startp, collend-startp, &newpos);
3545 if (repunicode == NULL)
3546 goto onError;
3547 /* need more space? (at least enough for what we
3548 have+the replacement+the rest of the string, so
3549 we won't have to check space for encodable characters) */
3550 respos = str-PyString_AS_STRING(res);
3551 repsize = PyUnicode_GET_SIZE(repunicode);
3552 requiredsize = respos+repsize+(endp-collend);
3553 if (requiredsize > ressize) {
3554 if (requiredsize<2*ressize)
3555 requiredsize = 2*ressize;
3556 if (_PyString_Resize(&res, requiredsize)) {
3557 Py_DECREF(repunicode);
3558 goto onError;
3559 }
3560 str = PyString_AS_STRING(res) + respos;
3561 ressize = requiredsize;
3562 }
3563 /* check if there is anything unencodable in the replacement
3564 and copy it to the output */
3565 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3566 c = *uni2;
3567 if (c >= limit) {
3568 raise_encode_exception(&exc, encoding, startp, size,
3569 unicodepos, unicodepos+1, reason);
3570 Py_DECREF(repunicode);
3571 goto onError;
3572 }
3573 *str = (char)c;
3574 }
3575 p = startp + newpos;
3576 Py_DECREF(repunicode);
3577 }
3578 }
3579 }
3580 /* Resize if we allocated to much */
3581 respos = str-PyString_AS_STRING(res);
3582 if (respos<ressize)
3583 /* If this falls res will be NULL */
3584 _PyString_Resize(&res, respos);
3585 Py_XDECREF(errorHandler);
3586 Py_XDECREF(exc);
3587 return res;
3588
3589 onError:
3590 Py_XDECREF(res);
3591 Py_XDECREF(errorHandler);
3592 Py_XDECREF(exc);
3593 return NULL;
3594}
3595
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003597 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 const char *errors)
3599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601}
3602
3603PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3604{
3605 if (!PyUnicode_Check(unicode)) {
3606 PyErr_BadArgument();
3607 return NULL;
3608 }
3609 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3610 PyUnicode_GET_SIZE(unicode),
3611 NULL);
3612}
3613
/* --- 7-bit ASCII Codec -------------------------------------------------- */
3615
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003617 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 const char *errors)
3619{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 PyUnicodeObject *v;
3622 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003623 Py_ssize_t startinpos;
3624 Py_ssize_t endinpos;
3625 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 const char *e;
3627 PyObject *errorHandler = NULL;
3628 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003629
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003631 if (size == 1 && *(unsigned char*)s < 128) {
3632 Py_UNICODE r = *(unsigned char*)s;
3633 return PyUnicode_FromUnicode(&r, 1);
3634 }
Tim Petersced69f82003-09-16 20:30:58 +00003635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 v = _PyUnicode_New(size);
3637 if (v == NULL)
3638 goto onError;
3639 if (size == 0)
3640 return (PyObject *)v;
3641 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 e = s + size;
3643 while (s < e) {
3644 register unsigned char c = (unsigned char)*s;
3645 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 ++s;
3648 }
3649 else {
3650 startinpos = s-starts;
3651 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003652 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 if (unicode_decode_call_errorhandler(
3654 errors, &errorHandler,
3655 "ascii", "ordinal not in range(128)",
3656 starts, size, &startinpos, &endinpos, &exc, &s,
3657 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003661 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003662 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003663 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 Py_XDECREF(errorHandler);
3665 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003667
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 onError:
3669 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 Py_XDECREF(errorHandler);
3671 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 return NULL;
3673}
3674
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003676 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 const char *errors)
3678{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680}
3681
3682PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3683{
3684 if (!PyUnicode_Check(unicode)) {
3685 PyErr_BadArgument();
3686 return NULL;
3687 }
3688 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3689 PyUnicode_GET_SIZE(unicode),
3690 NULL);
3691}
3692
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003693#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003694
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003696
/* The MBCS helpers below take int sizes.  When Py_ssize_t is wider than
   int, the public entry points process their input in chunks of at most
   INT_MAX and need the retry loop. */
#if SIZEOF_SSIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
3700
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see IsDBCSLeadByteEx documentation. */
3705
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the preceding character does
   not start with a lead byte, or when the preceding character is exactly
   two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3716
3717/*
3718 * Decode MBCS string into unicode object. If 'final' is set, converts
3719 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3720 */
3721static int decode_mbcs(PyUnicodeObject **v,
3722 const char *s, /* MBCS string */
3723 int size, /* sizeof MBCS string */
3724 int final)
3725{
3726 Py_UNICODE *p;
3727 Py_ssize_t n = 0;
3728 int usize = 0;
3729
3730 assert(size >= 0);
3731
3732 /* Skip trailing lead-byte unless 'final' is set */
3733 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3734 --size;
3735
3736 /* First get the size of the result */
3737 if (size > 0) {
3738 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3739 if (usize == 0) {
3740 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3741 return -1;
3742 }
3743 }
3744
3745 if (*v == NULL) {
3746 /* Create unicode object */
3747 *v = _PyUnicode_New(usize);
3748 if (*v == NULL)
3749 return -1;
3750 }
3751 else {
3752 /* Extend unicode object */
3753 n = PyUnicode_GET_SIZE(*v);
3754 if (_PyUnicode_Resize(v, n + usize) < 0)
3755 return -1;
3756 }
3757
3758 /* Do the conversion */
3759 if (size > 0) {
3760 p = PyUnicode_AS_UNICODE(*v) + n;
3761 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3762 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3763 return -1;
3764 }
3765 }
3766
3767 return size;
3768}
3769
3770PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3771 Py_ssize_t size,
3772 const char *errors,
3773 Py_ssize_t *consumed)
3774{
3775 PyUnicodeObject *v = NULL;
3776 int done;
3777
3778 if (consumed)
3779 *consumed = 0;
3780
3781#ifdef NEED_RETRY
3782 retry:
3783 if (size > INT_MAX)
3784 done = decode_mbcs(&v, s, INT_MAX, 0);
3785 else
3786#endif
3787 done = decode_mbcs(&v, s, (int)size, !consumed);
3788
3789 if (done < 0) {
3790 Py_XDECREF(v);
3791 return NULL;
3792 }
3793
3794 if (consumed)
3795 *consumed += done;
3796
3797#ifdef NEED_RETRY
3798 if (size > INT_MAX) {
3799 s += done;
3800 size -= done;
3801 goto retry;
3802 }
3803#endif
3804
3805 return (PyObject *)v;
3806}
3807
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003808PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003809 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003810 const char *errors)
3811{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003812 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3813}
3814
3815/*
3816 * Convert unicode into string object (MBCS).
3817 * Returns 0 if succeed, -1 otherwise.
3818 */
3819static int encode_mbcs(PyObject **repr,
3820 const Py_UNICODE *p, /* unicode */
3821 int size) /* size of unicode */
3822{
3823 int mbcssize = 0;
3824 Py_ssize_t n = 0;
3825
3826 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003827
3828 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003829 if (size > 0) {
3830 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3831 if (mbcssize == 0) {
3832 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3833 return -1;
3834 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003835 }
3836
Martin v. Löwisd8251432006-06-14 05:21:04 +00003837 if (*repr == NULL) {
3838 /* Create string object */
3839 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3840 if (*repr == NULL)
3841 return -1;
3842 }
3843 else {
3844 /* Extend string object */
3845 n = PyString_Size(*repr);
3846 if (_PyString_Resize(repr, n + mbcssize) < 0)
3847 return -1;
3848 }
3849
3850 /* Do the conversion */
3851 if (size > 0) {
3852 char *s = PyString_AS_STRING(*repr) + n;
3853 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3854 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3855 return -1;
3856 }
3857 }
3858
3859 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003860}
3861
3862PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003863 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003864 const char *errors)
3865{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003866 PyObject *repr = NULL;
3867 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003868
Martin v. Löwisd8251432006-06-14 05:21:04 +00003869#ifdef NEED_RETRY
3870 retry:
3871 if (size > INT_MAX)
3872 ret = encode_mbcs(&repr, p, INT_MAX);
3873 else
3874#endif
3875 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003876
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877 if (ret < 0) {
3878 Py_XDECREF(repr);
3879 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003880 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003881
3882#ifdef NEED_RETRY
3883 if (size > INT_MAX) {
3884 p += INT_MAX;
3885 size -= INT_MAX;
3886 goto retry;
3887 }
3888#endif
3889
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003890 return repr;
3891}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003892
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003893PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3894{
3895 if (!PyUnicode_Check(unicode)) {
3896 PyErr_BadArgument();
3897 return NULL;
3898 }
3899 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3900 PyUnicode_GET_SIZE(unicode),
3901 NULL);
3902}
3903
Martin v. Löwisd8251432006-06-14 05:21:04 +00003904#undef NEED_RETRY
3905
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003906#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003907
/* --- Character Mapping Codec -------------------------------------------- */
3909
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003911 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 PyObject *mapping,
3913 const char *errors)
3914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003916 Py_ssize_t startinpos;
3917 Py_ssize_t endinpos;
3918 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 PyUnicodeObject *v;
3921 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003922 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 PyObject *errorHandler = NULL;
3924 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003925 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003926 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003927
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 /* Default to Latin-1 */
3929 if (mapping == NULL)
3930 return PyUnicode_DecodeLatin1(s, size, errors);
3931
3932 v = _PyUnicode_New(size);
3933 if (v == NULL)
3934 goto onError;
3935 if (size == 0)
3936 return (PyObject *)v;
3937 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003939 if (PyUnicode_CheckExact(mapping)) {
3940 mapstring = PyUnicode_AS_UNICODE(mapping);
3941 maplen = PyUnicode_GET_SIZE(mapping);
3942 while (s < e) {
3943 unsigned char ch = *s;
3944 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003946 if (ch < maplen)
3947 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003949 if (x == 0xfffe) {
3950 /* undefined mapping */
3951 outpos = p-PyUnicode_AS_UNICODE(v);
3952 startinpos = s-starts;
3953 endinpos = startinpos+1;
3954 if (unicode_decode_call_errorhandler(
3955 errors, &errorHandler,
3956 "charmap", "character maps to <undefined>",
3957 starts, size, &startinpos, &endinpos, &exc, &s,
3958 (PyObject **)&v, &outpos, &p)) {
3959 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003960 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003961 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003962 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003963 *p++ = x;
3964 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003966 }
3967 else {
3968 while (s < e) {
3969 unsigned char ch = *s;
3970 PyObject *w, *x;
3971
3972 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3973 w = PyInt_FromLong((long)ch);
3974 if (w == NULL)
3975 goto onError;
3976 x = PyObject_GetItem(mapping, w);
3977 Py_DECREF(w);
3978 if (x == NULL) {
3979 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3980 /* No mapping found means: mapping is undefined. */
3981 PyErr_Clear();
3982 x = Py_None;
3983 Py_INCREF(x);
3984 } else
3985 goto onError;
3986 }
3987
3988 /* Apply mapping */
3989 if (PyInt_Check(x)) {
3990 long value = PyInt_AS_LONG(x);
3991 if (value < 0 || value > 65535) {
3992 PyErr_SetString(PyExc_TypeError,
3993 "character mapping must be in range(65536)");
3994 Py_DECREF(x);
3995 goto onError;
3996 }
3997 *p++ = (Py_UNICODE)value;
3998 }
3999 else if (x == Py_None) {
4000 /* undefined mapping */
4001 outpos = p-PyUnicode_AS_UNICODE(v);
4002 startinpos = s-starts;
4003 endinpos = startinpos+1;
4004 if (unicode_decode_call_errorhandler(
4005 errors, &errorHandler,
4006 "charmap", "character maps to <undefined>",
4007 starts, size, &startinpos, &endinpos, &exc, &s,
4008 (PyObject **)&v, &outpos, &p)) {
4009 Py_DECREF(x);
4010 goto onError;
4011 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004012 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004013 continue;
4014 }
4015 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004016 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004017
4018 if (targetsize == 1)
4019 /* 1-1 mapping */
4020 *p++ = *PyUnicode_AS_UNICODE(x);
4021
4022 else if (targetsize > 1) {
4023 /* 1-n mapping */
4024 if (targetsize > extrachars) {
4025 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004026 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4027 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004028 (targetsize << 2);
4029 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00004030 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004031 if (_PyUnicode_Resize(&v,
4032 PyUnicode_GET_SIZE(v) + needed) < 0) {
4033 Py_DECREF(x);
4034 goto onError;
4035 }
4036 p = PyUnicode_AS_UNICODE(v) + oldpos;
4037 }
4038 Py_UNICODE_COPY(p,
4039 PyUnicode_AS_UNICODE(x),
4040 targetsize);
4041 p += targetsize;
4042 extrachars -= targetsize;
4043 }
4044 /* 1-0 mapping: skip the character */
4045 }
4046 else {
4047 /* wrong return value */
4048 PyErr_SetString(PyExc_TypeError,
4049 "character mapping must return integer, None or unicode");
4050 Py_DECREF(x);
4051 goto onError;
4052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004054 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 }
4057 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004058 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 Py_XDECREF(errorHandler);
4061 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004063
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 Py_XDECREF(errorHandler);
4066 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 Py_XDECREF(v);
4068 return NULL;
4069}
4070
/* Charmap encoding: the lookup table */
4072
4073struct encoding_map{
4074 PyObject_HEAD
4075 unsigned char level1[32];
4076 int count2, count3;
4077 unsigned char level23[1];
4078};
4079
4080static PyObject*
4081encoding_map_size(PyObject *obj, PyObject* args)
4082{
4083 struct encoding_map *map = (struct encoding_map*)obj;
4084 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4085 128*map->count3);
4086}
4087
4088static PyMethodDef encoding_map_methods[] = {
4089 {"size", encoding_map_size, METH_NOARGS,
4090 PyDoc_STR("Return the size (in bytes) of this object") },
4091 { 0 }
4092};
4093
4094static void
4095encoding_map_dealloc(PyObject* o)
4096{
4097 PyObject_FREE(o);
4098}
4099
4100static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004101 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004102 "EncodingMap", /*tp_name*/
4103 sizeof(struct encoding_map), /*tp_basicsize*/
4104 0, /*tp_itemsize*/
4105 /* methods */
4106 encoding_map_dealloc, /*tp_dealloc*/
4107 0, /*tp_print*/
4108 0, /*tp_getattr*/
4109 0, /*tp_setattr*/
4110 0, /*tp_compare*/
4111 0, /*tp_repr*/
4112 0, /*tp_as_number*/
4113 0, /*tp_as_sequence*/
4114 0, /*tp_as_mapping*/
4115 0, /*tp_hash*/
4116 0, /*tp_call*/
4117 0, /*tp_str*/
4118 0, /*tp_getattro*/
4119 0, /*tp_setattro*/
4120 0, /*tp_as_buffer*/
4121 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4122 0, /*tp_doc*/
4123 0, /*tp_traverse*/
4124 0, /*tp_clear*/
4125 0, /*tp_richcompare*/
4126 0, /*tp_weaklistoffset*/
4127 0, /*tp_iter*/
4128 0, /*tp_iternext*/
4129 encoding_map_methods, /*tp_methods*/
4130 0, /*tp_members*/
4131 0, /*tp_getset*/
4132 0, /*tp_base*/
4133 0, /*tp_dict*/
4134 0, /*tp_descr_get*/
4135 0, /*tp_descr_set*/
4136 0, /*tp_dictoffset*/
4137 0, /*tp_init*/
4138 0, /*tp_alloc*/
4139 0, /*tp_new*/
4140 0, /*tp_free*/
4141 0, /*tp_is_gc*/
4142};
4143
4144PyObject*
4145PyUnicode_BuildEncodingMap(PyObject* string)
4146{
4147 Py_UNICODE *decode;
4148 PyObject *result;
4149 struct encoding_map *mresult;
4150 int i;
4151 int need_dict = 0;
4152 unsigned char level1[32];
4153 unsigned char level2[512];
4154 unsigned char *mlevel1, *mlevel2, *mlevel3;
4155 int count2 = 0, count3 = 0;
4156
4157 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4158 PyErr_BadArgument();
4159 return NULL;
4160 }
4161 decode = PyUnicode_AS_UNICODE(string);
4162 memset(level1, 0xFF, sizeof level1);
4163 memset(level2, 0xFF, sizeof level2);
4164
4165 /* If there isn't a one-to-one mapping of NULL to \0,
4166 or if there are non-BMP characters, we need to use
4167 a mapping dictionary. */
4168 if (decode[0] != 0)
4169 need_dict = 1;
4170 for (i = 1; i < 256; i++) {
4171 int l1, l2;
4172 if (decode[i] == 0
4173 #ifdef Py_UNICODE_WIDE
4174 || decode[i] > 0xFFFF
4175 #endif
4176 ) {
4177 need_dict = 1;
4178 break;
4179 }
4180 if (decode[i] == 0xFFFE)
4181 /* unmapped character */
4182 continue;
4183 l1 = decode[i] >> 11;
4184 l2 = decode[i] >> 7;
4185 if (level1[l1] == 0xFF)
4186 level1[l1] = count2++;
4187 if (level2[l2] == 0xFF)
4188 level2[l2] = count3++;
4189 }
4190
4191 if (count2 >= 0xFF || count3 >= 0xFF)
4192 need_dict = 1;
4193
4194 if (need_dict) {
4195 PyObject *result = PyDict_New();
4196 PyObject *key, *value;
4197 if (!result)
4198 return NULL;
4199 for (i = 0; i < 256; i++) {
4200 key = value = NULL;
4201 key = PyInt_FromLong(decode[i]);
4202 value = PyInt_FromLong(i);
4203 if (!key || !value)
4204 goto failed1;
4205 if (PyDict_SetItem(result, key, value) == -1)
4206 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004207 Py_DECREF(key);
4208 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004209 }
4210 return result;
4211 failed1:
4212 Py_XDECREF(key);
4213 Py_XDECREF(value);
4214 Py_DECREF(result);
4215 return NULL;
4216 }
4217
4218 /* Create a three-level trie */
4219 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4220 16*count2 + 128*count3 - 1);
4221 if (!result)
4222 return PyErr_NoMemory();
4223 PyObject_Init(result, &EncodingMapType);
4224 mresult = (struct encoding_map*)result;
4225 mresult->count2 = count2;
4226 mresult->count3 = count3;
4227 mlevel1 = mresult->level1;
4228 mlevel2 = mresult->level23;
4229 mlevel3 = mresult->level23 + 16*count2;
4230 memcpy(mlevel1, level1, 32);
4231 memset(mlevel2, 0xFF, 16*count2);
4232 memset(mlevel3, 0, 128*count3);
4233 count3 = 0;
4234 for (i = 1; i < 256; i++) {
4235 int o1, o2, o3, i2, i3;
4236 if (decode[i] == 0xFFFE)
4237 /* unmapped character */
4238 continue;
4239 o1 = decode[i]>>11;
4240 o2 = (decode[i]>>7) & 0xF;
4241 i2 = 16*mlevel1[o1] + o2;
4242 if (mlevel2[i2] == 0xFF)
4243 mlevel2[i2] = count3++;
4244 o3 = decode[i] & 0x7F;
4245 i3 = 128*mlevel2[i2] + o3;
4246 mlevel3[i3] = i;
4247 }
4248 return result;
4249}
4250
4251static int
4252encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4253{
4254 struct encoding_map *map = (struct encoding_map*)mapping;
4255 int l1 = c>>11;
4256 int l2 = (c>>7) & 0xF;
4257 int l3 = c & 0x7F;
4258 int i;
4259
4260#ifdef Py_UNICODE_WIDE
4261 if (c > 0xFFFF) {
4262 return -1;
4263 }
4264#endif
4265 if (c == 0)
4266 return 0;
4267 /* level 1*/
4268 i = map->level1[l1];
4269 if (i == 0xFF) {
4270 return -1;
4271 }
4272 /* level 2*/
4273 i = map->level23[16*i+l2];
4274 if (i == 0xFF) {
4275 return -1;
4276 }
4277 /* level 3 */
4278 i = map->level23[16*map->count2 + 128*i + l3];
4279 if (i == 0) {
4280 return -1;
4281 }
4282 return i;
4283}
4284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285/* Lookup the character ch in the mapping. If the character
4286 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004287 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 PyObject *w = PyInt_FromLong((long)c);
4291 PyObject *x;
4292
4293 if (w == NULL)
4294 return NULL;
4295 x = PyObject_GetItem(mapping, w);
4296 Py_DECREF(w);
4297 if (x == NULL) {
4298 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4299 /* No mapping found means: mapping is undefined. */
4300 PyErr_Clear();
4301 x = Py_None;
4302 Py_INCREF(x);
4303 return x;
4304 } else
4305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004307 else if (x == Py_None)
4308 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309 else if (PyInt_Check(x)) {
4310 long value = PyInt_AS_LONG(x);
4311 if (value < 0 || value > 255) {
4312 PyErr_SetString(PyExc_TypeError,
4313 "character mapping must be in range(256)");
4314 Py_DECREF(x);
4315 return NULL;
4316 }
4317 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 else if (PyString_Check(x))
4320 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 /* wrong return value */
4323 PyErr_SetString(PyExc_TypeError,
4324 "character mapping must return integer, None or str");
4325 Py_DECREF(x);
4326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 }
4328}
4329
Martin v. Löwis3f767792006-06-04 19:36:28 +00004330static int
4331charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4332{
4333 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4334 /* exponentially overallocate to minimize reallocations */
4335 if (requiredsize < 2*outsize)
4336 requiredsize = 2*outsize;
4337 if (_PyString_Resize(outobj, requiredsize)) {
4338 return 0;
4339 }
4340 return 1;
4341}
4342
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346/* lookup the character, put the result in the output string and adjust
4347 various state variables. Reallocate the output string if not enough
4348 space is available. Return a new reference to the object that
4349 was put in the output buffer, or Py_None, if the mapping was undefined
4350 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004351 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004353charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004354 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004356 PyObject *rep;
4357 char *outstart;
4358 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359
Christian Heimese93237d2007-12-19 02:37:44 +00004360 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004361 int res = encoding_map_lookup(c, mapping);
4362 Py_ssize_t requiredsize = *outpos+1;
4363 if (res == -1)
4364 return enc_FAILED;
4365 if (outsize<requiredsize)
4366 if (!charmapencode_resize(outobj, outpos, requiredsize))
4367 return enc_EXCEPTION;
4368 outstart = PyString_AS_STRING(*outobj);
4369 outstart[(*outpos)++] = (char)res;
4370 return enc_SUCCESS;
4371 }
4372
4373 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004375 return enc_EXCEPTION;
4376 else if (rep==Py_None) {
4377 Py_DECREF(rep);
4378 return enc_FAILED;
4379 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004382 if (outsize<requiredsize)
4383 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004385 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004387 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4389 }
4390 else {
4391 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004392 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4393 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004394 if (outsize<requiredsize)
4395 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004397 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 memcpy(outstart + *outpos, repchars, repsize);
4401 *outpos += repsize;
4402 }
4403 }
Georg Brandl9f167602006-06-04 21:46:16 +00004404 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406}
4407
4408/* handle an error in PyUnicode_EncodeCharmap
4409 Return 0 on success, -1 on error */
4410static
4411int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004412 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004414 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416{
4417 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004418 Py_ssize_t repsize;
4419 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 Py_UNICODE *uni2;
4421 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004422 Py_ssize_t collstartpos = *inpos;
4423 Py_ssize_t collendpos = *inpos+1;
4424 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 char *encoding = "charmap";
4426 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004427 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 /* find all unencodable characters */
4430 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004431 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004432 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004433 int res = encoding_map_lookup(p[collendpos], mapping);
4434 if (res != -1)
4435 break;
4436 ++collendpos;
4437 continue;
4438 }
4439
4440 rep = charmapencode_lookup(p[collendpos], mapping);
4441 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004443 else if (rep!=Py_None) {
4444 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 break;
4446 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004447 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 ++collendpos;
4449 }
4450 /* cache callback name lookup
4451 * (if not done yet, i.e. it's the first error) */
4452 if (*known_errorHandler==-1) {
4453 if ((errors==NULL) || (!strcmp(errors, "strict")))
4454 *known_errorHandler = 1;
4455 else if (!strcmp(errors, "replace"))
4456 *known_errorHandler = 2;
4457 else if (!strcmp(errors, "ignore"))
4458 *known_errorHandler = 3;
4459 else if (!strcmp(errors, "xmlcharrefreplace"))
4460 *known_errorHandler = 4;
4461 else
4462 *known_errorHandler = 0;
4463 }
4464 switch (*known_errorHandler) {
4465 case 1: /* strict */
4466 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4467 return -1;
4468 case 2: /* replace */
4469 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4470 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004471 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 return -1;
4473 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004474 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4476 return -1;
4477 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 }
4479 /* fall through */
4480 case 3: /* ignore */
4481 *inpos = collendpos;
4482 break;
4483 case 4: /* xmlcharrefreplace */
4484 /* generate replacement (temporarily (mis)uses p) */
4485 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4486 char buffer[2+29+1+1];
4487 char *cp;
4488 sprintf(buffer, "&#%d;", (int)p[collpos]);
4489 for (cp = buffer; *cp; ++cp) {
4490 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004491 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004493 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4495 return -1;
4496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497 }
4498 }
4499 *inpos = collendpos;
4500 break;
4501 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004502 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 encoding, reason, p, size, exceptionObject,
4504 collstartpos, collendpos, &newpos);
4505 if (repunicode == NULL)
4506 return -1;
4507 /* generate replacement */
4508 repsize = PyUnicode_GET_SIZE(repunicode);
4509 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4510 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004511 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 return -1;
4513 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004514 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4517 return -1;
4518 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 }
4520 *inpos = newpos;
4521 Py_DECREF(repunicode);
4522 }
4523 return 0;
4524}
4525
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004527 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 PyObject *mapping,
4529 const char *errors)
4530{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 /* output object */
4532 PyObject *res = NULL;
4533 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004534 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004536 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 PyObject *errorHandler = NULL;
4538 PyObject *exc = NULL;
4539 /* the following variable is used for caching string comparisons
4540 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4541 * 3=ignore, 4=xmlcharrefreplace */
4542 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543
4544 /* Default to Latin-1 */
4545 if (mapping == NULL)
4546 return PyUnicode_EncodeLatin1(p, size, errors);
4547
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 /* allocate enough for a simple encoding without
4549 replacements, if we need more, we'll resize */
4550 res = PyString_FromStringAndSize(NULL, size);
4551 if (res == NULL)
4552 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004553 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 while (inpos<size) {
4557 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004558 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4559 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004561 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 if (charmap_encoding_error(p, size, &inpos, mapping,
4563 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004564 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004565 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004566 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004567 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 else
4570 /* done with this character => adjust input position */
4571 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 /* Resize if we allocated to much */
4575 if (respos<PyString_GET_SIZE(res)) {
4576 if (_PyString_Resize(&res, respos))
4577 goto onError;
4578 }
4579 Py_XDECREF(exc);
4580 Py_XDECREF(errorHandler);
4581 return res;
4582
4583 onError:
4584 Py_XDECREF(res);
4585 Py_XDECREF(exc);
4586 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587 return NULL;
4588}
4589
4590PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4591 PyObject *mapping)
4592{
4593 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4594 PyErr_BadArgument();
4595 return NULL;
4596 }
4597 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4598 PyUnicode_GET_SIZE(unicode),
4599 mapping,
4600 NULL);
4601}
4602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603/* create or adjust a UnicodeTranslateError */
4604static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004605 const Py_UNICODE *unicode, Py_ssize_t size,
4606 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 if (*exceptionObject == NULL) {
4610 *exceptionObject = PyUnicodeTranslateError_Create(
4611 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 }
4613 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4615 goto onError;
4616 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4617 goto onError;
4618 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4619 goto onError;
4620 return;
4621 onError:
4622 Py_DECREF(*exceptionObject);
4623 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624 }
4625}
4626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627/* raises a UnicodeTranslateError */
4628static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004629 const Py_UNICODE *unicode, Py_ssize_t size,
4630 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 const char *reason)
4632{
4633 make_translate_exception(exceptionObject,
4634 unicode, size, startpos, endpos, reason);
4635 if (*exceptionObject != NULL)
4636 PyCodec_StrictErrors(*exceptionObject);
4637}
4638
4639/* error handling callback helper:
4640 build arguments, call the callback and check the arguments,
4641 put the result into newpos and return the replacement string, which
4642 has to be freed by the caller */
4643static PyObject *unicode_translate_call_errorhandler(const char *errors,
4644 PyObject **errorHandler,
4645 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004646 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4647 Py_ssize_t startpos, Py_ssize_t endpos,
4648 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004650 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651
Martin v. Löwis412fb672006-04-13 06:34:32 +00004652 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 PyObject *restuple;
4654 PyObject *resunicode;
4655
4656 if (*errorHandler == NULL) {
4657 *errorHandler = PyCodec_LookupError(errors);
4658 if (*errorHandler == NULL)
4659 return NULL;
4660 }
4661
4662 make_translate_exception(exceptionObject,
4663 unicode, size, startpos, endpos, reason);
4664 if (*exceptionObject == NULL)
4665 return NULL;
4666
4667 restuple = PyObject_CallFunctionObjArgs(
4668 *errorHandler, *exceptionObject, NULL);
4669 if (restuple == NULL)
4670 return NULL;
4671 if (!PyTuple_Check(restuple)) {
4672 PyErr_Format(PyExc_TypeError, &argparse[4]);
4673 Py_DECREF(restuple);
4674 return NULL;
4675 }
4676 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004677 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 Py_DECREF(restuple);
4679 return NULL;
4680 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004681 if (i_newpos<0)
4682 *newpos = size+i_newpos;
4683 else
4684 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004685 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004686 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004687 Py_DECREF(restuple);
4688 return NULL;
4689 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 Py_INCREF(resunicode);
4691 Py_DECREF(restuple);
4692 return resunicode;
4693}
4694
4695/* Lookup the character ch in the mapping and put the result in result,
4696 which must be decrefed by the caller.
4697 Return 0 on success, -1 on error */
4698static
4699int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4700{
4701 PyObject *w = PyInt_FromLong((long)c);
4702 PyObject *x;
4703
4704 if (w == NULL)
4705 return -1;
4706 x = PyObject_GetItem(mapping, w);
4707 Py_DECREF(w);
4708 if (x == NULL) {
4709 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4710 /* No mapping found means: use 1:1 mapping. */
4711 PyErr_Clear();
4712 *result = NULL;
4713 return 0;
4714 } else
4715 return -1;
4716 }
4717 else if (x == Py_None) {
4718 *result = x;
4719 return 0;
4720 }
4721 else if (PyInt_Check(x)) {
4722 long value = PyInt_AS_LONG(x);
4723 long max = PyUnicode_GetMax();
4724 if (value < 0 || value > max) {
4725 PyErr_Format(PyExc_TypeError,
4726 "character mapping must be in range(0x%lx)", max+1);
4727 Py_DECREF(x);
4728 return -1;
4729 }
4730 *result = x;
4731 return 0;
4732 }
4733 else if (PyUnicode_Check(x)) {
4734 *result = x;
4735 return 0;
4736 }
4737 else {
4738 /* wrong return value */
4739 PyErr_SetString(PyExc_TypeError,
4740 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004741 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 return -1;
4743 }
4744}
4745/* ensure that *outobj is at least requiredsize characters long,
4746if not reallocate and adjust various state variables.
4747Return 0 on success, -1 on error */
4748static
Walter Dörwald4894c302003-10-24 14:25:28 +00004749int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004750 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004752 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004753 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004755 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004757 if (requiredsize < 2 * oldsize)
4758 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004759 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 return -1;
4761 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 }
4763 return 0;
4764}
4765/* lookup the character, put the result in the output string and adjust
4766 various state variables. Return a new reference to the object that
4767 was put in the output buffer in *result, or Py_None, if the mapping was
4768 undefined (in which case no character was written).
4769 The called must decref result.
4770 Return 0 on success, -1 on error. */
4771static
Walter Dörwald4894c302003-10-24 14:25:28 +00004772int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004773 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004774 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775{
Walter Dörwald4894c302003-10-24 14:25:28 +00004776 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 return -1;
4778 if (*res==NULL) {
4779 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004780 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 }
4782 else if (*res==Py_None)
4783 ;
4784 else if (PyInt_Check(*res)) {
4785 /* no overflow check, because we know that the space is enough */
4786 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4787 }
4788 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004789 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 if (repsize==1) {
4791 /* no overflow check, because we know that the space is enough */
4792 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4793 }
4794 else if (repsize!=0) {
4795 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004796 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004797 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004798 repsize - 1;
4799 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800 return -1;
4801 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4802 *outp += repsize;
4803 }
4804 }
4805 else
4806 return -1;
4807 return 0;
4808}
4809
4810PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004811 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 PyObject *mapping,
4813 const char *errors)
4814{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815 /* output object */
4816 PyObject *res = NULL;
4817 /* pointers to the beginning and end+1 of input */
4818 const Py_UNICODE *startp = p;
4819 const Py_UNICODE *endp = p + size;
4820 /* pointer into the output */
4821 Py_UNICODE *str;
4822 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004823 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 char *reason = "character maps to <undefined>";
4825 PyObject *errorHandler = NULL;
4826 PyObject *exc = NULL;
4827 /* the following variable is used for caching string comparisons
4828 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4829 * 3=ignore, 4=xmlcharrefreplace */
4830 int known_errorHandler = -1;
4831
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 if (mapping == NULL) {
4833 PyErr_BadArgument();
4834 return NULL;
4835 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004836
4837 /* allocate enough for a simple 1:1 translation without
4838 replacements, if we need more, we'll resize */
4839 res = PyUnicode_FromUnicode(NULL, size);
4840 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004841 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 return res;
4844 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 while (p<endp) {
4847 /* try to encode it */
4848 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004849 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 goto onError;
4852 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004853 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004854 if (x!=Py_None) /* it worked => adjust input pointer */
4855 ++p;
4856 else { /* untranslatable character */
4857 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004858 Py_ssize_t repsize;
4859 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 Py_UNICODE *uni2;
4861 /* startpos for collecting untranslatable chars */
4862 const Py_UNICODE *collstart = p;
4863 const Py_UNICODE *collend = p+1;
4864 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 /* find all untranslatable characters */
4867 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004868 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 goto onError;
4870 Py_XDECREF(x);
4871 if (x!=Py_None)
4872 break;
4873 ++collend;
4874 }
4875 /* cache callback name lookup
4876 * (if not done yet, i.e. it's the first error) */
4877 if (known_errorHandler==-1) {
4878 if ((errors==NULL) || (!strcmp(errors, "strict")))
4879 known_errorHandler = 1;
4880 else if (!strcmp(errors, "replace"))
4881 known_errorHandler = 2;
4882 else if (!strcmp(errors, "ignore"))
4883 known_errorHandler = 3;
4884 else if (!strcmp(errors, "xmlcharrefreplace"))
4885 known_errorHandler = 4;
4886 else
4887 known_errorHandler = 0;
4888 }
4889 switch (known_errorHandler) {
4890 case 1: /* strict */
4891 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4892 goto onError;
4893 case 2: /* replace */
4894 /* No need to check for space, this is a 1:1 replacement */
4895 for (coll = collstart; coll<collend; ++coll)
4896 *str++ = '?';
4897 /* fall through */
4898 case 3: /* ignore */
4899 p = collend;
4900 break;
4901 case 4: /* xmlcharrefreplace */
4902 /* generate replacement (temporarily (mis)uses p) */
4903 for (p = collstart; p < collend; ++p) {
4904 char buffer[2+29+1+1];
4905 char *cp;
4906 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004907 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4909 goto onError;
4910 for (cp = buffer; *cp; ++cp)
4911 *str++ = *cp;
4912 }
4913 p = collend;
4914 break;
4915 default:
4916 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4917 reason, startp, size, &exc,
4918 collstart-startp, collend-startp, &newpos);
4919 if (repunicode == NULL)
4920 goto onError;
4921 /* generate replacement */
4922 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004923 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4925 Py_DECREF(repunicode);
4926 goto onError;
4927 }
4928 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4929 *str++ = *uni2;
4930 p = startp + newpos;
4931 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 }
4933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004935 /* Resize if we allocated to much */
4936 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004937 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004938 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004939 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 }
4941 Py_XDECREF(exc);
4942 Py_XDECREF(errorHandler);
4943 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945 onError:
4946 Py_XDECREF(res);
4947 Py_XDECREF(exc);
4948 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949 return NULL;
4950}
4951
4952PyObject *PyUnicode_Translate(PyObject *str,
4953 PyObject *mapping,
4954 const char *errors)
4955{
4956 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004957
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 str = PyUnicode_FromObject(str);
4959 if (str == NULL)
4960 goto onError;
4961 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4962 PyUnicode_GET_SIZE(str),
4963 mapping,
4964 errors);
4965 Py_DECREF(str);
4966 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004967
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 onError:
4969 Py_XDECREF(str);
4970 return NULL;
4971}
Tim Petersced69f82003-09-16 20:30:58 +00004972
Guido van Rossum9e896b32000-04-05 20:11:21 +00004973/* --- Decimal Encoder ---------------------------------------------------- */
4974
4975int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004976 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004977 char *output,
4978 const char *errors)
4979{
4980 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004981 PyObject *errorHandler = NULL;
4982 PyObject *exc = NULL;
4983 const char *encoding = "decimal";
4984 const char *reason = "invalid decimal Unicode string";
4985 /* the following variable is used for caching string comparisons
4986 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4987 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004988
4989 if (output == NULL) {
4990 PyErr_BadArgument();
4991 return -1;
4992 }
4993
4994 p = s;
4995 end = s + length;
4996 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004997 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004999 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005000 Py_ssize_t repsize;
5001 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 Py_UNICODE *uni2;
5003 Py_UNICODE *collstart;
5004 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005005
Guido van Rossum9e896b32000-04-05 20:11:21 +00005006 if (Py_UNICODE_ISSPACE(ch)) {
5007 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005009 continue;
5010 }
5011 decimal = Py_UNICODE_TODECIMAL(ch);
5012 if (decimal >= 0) {
5013 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005015 continue;
5016 }
Guido van Rossumba477042000-04-06 18:18:10 +00005017 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005018 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005020 continue;
5021 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 /* All other characters are considered unencodable */
5023 collstart = p;
5024 collend = p+1;
5025 while (collend < end) {
5026 if ((0 < *collend && *collend < 256) ||
5027 !Py_UNICODE_ISSPACE(*collend) ||
5028 Py_UNICODE_TODECIMAL(*collend))
5029 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 /* cache callback name lookup
5032 * (if not done yet, i.e. it's the first error) */
5033 if (known_errorHandler==-1) {
5034 if ((errors==NULL) || (!strcmp(errors, "strict")))
5035 known_errorHandler = 1;
5036 else if (!strcmp(errors, "replace"))
5037 known_errorHandler = 2;
5038 else if (!strcmp(errors, "ignore"))
5039 known_errorHandler = 3;
5040 else if (!strcmp(errors, "xmlcharrefreplace"))
5041 known_errorHandler = 4;
5042 else
5043 known_errorHandler = 0;
5044 }
5045 switch (known_errorHandler) {
5046 case 1: /* strict */
5047 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5048 goto onError;
5049 case 2: /* replace */
5050 for (p = collstart; p < collend; ++p)
5051 *output++ = '?';
5052 /* fall through */
5053 case 3: /* ignore */
5054 p = collend;
5055 break;
5056 case 4: /* xmlcharrefreplace */
5057 /* generate replacement (temporarily (mis)uses p) */
5058 for (p = collstart; p < collend; ++p)
5059 output += sprintf(output, "&#%d;", (int)*p);
5060 p = collend;
5061 break;
5062 default:
5063 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5064 encoding, reason, s, length, &exc,
5065 collstart-s, collend-s, &newpos);
5066 if (repunicode == NULL)
5067 goto onError;
5068 /* generate replacement */
5069 repsize = PyUnicode_GET_SIZE(repunicode);
5070 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5071 Py_UNICODE ch = *uni2;
5072 if (Py_UNICODE_ISSPACE(ch))
5073 *output++ = ' ';
5074 else {
5075 decimal = Py_UNICODE_TODECIMAL(ch);
5076 if (decimal >= 0)
5077 *output++ = '0' + decimal;
5078 else if (0 < ch && ch < 256)
5079 *output++ = (char)ch;
5080 else {
5081 Py_DECREF(repunicode);
5082 raise_encode_exception(&exc, encoding,
5083 s, length, collstart-s, collend-s, reason);
5084 goto onError;
5085 }
5086 }
5087 }
5088 p = s + newpos;
5089 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005090 }
5091 }
5092 /* 0-terminate the output string */
5093 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005094 Py_XDECREF(exc);
5095 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005096 return 0;
5097
5098 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099 Py_XDECREF(exc);
5100 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005101 return -1;
5102}
5103
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104/* --- Helpers ------------------------------------------------------------ */
5105
Eric Smitha9f7d622008-02-17 19:46:49 +00005106#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005107
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005108#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005109
Fredrik Lundha50d2012006-05-26 17:04:58 +00005110#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005111
5112#include "stringlib/count.h"
5113#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005114#include "stringlib/partition.h"
5115
/* helper macro to fixup start/end slice values
 *
 * Operates on the variables `start` and `end` that must be in scope at
 * the point of use.  A negative `start` has (obj)->length added and is
 * then clamped to 0; `end` is clamped to (obj)->length, and a negative
 * `end` has (obj)->length added and is then clamped to 0.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays correct as the body of an unbraced if/else.
 */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5128
Martin v. Löwis18e16552006-02-15 17:27:45 +00005129Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005130 PyObject *substr,
5131 Py_ssize_t start,
5132 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005134 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005135 PyUnicodeObject* str_obj;
5136 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005137
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005138 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5139 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005141 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5142 if (!sub_obj) {
5143 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 return -1;
5145 }
Tim Petersced69f82003-09-16 20:30:58 +00005146
Fredrik Lundhc8162812006-05-26 19:33:03 +00005147 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005148
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005149 result = stringlib_count(
5150 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5151 );
5152
5153 Py_DECREF(sub_obj);
5154 Py_DECREF(str_obj);
5155
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 return result;
5157}
5158
Martin v. Löwis18e16552006-02-15 17:27:45 +00005159Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005160 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005161 Py_ssize_t start,
5162 Py_ssize_t end,
5163 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005165 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005166
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005167 str = PyUnicode_FromObject(str);
5168 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005169 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005170 sub = PyUnicode_FromObject(sub);
5171 if (!sub) {
5172 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005173 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 }
Tim Petersced69f82003-09-16 20:30:58 +00005175
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005176 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005177 result = stringlib_find_slice(
5178 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5179 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5180 start, end
5181 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005182 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005183 result = stringlib_rfind_slice(
5184 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5185 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5186 start, end
5187 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005188
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005189 Py_DECREF(str);
5190 Py_DECREF(sub);
5191
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 return result;
5193}
5194
Tim Petersced69f82003-09-16 20:30:58 +00005195static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196int tailmatch(PyUnicodeObject *self,
5197 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005198 Py_ssize_t start,
5199 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 int direction)
5201{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 if (substring->length == 0)
5203 return 1;
5204
Fredrik Lundhc8162812006-05-26 19:33:03 +00005205 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206
5207 end -= substring->length;
5208 if (end < start)
5209 return 0;
5210
5211 if (direction > 0) {
5212 if (Py_UNICODE_MATCH(self, end, substring))
5213 return 1;
5214 } else {
5215 if (Py_UNICODE_MATCH(self, start, substring))
5216 return 1;
5217 }
5218
5219 return 0;
5220}
5221
Martin v. Löwis18e16552006-02-15 17:27:45 +00005222Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005224 Py_ssize_t start,
5225 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 int direction)
5227{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005228 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005229
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 str = PyUnicode_FromObject(str);
5231 if (str == NULL)
5232 return -1;
5233 substr = PyUnicode_FromObject(substr);
5234 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005235 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 return -1;
5237 }
Tim Petersced69f82003-09-16 20:30:58 +00005238
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 result = tailmatch((PyUnicodeObject *)str,
5240 (PyUnicodeObject *)substr,
5241 start, end, direction);
5242 Py_DECREF(str);
5243 Py_DECREF(substr);
5244 return result;
5245}
5246
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247/* Apply fixfct filter to the Unicode object self and return a
5248 reference to the modified object */
5249
Tim Petersced69f82003-09-16 20:30:58 +00005250static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251PyObject *fixup(PyUnicodeObject *self,
5252 int (*fixfct)(PyUnicodeObject *s))
5253{
5254
5255 PyUnicodeObject *u;
5256
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005257 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 if (u == NULL)
5259 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005260
5261 Py_UNICODE_COPY(u->str, self->str, self->length);
5262
Tim Peters7a29bd52001-09-12 03:03:31 +00005263 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 /* fixfct should return TRUE if it modified the buffer. If
5265 FALSE, return a reference to the original buffer instead
5266 (to save space, not time) */
5267 Py_INCREF(self);
5268 Py_DECREF(u);
5269 return (PyObject*) self;
5270 }
5271 return (PyObject*) u;
5272}
5273
Tim Petersced69f82003-09-16 20:30:58 +00005274static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275int fixupper(PyUnicodeObject *self)
5276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005277 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 Py_UNICODE *s = self->str;
5279 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 while (len-- > 0) {
5282 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005283
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 ch = Py_UNICODE_TOUPPER(*s);
5285 if (ch != *s) {
5286 status = 1;
5287 *s = ch;
5288 }
5289 s++;
5290 }
5291
5292 return status;
5293}
5294
Tim Petersced69f82003-09-16 20:30:58 +00005295static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296int fixlower(PyUnicodeObject *self)
5297{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005298 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 Py_UNICODE *s = self->str;
5300 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005301
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 while (len-- > 0) {
5303 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005304
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 ch = Py_UNICODE_TOLOWER(*s);
5306 if (ch != *s) {
5307 status = 1;
5308 *s = ch;
5309 }
5310 s++;
5311 }
5312
5313 return status;
5314}
5315
Tim Petersced69f82003-09-16 20:30:58 +00005316static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317int fixswapcase(PyUnicodeObject *self)
5318{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005319 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320 Py_UNICODE *s = self->str;
5321 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005322
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 while (len-- > 0) {
5324 if (Py_UNICODE_ISUPPER(*s)) {
5325 *s = Py_UNICODE_TOLOWER(*s);
5326 status = 1;
5327 } else if (Py_UNICODE_ISLOWER(*s)) {
5328 *s = Py_UNICODE_TOUPPER(*s);
5329 status = 1;
5330 }
5331 s++;
5332 }
5333
5334 return status;
5335}
5336
Tim Petersced69f82003-09-16 20:30:58 +00005337static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338int fixcapitalize(PyUnicodeObject *self)
5339{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005340 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005341 Py_UNICODE *s = self->str;
5342 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005343
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005344 if (len == 0)
5345 return 0;
5346 if (Py_UNICODE_ISLOWER(*s)) {
5347 *s = Py_UNICODE_TOUPPER(*s);
5348 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005350 s++;
5351 while (--len > 0) {
5352 if (Py_UNICODE_ISUPPER(*s)) {
5353 *s = Py_UNICODE_TOLOWER(*s);
5354 status = 1;
5355 }
5356 s++;
5357 }
5358 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359}
5360
5361static
5362int fixtitle(PyUnicodeObject *self)
5363{
5364 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5365 register Py_UNICODE *e;
5366 int previous_is_cased;
5367
5368 /* Shortcut for single character strings */
5369 if (PyUnicode_GET_SIZE(self) == 1) {
5370 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5371 if (*p != ch) {
5372 *p = ch;
5373 return 1;
5374 }
5375 else
5376 return 0;
5377 }
Tim Petersced69f82003-09-16 20:30:58 +00005378
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 e = p + PyUnicode_GET_SIZE(self);
5380 previous_is_cased = 0;
5381 for (; p < e; p++) {
5382 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005383
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 if (previous_is_cased)
5385 *p = Py_UNICODE_TOLOWER(ch);
5386 else
5387 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005388
5389 if (Py_UNICODE_ISLOWER(ch) ||
5390 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 Py_UNICODE_ISTITLE(ch))
5392 previous_is_cased = 1;
5393 else
5394 previous_is_cased = 0;
5395 }
5396 return 1;
5397}
5398
Tim Peters8ce9f162004-08-27 01:49:32 +00005399PyObject *
5400PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401{
Tim Peters8ce9f162004-08-27 01:49:32 +00005402 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005403 const Py_UNICODE blank = ' ';
5404 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005405 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005406 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005407 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5408 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005409 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5410 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005411 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005412 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005413 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Tim Peters05eba1f2004-08-27 21:32:02 +00005415 fseq = PySequence_Fast(seq, "");
5416 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005417 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005418 }
5419
Tim Peters91879ab2004-08-27 22:35:44 +00005420 /* Grrrr. A codec may be invoked to convert str objects to
5421 * Unicode, and so it's possible to call back into Python code
5422 * during PyUnicode_FromObject(), and so it's possible for a sick
5423 * codec to change the size of fseq (if seq is a list). Therefore
5424 * we have to keep refetching the size -- can't assume seqlen
5425 * is invariant.
5426 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005427 seqlen = PySequence_Fast_GET_SIZE(fseq);
5428 /* If empty sequence, return u"". */
5429 if (seqlen == 0) {
5430 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5431 goto Done;
5432 }
5433 /* If singleton sequence with an exact Unicode, return that. */
5434 if (seqlen == 1) {
5435 item = PySequence_Fast_GET_ITEM(fseq, 0);
5436 if (PyUnicode_CheckExact(item)) {
5437 Py_INCREF(item);
5438 res = (PyUnicodeObject *)item;
5439 goto Done;
5440 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005441 }
5442
Tim Peters05eba1f2004-08-27 21:32:02 +00005443 /* At least two items to join, or one that isn't exact Unicode. */
5444 if (seqlen > 1) {
5445 /* Set up sep and seplen -- they're needed. */
5446 if (separator == NULL) {
5447 sep = &blank;
5448 seplen = 1;
5449 }
5450 else {
5451 internal_separator = PyUnicode_FromObject(separator);
5452 if (internal_separator == NULL)
5453 goto onError;
5454 sep = PyUnicode_AS_UNICODE(internal_separator);
5455 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005456 /* In case PyUnicode_FromObject() mutated seq. */
5457 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005458 }
5459 }
5460
5461 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005462 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005463 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005464 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005465 res_p = PyUnicode_AS_UNICODE(res);
5466 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005467
Tim Peters05eba1f2004-08-27 21:32:02 +00005468 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005469 Py_ssize_t itemlen;
5470 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005471
5472 item = PySequence_Fast_GET_ITEM(fseq, i);
5473 /* Convert item to Unicode. */
5474 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5475 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005476 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005477 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005478 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005479 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005480 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005481 item = PyUnicode_FromObject(item);
5482 if (item == NULL)
5483 goto onError;
5484 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005485
Tim Peters91879ab2004-08-27 22:35:44 +00005486 /* In case PyUnicode_FromObject() mutated seq. */
5487 seqlen = PySequence_Fast_GET_SIZE(fseq);
5488
Tim Peters8ce9f162004-08-27 01:49:32 +00005489 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005491 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005492 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005493 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005494 if (i < seqlen - 1) {
5495 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005496 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005497 goto Overflow;
5498 }
5499 if (new_res_used > res_alloc) {
5500 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005501 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005502 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005503 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005504 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005506 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005507 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005509 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005510 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005512
5513 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005514 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005515 res_p += itemlen;
5516 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005517 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005518 res_p += seplen;
5519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005521 res_used = new_res_used;
5522 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005523
Tim Peters05eba1f2004-08-27 21:32:02 +00005524 /* Shrink res to match the used area; this probably can't fail,
5525 * but it's cheap to check.
5526 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005527 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005528 goto onError;
5529
5530 Done:
5531 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005532 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 return (PyObject *)res;
5534
Tim Peters8ce9f162004-08-27 01:49:32 +00005535 Overflow:
5536 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005537 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005538 Py_DECREF(item);
5539 /* fall through */
5540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005542 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005543 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005544 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 return NULL;
5546}
5547
Tim Petersced69f82003-09-16 20:30:58 +00005548static
5549PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005550 Py_ssize_t left,
5551 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 Py_UNICODE fill)
5553{
5554 PyUnicodeObject *u;
5555
5556 if (left < 0)
5557 left = 0;
5558 if (right < 0)
5559 right = 0;
5560
Tim Peters7a29bd52001-09-12 03:03:31 +00005561 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 Py_INCREF(self);
5563 return self;
5564 }
5565
5566 u = _PyUnicode_New(left + self->length + right);
5567 if (u) {
5568 if (left)
5569 Py_UNICODE_FILL(u->str, fill, left);
5570 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5571 if (right)
5572 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5573 }
5574
5575 return u;
5576}
5577
/* Append the slice data[left:right] to `list` as a new unicode object.
 *
 * Relies on names from the expanding function: a `PyObject *str`
 * scratch variable, the target `list`, and an `onError` label that is
 * jumped to when creating or appending the slice fails.  The temporary
 * reference in `str` is released on both paths.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5588
5589static
5590PyObject *split_whitespace(PyUnicodeObject *self,
5591 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005592 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005594 register Py_ssize_t i;
5595 register Py_ssize_t j;
5596 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005598 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
5600 for (i = j = 0; i < len; ) {
5601 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005602 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 i++;
5604 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005605 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 i++;
5607 if (j < i) {
5608 if (maxcount-- <= 0)
5609 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005610 SPLIT_APPEND(buf, j, i);
5611 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 i++;
5613 j = i;
5614 }
5615 }
5616 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005617 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 }
5619 return list;
5620
5621 onError:
5622 Py_DECREF(list);
5623 return NULL;
5624}
5625
5626PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005627 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005629 register Py_ssize_t i;
5630 register Py_ssize_t j;
5631 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 PyObject *list;
5633 PyObject *str;
5634 Py_UNICODE *data;
5635
5636 string = PyUnicode_FromObject(string);
5637 if (string == NULL)
5638 return NULL;
5639 data = PyUnicode_AS_UNICODE(string);
5640 len = PyUnicode_GET_SIZE(string);
5641
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 list = PyList_New(0);
5643 if (!list)
5644 goto onError;
5645
5646 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005647 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005648
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005650 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652
5653 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005654 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 if (i < len) {
5656 if (data[i] == '\r' && i + 1 < len &&
5657 data[i+1] == '\n')
5658 i += 2;
5659 else
5660 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005661 if (keepends)
5662 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 }
Guido van Rossum86662912000-04-11 15:38:46 +00005664 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 j = i;
5666 }
5667 if (j < len) {
5668 SPLIT_APPEND(data, j, len);
5669 }
5670
5671 Py_DECREF(string);
5672 return list;
5673
5674 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005675 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 Py_DECREF(string);
5677 return NULL;
5678}
5679
Tim Petersced69f82003-09-16 20:30:58 +00005680static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681PyObject *split_char(PyUnicodeObject *self,
5682 PyObject *list,
5683 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005684 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005686 register Py_ssize_t i;
5687 register Py_ssize_t j;
5688 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005690 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691
5692 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005693 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 if (maxcount-- <= 0)
5695 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005696 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 i = j = i + 1;
5698 } else
5699 i++;
5700 }
5701 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005702 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 }
5704 return list;
5705
5706 onError:
5707 Py_DECREF(list);
5708 return NULL;
5709}
5710
Tim Petersced69f82003-09-16 20:30:58 +00005711static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712PyObject *split_substring(PyUnicodeObject *self,
5713 PyObject *list,
5714 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005715 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005717 register Py_ssize_t i;
5718 register Py_ssize_t j;
5719 Py_ssize_t len = self->length;
5720 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 PyObject *str;
5722
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005723 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 if (Py_UNICODE_MATCH(self, i, substring)) {
5725 if (maxcount-- <= 0)
5726 break;
5727 SPLIT_APPEND(self->str, j, i);
5728 i = j = i + sublen;
5729 } else
5730 i++;
5731 }
5732 if (j <= len) {
5733 SPLIT_APPEND(self->str, j, len);
5734 }
5735 return list;
5736
5737 onError:
5738 Py_DECREF(list);
5739 return NULL;
5740}
5741
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005742static
5743PyObject *rsplit_whitespace(PyUnicodeObject *self,
5744 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005745 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005746{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005747 register Py_ssize_t i;
5748 register Py_ssize_t j;
5749 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005750 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005751 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005752
5753 for (i = j = len - 1; i >= 0; ) {
5754 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005755 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005756 i--;
5757 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005758 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005759 i--;
5760 if (j > i) {
5761 if (maxcount-- <= 0)
5762 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005763 SPLIT_APPEND(buf, i + 1, j + 1);
5764 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765 i--;
5766 j = i;
5767 }
5768 }
5769 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005770 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005772 if (PyList_Reverse(list) < 0)
5773 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005774 return list;
5775
5776 onError:
5777 Py_DECREF(list);
5778 return NULL;
5779}
5780
5781static
5782PyObject *rsplit_char(PyUnicodeObject *self,
5783 PyObject *list,
5784 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005785 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005786{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005787 register Py_ssize_t i;
5788 register Py_ssize_t j;
5789 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005791 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005792
5793 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005794 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005795 if (maxcount-- <= 0)
5796 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005797 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798 j = i = i - 1;
5799 } else
5800 i--;
5801 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005802 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005803 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005804 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005805 if (PyList_Reverse(list) < 0)
5806 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005807 return list;
5808
5809 onError:
5810 Py_DECREF(list);
5811 return NULL;
5812}
5813
5814static
5815PyObject *rsplit_substring(PyUnicodeObject *self,
5816 PyObject *list,
5817 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005818 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005819{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005820 register Py_ssize_t i;
5821 register Py_ssize_t j;
5822 Py_ssize_t len = self->length;
5823 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005824 PyObject *str;
5825
5826 for (i = len - sublen, j = len; i >= 0; ) {
5827 if (Py_UNICODE_MATCH(self, i, substring)) {
5828 if (maxcount-- <= 0)
5829 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005830 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005831 j = i;
5832 i -= sublen;
5833 } else
5834 i--;
5835 }
5836 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005837 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005839 if (PyList_Reverse(list) < 0)
5840 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005841 return list;
5842
5843 onError:
5844 Py_DECREF(list);
5845 return NULL;
5846}
5847
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848#undef SPLIT_APPEND
5849
5850static
5851PyObject *split(PyUnicodeObject *self,
5852 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005853 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854{
5855 PyObject *list;
5856
5857 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005858 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859
5860 list = PyList_New(0);
5861 if (!list)
5862 return NULL;
5863
5864 if (substring == NULL)
5865 return split_whitespace(self,list,maxcount);
5866
5867 else if (substring->length == 1)
5868 return split_char(self,list,substring->str[0],maxcount);
5869
5870 else if (substring->length == 0) {
5871 Py_DECREF(list);
5872 PyErr_SetString(PyExc_ValueError, "empty separator");
5873 return NULL;
5874 }
5875 else
5876 return split_substring(self,list,substring,maxcount);
5877}
5878
Tim Petersced69f82003-09-16 20:30:58 +00005879static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005880PyObject *rsplit(PyUnicodeObject *self,
5881 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005882 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005883{
5884 PyObject *list;
5885
5886 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005887 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005888
5889 list = PyList_New(0);
5890 if (!list)
5891 return NULL;
5892
5893 if (substring == NULL)
5894 return rsplit_whitespace(self,list,maxcount);
5895
5896 else if (substring->length == 1)
5897 return rsplit_char(self,list,substring->str[0],maxcount);
5898
5899 else if (substring->length == 0) {
5900 Py_DECREF(list);
5901 PyErr_SetString(PyExc_ValueError, "empty separator");
5902 return NULL;
5903 }
5904 else
5905 return rsplit_substring(self,list,substring,maxcount);
5906}
5907
5908static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909PyObject *replace(PyUnicodeObject *self,
5910 PyUnicodeObject *str1,
5911 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005912 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913{
5914 PyUnicodeObject *u;
5915
5916 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005917 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918
Fredrik Lundh347ee272006-05-24 16:35:18 +00005919 if (str1->length == str2->length) {
5920 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005921 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005922 if (str1->length == 1) {
5923 /* replace characters */
5924 Py_UNICODE u1, u2;
5925 if (!findchar(self->str, self->length, str1->str[0]))
5926 goto nothing;
5927 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5928 if (!u)
5929 return NULL;
5930 Py_UNICODE_COPY(u->str, self->str, self->length);
5931 u1 = str1->str[0];
5932 u2 = str2->str[0];
5933 for (i = 0; i < u->length; i++)
5934 if (u->str[i] == u1) {
5935 if (--maxcount < 0)
5936 break;
5937 u->str[i] = u2;
5938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005940 i = fastsearch(
5941 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005943 if (i < 0)
5944 goto nothing;
5945 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5946 if (!u)
5947 return NULL;
5948 Py_UNICODE_COPY(u->str, self->str, self->length);
5949 while (i <= self->length - str1->length)
5950 if (Py_UNICODE_MATCH(self, i, str1)) {
5951 if (--maxcount < 0)
5952 break;
5953 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5954 i += str1->length;
5955 } else
5956 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005959
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005960 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005961 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 Py_UNICODE *p;
5963
5964 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005965 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 if (n > maxcount)
5967 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005968 if (n == 0)
5969 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005970 /* new_size = self->length + n * (str2->length - str1->length)); */
5971 delta = (str2->length - str1->length);
5972 if (delta == 0) {
5973 new_size = self->length;
5974 } else {
5975 product = n * (str2->length - str1->length);
5976 if ((product / (str2->length - str1->length)) != n) {
5977 PyErr_SetString(PyExc_OverflowError,
5978 "replace string is too long");
5979 return NULL;
5980 }
5981 new_size = self->length + product;
5982 if (new_size < 0) {
5983 PyErr_SetString(PyExc_OverflowError,
5984 "replace string is too long");
5985 return NULL;
5986 }
5987 }
5988 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005989 if (!u)
5990 return NULL;
5991 i = 0;
5992 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005993 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005994 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005995 while (n-- > 0) {
5996 /* look for next match */
5997 j = i;
5998 while (j <= e) {
5999 if (Py_UNICODE_MATCH(self, j, str1))
6000 break;
6001 j++;
6002 }
6003 if (j > i) {
6004 if (j > e)
6005 break;
6006 /* copy unchanged part [i:j] */
6007 Py_UNICODE_COPY(p, self->str+i, j-i);
6008 p += j - i;
6009 }
6010 /* copy substitution string */
6011 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006012 Py_UNICODE_COPY(p, str2->str, str2->length);
6013 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006014 }
6015 i = j + str1->length;
6016 }
6017 if (i < self->length)
6018 /* copy tail [i:] */
6019 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006020 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006021 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006022 while (n > 0) {
6023 Py_UNICODE_COPY(p, str2->str, str2->length);
6024 p += str2->length;
6025 if (--n <= 0)
6026 break;
6027 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006029 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 }
6031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006033
6034nothing:
6035 /* nothing to replace; return original string (when possible) */
6036 if (PyUnicode_CheckExact(self)) {
6037 Py_INCREF(self);
6038 return (PyObject *) self;
6039 }
6040 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
6043/* --- Unicode Object Methods --------------------------------------------- */
6044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006045PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046"S.title() -> unicode\n\
6047\n\
6048Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006049characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
6051static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006052unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 return fixup(self, fixtitle);
6055}
6056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006057PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058"S.capitalize() -> unicode\n\
6059\n\
6060Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006061have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062
6063static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006064unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 return fixup(self, fixcapitalize);
6067}
6068
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code (compiled out by the enclosing #if 0).
   Splits self on whitespace, replaces every word in the list by the
   result of fixup(word, fixcapitalize) and joins the words again via
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capped == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capped);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
6106
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006107/* Argument converter. Coerces to a single unicode character */
6108
6109static int
6110convert_uc(PyObject *obj, void *addr)
6111{
6112 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6113 PyObject *uniobj;
6114 Py_UNICODE *unistr;
6115
6116 uniobj = PyUnicode_FromObject(obj);
6117 if (uniobj == NULL) {
6118 PyErr_SetString(PyExc_TypeError,
6119 "The fill character cannot be converted to Unicode");
6120 return 0;
6121 }
6122 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6123 PyErr_SetString(PyExc_TypeError,
6124 "The fill character must be exactly one character long");
6125 Py_DECREF(uniobj);
6126 return 0;
6127 }
6128 unistr = PyUnicode_AS_UNICODE(uniobj);
6129 *fillcharloc = unistr[0];
6130 Py_DECREF(uniobj);
6131 return 1;
6132}
6133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006134PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006135"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006137Return S centered in a Unicode string of length width. Padding is\n\
6138done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
6140static PyObject *
6141unicode_center(PyUnicodeObject *self, PyObject *args)
6142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006143 Py_ssize_t marg, left;
6144 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006145 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
Thomas Woutersde017742006-02-16 19:34:37 +00006147 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 return NULL;
6149
Tim Peters7a29bd52001-09-12 03:03:31 +00006150 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 Py_INCREF(self);
6152 return (PyObject*) self;
6153 }
6154
6155 marg = width - self->length;
6156 left = marg / 2 + (marg & width & 1);
6157
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006158 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159}
6160
Marc-André Lemburge5034372000-08-08 08:04:29 +00006161#if 0
6162
6163/* This code should go into some future Unicode collation support
6164 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006165 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006166
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006167/* speedy UTF-16 code point order comparison */
6168/* gleaned from: */
6169/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6170
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006171static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006172{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006173 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006174 0, 0, 0, 0, 0, 0, 0, 0,
6175 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006176 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006177};
6178
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179static int
6180unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6181{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006182 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006183
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 Py_UNICODE *s1 = str1->str;
6185 Py_UNICODE *s2 = str2->str;
6186
6187 len1 = str1->length;
6188 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006189
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006191 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006192
6193 c1 = *s1++;
6194 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006195
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006196 if (c1 > (1<<11) * 26)
6197 c1 += utf16Fixup[c1>>11];
6198 if (c2 > (1<<11) * 26)
6199 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006200 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006201
6202 if (c1 != c2)
6203 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006204
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006205 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 }
6207
6208 return (len1 < len2) ? -1 : (len1 != len2);
6209}
6210
Marc-André Lemburge5034372000-08-08 08:04:29 +00006211#else
6212
6213static int
6214unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6215{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006216 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006217
6218 Py_UNICODE *s1 = str1->str;
6219 Py_UNICODE *s2 = str2->str;
6220
6221 len1 = str1->length;
6222 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006223
Marc-André Lemburge5034372000-08-08 08:04:29 +00006224 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006225 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006226
Fredrik Lundh45714e92001-06-26 16:39:36 +00006227 c1 = *s1++;
6228 c2 = *s2++;
6229
6230 if (c1 != c2)
6231 return (c1 < c2) ? -1 : 1;
6232
Marc-André Lemburge5034372000-08-08 08:04:29 +00006233 len1--; len2--;
6234 }
6235
6236 return (len1 < len2) ? -1 : (len1 != len2);
6237}
6238
6239#endif
6240
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241int PyUnicode_Compare(PyObject *left,
6242 PyObject *right)
6243{
6244 PyUnicodeObject *u = NULL, *v = NULL;
6245 int result;
6246
6247 /* Coerce the two arguments */
6248 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6249 if (u == NULL)
6250 goto onError;
6251 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6252 if (v == NULL)
6253 goto onError;
6254
Thomas Wouters7e474022000-07-16 12:04:32 +00006255 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 if (v == u) {
6257 Py_DECREF(u);
6258 Py_DECREF(v);
6259 return 0;
6260 }
6261
6262 result = unicode_compare(u, v);
6263
6264 Py_DECREF(u);
6265 Py_DECREF(v);
6266 return result;
6267
6268onError:
6269 Py_XDECREF(u);
6270 Py_XDECREF(v);
6271 return -1;
6272}
6273
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006274PyObject *PyUnicode_RichCompare(PyObject *left,
6275 PyObject *right,
6276 int op)
6277{
6278 int result;
6279
6280 result = PyUnicode_Compare(left, right);
6281 if (result == -1 && PyErr_Occurred())
6282 goto onError;
6283
6284 /* Convert the return value to a Boolean */
6285 switch (op) {
6286 case Py_EQ:
6287 result = (result == 0);
6288 break;
6289 case Py_NE:
6290 result = (result != 0);
6291 break;
6292 case Py_LE:
6293 result = (result <= 0);
6294 break;
6295 case Py_GE:
6296 result = (result >= 0);
6297 break;
6298 case Py_LT:
6299 result = (result == -1);
6300 break;
6301 case Py_GT:
6302 result = (result == 1);
6303 break;
6304 }
6305 return PyBool_FromLong(result);
6306
6307 onError:
6308
6309 /* Standard case
6310
6311 Type errors mean that PyUnicode_FromObject() could not convert
6312 one of the arguments (usually the right hand side) to Unicode,
6313 ie. we can't handle the comparison request. However, it is
6314 possible that the other object knows a comparison method, which
6315 is why we return Py_NotImplemented to give the other object a
6316 chance.
6317
6318 */
6319 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6320 PyErr_Clear();
6321 Py_INCREF(Py_NotImplemented);
6322 return Py_NotImplemented;
6323 }
6324 if (op != Py_EQ && op != Py_NE)
6325 return NULL;
6326
6327 /* Equality comparison.
6328
6329 This is a special case: we silence any PyExc_UnicodeDecodeError
6330 and instead turn it into a PyErr_UnicodeWarning.
6331
6332 */
6333 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6334 return NULL;
6335 PyErr_Clear();
6336 if (PyErr_Warn(PyExc_UnicodeWarning,
6337 (op == Py_EQ) ?
6338 "Unicode equal comparison "
6339 "failed to convert both arguments to Unicode - "
6340 "interpreting them as being unequal" :
6341 "Unicode unequal comparison "
6342 "failed to convert both arguments to Unicode - "
6343 "interpreting them as being unequal"
6344 ) < 0)
6345 return NULL;
6346 result = (op == Py_NE);
6347 return PyBool_FromLong(result);
6348}
6349
Guido van Rossum403d68b2000-03-13 15:55:09 +00006350int PyUnicode_Contains(PyObject *container,
6351 PyObject *element)
6352{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006353 PyObject *str, *sub;
6354 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006355
6356 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006357 sub = PyUnicode_FromObject(element);
6358 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006359 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006360 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006361 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006362 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006363
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006364 str = PyUnicode_FromObject(container);
6365 if (!str) {
6366 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006367 return -1;
6368 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006369
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006370 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006371
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006372 Py_DECREF(str);
6373 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006374
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006375 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006376}
6377
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378/* Concat to string or Unicode object giving a new Unicode object. */
6379
6380PyObject *PyUnicode_Concat(PyObject *left,
6381 PyObject *right)
6382{
6383 PyUnicodeObject *u = NULL, *v = NULL, *w;
6384
6385 /* Coerce the two arguments */
6386 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6387 if (u == NULL)
6388 goto onError;
6389 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6390 if (v == NULL)
6391 goto onError;
6392
6393 /* Shortcuts */
6394 if (v == unicode_empty) {
6395 Py_DECREF(v);
6396 return (PyObject *)u;
6397 }
6398 if (u == unicode_empty) {
6399 Py_DECREF(u);
6400 return (PyObject *)v;
6401 }
6402
6403 /* Concat the two Unicode strings */
6404 w = _PyUnicode_New(u->length + v->length);
6405 if (w == NULL)
6406 goto onError;
6407 Py_UNICODE_COPY(w->str, u->str, u->length);
6408 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6409
6410 Py_DECREF(u);
6411 Py_DECREF(v);
6412 return (PyObject *)w;
6413
6414onError:
6415 Py_XDECREF(u);
6416 Py_XDECREF(v);
6417 return NULL;
6418}
6419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006420PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421"S.count(sub[, start[, end]]) -> int\n\
6422\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006423Return the number of non-overlapping occurrences of substring sub in\n\
6424Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006425interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
6427static PyObject *
6428unicode_count(PyUnicodeObject *self, PyObject *args)
6429{
6430 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006431 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006432 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 PyObject *result;
6434
Guido van Rossumb8872e62000-05-09 14:14:27 +00006435 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6436 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 return NULL;
6438
6439 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006440 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 if (substring == NULL)
6442 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006443
Fredrik Lundhc8162812006-05-26 19:33:03 +00006444 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006446 result = PyInt_FromSsize_t(
6447 stringlib_count(self->str + start, end - start,
6448 substring->str, substring->length)
6449 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450
6451 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 return result;
6454}
6455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006456PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006457"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006459Encodes S using the codec registered for encoding. encoding defaults\n\
6460to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006461handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6463'xmlcharrefreplace' as well as any other name registered with\n\
6464codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465
6466static PyObject *
6467unicode_encode(PyUnicodeObject *self, PyObject *args)
6468{
6469 char *encoding = NULL;
6470 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006471 PyObject *v;
6472
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6474 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006475 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006476 if (v == NULL)
6477 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006478 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6479 PyErr_Format(PyExc_TypeError,
6480 "encoder did not return a string/unicode object "
6481 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006482 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006483 Py_DECREF(v);
6484 return NULL;
6485 }
6486 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006487
6488 onError:
6489 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006490}
6491
6492PyDoc_STRVAR(decode__doc__,
6493"S.decode([encoding[,errors]]) -> string or unicode\n\
6494\n\
6495Decodes S using the codec registered for encoding. encoding defaults\n\
6496to the default encoding. errors may be given to set a different error\n\
6497handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6498a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6499as well as any other name registerd with codecs.register_error that is\n\
6500able to handle UnicodeDecodeErrors.");
6501
6502static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006503unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006504{
6505 char *encoding = NULL;
6506 char *errors = NULL;
6507 PyObject *v;
6508
6509 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6510 return NULL;
6511 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006512 if (v == NULL)
6513 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006514 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6515 PyErr_Format(PyExc_TypeError,
6516 "decoder did not return a string/unicode object "
6517 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006518 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006519 Py_DECREF(v);
6520 return NULL;
6521 }
6522 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006523
6524 onError:
6525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526}
6527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006528PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529"S.expandtabs([tabsize]) -> unicode\n\
6530\n\
6531Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006532If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
6534static PyObject*
6535unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6536{
6537 Py_UNICODE *e;
6538 Py_UNICODE *p;
6539 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006540 Py_UNICODE *qe;
6541 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 PyUnicodeObject *u;
6543 int tabsize = 8;
6544
6545 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6546 return NULL;
6547
Thomas Wouters7e474022000-07-16 12:04:32 +00006548 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006549 i = 0; /* chars up to and including most recent \n or \r */
6550 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6551 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 for (p = self->str; p < e; p++)
6553 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006554 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006555 incr = tabsize - (j % tabsize); /* cannot overflow */
6556 if (j > PY_SSIZE_T_MAX - incr)
6557 goto overflow1;
6558 j += incr;
6559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 }
6561 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006562 if (j > PY_SSIZE_T_MAX - 1)
6563 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 j++;
6565 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006566 if (i > PY_SSIZE_T_MAX - j)
6567 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006569 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 }
6571 }
6572
Guido van Rossum5bdff602008-03-11 21:18:06 +00006573 if (i > PY_SSIZE_T_MAX - j)
6574 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006575
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 /* Second pass: create output string and fill it */
6577 u = _PyUnicode_New(i + j);
6578 if (!u)
6579 return NULL;
6580
Guido van Rossum5bdff602008-03-11 21:18:06 +00006581 j = 0; /* same as in first pass */
6582 q = u->str; /* next output char */
6583 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585 for (p = self->str; p < e; p++)
6586 if (*p == '\t') {
6587 if (tabsize > 0) {
6588 i = tabsize - (j % tabsize);
6589 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006590 while (i--) {
6591 if (q >= qe)
6592 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 }
6596 }
6597 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006598 if (q >= qe)
6599 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006601 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 if (*p == '\n' || *p == '\r')
6603 j = 0;
6604 }
6605
6606 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006607
6608 overflow2:
6609 Py_DECREF(u);
6610 overflow1:
6611 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613}
6614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006615PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616"S.find(sub [,start [,end]]) -> int\n\
6617\n\
6618Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006619such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620arguments start and end are interpreted as in slice notation.\n\
6621\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006622Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
6624static PyObject *
6625unicode_find(PyUnicodeObject *self, PyObject *args)
6626{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006627 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006628 Py_ssize_t start;
6629 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006630 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631
Facundo Batista57d56692007-11-16 18:04:14 +00006632 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006635 result = stringlib_find_slice(
6636 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6637 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6638 start, end
6639 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640
6641 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006642
6643 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
6646static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006647unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
6649 if (index < 0 || index >= self->length) {
6650 PyErr_SetString(PyExc_IndexError, "string index out of range");
6651 return NULL;
6652 }
6653
6654 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6655}
6656
6657static long
6658unicode_hash(PyUnicodeObject *self)
6659{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006660 /* Since Unicode objects compare equal to their ASCII string
6661 counterparts, they should use the individual character values
6662 as basis for their hash value. This is needed to assure that
6663 strings and Unicode objects behave in the same way as
6664 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665
Martin v. Löwis18e16552006-02-15 17:27:45 +00006666 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006667 register Py_UNICODE *p;
6668 register long x;
6669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 if (self->hash != -1)
6671 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006672 len = PyUnicode_GET_SIZE(self);
6673 p = PyUnicode_AS_UNICODE(self);
6674 x = *p << 7;
6675 while (--len >= 0)
6676 x = (1000003*x) ^ *p++;
6677 x ^= PyUnicode_GET_SIZE(self);
6678 if (x == -1)
6679 x = -2;
6680 self->hash = x;
6681 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682}
6683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006684PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685"S.index(sub [,start [,end]]) -> int\n\
6686\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006687Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689static PyObject *
6690unicode_index(PyUnicodeObject *self, PyObject *args)
6691{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006692 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006693 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006694 Py_ssize_t start;
6695 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696
Facundo Batista57d56692007-11-16 18:04:14 +00006697 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006700 result = stringlib_find_slice(
6701 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6702 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6703 start, end
6704 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705
6706 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006707
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 if (result < 0) {
6709 PyErr_SetString(PyExc_ValueError, "substring not found");
6710 return NULL;
6711 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006712
Martin v. Löwis18e16552006-02-15 17:27:45 +00006713 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006717"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006719Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721
6722static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006723unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
6725 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6726 register const Py_UNICODE *e;
6727 int cased;
6728
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 /* Shortcut for single character strings */
6730 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006731 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006733 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006734 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006735 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 e = p + PyUnicode_GET_SIZE(self);
6738 cased = 0;
6739 for (; p < e; p++) {
6740 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 else if (!cased && Py_UNICODE_ISLOWER(ch))
6745 cased = 1;
6746 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006747 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748}
6749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006751"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006753Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006754at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
6756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006757unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
6759 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6760 register const Py_UNICODE *e;
6761 int cased;
6762
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 /* Shortcut for single character strings */
6764 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006765 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006767 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006768 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006769 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006770
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 e = p + PyUnicode_GET_SIZE(self);
6772 cased = 0;
6773 for (; p < e; p++) {
6774 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 else if (!cased && Py_UNICODE_ISUPPER(ch))
6779 cased = 1;
6780 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006781 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782}
6783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006785"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006787Return True if S is a titlecased string and there is at least one\n\
6788character in S, i.e. upper- and titlecase characters may only\n\
6789follow uncased characters and lowercase characters only cased ones.\n\
6790Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
6792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006793unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794{
6795 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6796 register const Py_UNICODE *e;
6797 int cased, previous_is_cased;
6798
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 /* Shortcut for single character strings */
6800 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006801 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6802 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006804 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006805 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006807
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 e = p + PyUnicode_GET_SIZE(self);
6809 cased = 0;
6810 previous_is_cased = 0;
6811 for (; p < e; p++) {
6812 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6815 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006816 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 previous_is_cased = 1;
6818 cased = 1;
6819 }
6820 else if (Py_UNICODE_ISLOWER(ch)) {
6821 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 previous_is_cased = 1;
6824 cased = 1;
6825 }
6826 else
6827 previous_is_cased = 0;
6828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830}
6831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006832PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006833"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006835Return True if all characters in S are whitespace\n\
6836and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
6838static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006839unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840{
6841 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6842 register const Py_UNICODE *e;
6843
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 /* Shortcut for single character strings */
6845 if (PyUnicode_GET_SIZE(self) == 1 &&
6846 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006849 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006850 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006851 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006852
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 e = p + PyUnicode_GET_SIZE(self);
6854 for (; p < e; p++) {
6855 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006856 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859}
6860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006863\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006864Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006865and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006866
6867static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006868unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006869{
6870 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6871 register const Py_UNICODE *e;
6872
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873 /* Shortcut for single character strings */
6874 if (PyUnicode_GET_SIZE(self) == 1 &&
6875 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006877
6878 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006879 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006881
6882 e = p + PyUnicode_GET_SIZE(self);
6883 for (; p < e; p++) {
6884 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006885 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006886 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888}
6889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006892\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006893Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006894and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895
6896static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006897unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898{
6899 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6900 register const Py_UNICODE *e;
6901
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006902 /* Shortcut for single character strings */
6903 if (PyUnicode_GET_SIZE(self) == 1 &&
6904 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006906
6907 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006908 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006910
6911 e = p + PyUnicode_GET_SIZE(self);
6912 for (; p < e; p++) {
6913 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006914 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006915 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006916 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006917}
6918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006919PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006923False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924
6925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006926unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
6928 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6929 register const Py_UNICODE *e;
6930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 /* Shortcut for single character strings */
6932 if (PyUnicode_GET_SIZE(self) == 1 &&
6933 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006936 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006937 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006939
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 e = p + PyUnicode_GET_SIZE(self);
6941 for (; p < e; p++) {
6942 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006943 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006945 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946}
6947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006948PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006951Return True if all characters in S are digits\n\
6952and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953
6954static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006955unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956{
6957 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6958 register const Py_UNICODE *e;
6959
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 /* Shortcut for single character strings */
6961 if (PyUnicode_GET_SIZE(self) == 1 &&
6962 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006965 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006966 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006968
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 e = p + PyUnicode_GET_SIZE(self);
6970 for (; p < e; p++) {
6971 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006972 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006974 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975}
6976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006977PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006981False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
6983static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006984unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985{
6986 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6987 register const Py_UNICODE *e;
6988
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 /* Shortcut for single character strings */
6990 if (PyUnicode_GET_SIZE(self) == 1 &&
6991 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006992 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006994 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006995 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006997
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 e = p + PyUnicode_GET_SIZE(self);
6999 for (; p < e; p++) {
7000 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007001 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007003 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004}
7005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007006PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007"S.join(sequence) -> unicode\n\
7008\n\
7009Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007010sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011
7012static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007013unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007015 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016}
7017
Martin v. Löwis18e16552006-02-15 17:27:45 +00007018static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019unicode_length(PyUnicodeObject *self)
7020{
7021 return self->length;
7022}
7023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007024PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007025"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026\n\
7027Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007028done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029
7030static PyObject *
7031unicode_ljust(PyUnicodeObject *self, PyObject *args)
7032{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007033 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007034 Py_UNICODE fillchar = ' ';
7035
Martin v. Löwis412fb672006-04-13 06:34:32 +00007036 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 return NULL;
7038
Tim Peters7a29bd52001-09-12 03:03:31 +00007039 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 Py_INCREF(self);
7041 return (PyObject*) self;
7042 }
7043
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007044 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045}
7046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007047PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048"S.lower() -> unicode\n\
7049\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007050Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051
7052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007053unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 return fixup(self, fixlower);
7056}
7057
/* Selectors for the three strip variants; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
7066
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067/* externally visible for str.strip(unicode) */
7068PyObject *
7069_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7070{
7071 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007072 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007073 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007074 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7075 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007077 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7078
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007079 i = 0;
7080 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007081 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7082 i++;
7083 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007084 }
7085
7086 j = len;
7087 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007088 do {
7089 j--;
7090 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7091 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007092 }
7093
7094 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007095 Py_INCREF(self);
7096 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007097 }
7098 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007099 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007100}
7101
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102
7103static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007104do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007107 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108
7109 i = 0;
7110 if (striptype != RIGHTSTRIP) {
7111 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7112 i++;
7113 }
7114 }
7115
7116 j = len;
7117 if (striptype != LEFTSTRIP) {
7118 do {
7119 j--;
7120 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7121 j++;
7122 }
7123
7124 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7125 Py_INCREF(self);
7126 return (PyObject*)self;
7127 }
7128 else
7129 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130}
7131
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007132
7133static PyObject *
7134do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7135{
7136 PyObject *sep = NULL;
7137
7138 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7139 return NULL;
7140
7141 if (sep != NULL && sep != Py_None) {
7142 if (PyUnicode_Check(sep))
7143 return _PyUnicode_XStrip(self, striptype, sep);
7144 else if (PyString_Check(sep)) {
7145 PyObject *res;
7146 sep = PyUnicode_FromObject(sep);
7147 if (sep==NULL)
7148 return NULL;
7149 res = _PyUnicode_XStrip(self, striptype, sep);
7150 Py_DECREF(sep);
7151 return res;
7152 }
7153 else {
7154 PyErr_Format(PyExc_TypeError,
7155 "%s arg must be None, unicode or str",
7156 STRIPNAME(striptype));
7157 return NULL;
7158 }
7159 }
7160
7161 return do_strip(self, striptype);
7162}
7163
7164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007165PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007166"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007167\n\
7168Return a copy of the string S with leading and trailing\n\
7169whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007170If chars is given and not None, remove characters in chars instead.\n\
7171If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007172
7173static PyObject *
7174unicode_strip(PyUnicodeObject *self, PyObject *args)
7175{
7176 if (PyTuple_GET_SIZE(args) == 0)
7177 return do_strip(self, BOTHSTRIP); /* Common case */
7178 else
7179 return do_argstrip(self, BOTHSTRIP, args);
7180}
7181
7182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007183PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007184"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007185\n\
7186Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007187If chars is given and not None, remove characters in chars instead.\n\
7188If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007189
7190static PyObject *
7191unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7192{
7193 if (PyTuple_GET_SIZE(args) == 0)
7194 return do_strip(self, LEFTSTRIP); /* Common case */
7195 else
7196 return do_argstrip(self, LEFTSTRIP, args);
7197}
7198
7199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007200PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007201"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007202\n\
7203Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007204If chars is given and not None, remove characters in chars instead.\n\
7205If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007206
7207static PyObject *
7208unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7209{
7210 if (PyTuple_GET_SIZE(args) == 0)
7211 return do_strip(self, RIGHTSTRIP); /* Common case */
7212 else
7213 return do_argstrip(self, RIGHTSTRIP, args);
7214}
7215
7216
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007218unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219{
7220 PyUnicodeObject *u;
7221 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007222 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007223 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224
7225 if (len < 0)
7226 len = 0;
7227
Tim Peters7a29bd52001-09-12 03:03:31 +00007228 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 /* no repeat, return original string */
7230 Py_INCREF(str);
7231 return (PyObject*) str;
7232 }
Tim Peters8f422462000-09-09 06:13:41 +00007233
7234 /* ensure # of chars needed doesn't overflow int and # of bytes
7235 * needed doesn't overflow size_t
7236 */
7237 nchars = len * str->length;
7238 if (len && nchars / len != str->length) {
7239 PyErr_SetString(PyExc_OverflowError,
7240 "repeated string is too long");
7241 return NULL;
7242 }
7243 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7244 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7245 PyErr_SetString(PyExc_OverflowError,
7246 "repeated string is too long");
7247 return NULL;
7248 }
7249 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 if (!u)
7251 return NULL;
7252
7253 p = u->str;
7254
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007255 if (str->length == 1 && len > 0) {
7256 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007257 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007258 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007259 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007260 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007261 done = str->length;
7262 }
7263 while (done < nchars) {
7264 int n = (done <= nchars-done) ? done : nchars-done;
7265 Py_UNICODE_COPY(p+done, p, n);
7266 done += n;
7267 }
7268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269
7270 return (PyObject*) u;
7271}
7272
7273PyObject *PyUnicode_Replace(PyObject *obj,
7274 PyObject *subobj,
7275 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007276 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277{
7278 PyObject *self;
7279 PyObject *str1;
7280 PyObject *str2;
7281 PyObject *result;
7282
7283 self = PyUnicode_FromObject(obj);
7284 if (self == NULL)
7285 return NULL;
7286 str1 = PyUnicode_FromObject(subobj);
7287 if (str1 == NULL) {
7288 Py_DECREF(self);
7289 return NULL;
7290 }
7291 str2 = PyUnicode_FromObject(replobj);
7292 if (str2 == NULL) {
7293 Py_DECREF(self);
7294 Py_DECREF(str1);
7295 return NULL;
7296 }
Tim Petersced69f82003-09-16 20:30:58 +00007297 result = replace((PyUnicodeObject *)self,
7298 (PyUnicodeObject *)str1,
7299 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300 maxcount);
7301 Py_DECREF(self);
7302 Py_DECREF(str1);
7303 Py_DECREF(str2);
7304 return result;
7305}
7306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007307PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308"S.replace (old, new[, maxsplit]) -> unicode\n\
7309\n\
7310Return a copy of S with all occurrences of substring\n\
7311old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007312given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313
7314static PyObject*
7315unicode_replace(PyUnicodeObject *self, PyObject *args)
7316{
7317 PyUnicodeObject *str1;
7318 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007319 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320 PyObject *result;
7321
Martin v. Löwis18e16552006-02-15 17:27:45 +00007322 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323 return NULL;
7324 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7325 if (str1 == NULL)
7326 return NULL;
7327 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007328 if (str2 == NULL) {
7329 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332
7333 result = replace(self, str1, str2, maxcount);
7334
7335 Py_DECREF(str1);
7336 Py_DECREF(str2);
7337 return result;
7338}
7339
7340static
7341PyObject *unicode_repr(PyObject *unicode)
7342{
7343 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7344 PyUnicode_GET_SIZE(unicode),
7345 1);
7346}
7347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007348PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349"S.rfind(sub [,start [,end]]) -> int\n\
7350\n\
7351Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007352such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353arguments start and end are interpreted as in slice notation.\n\
7354\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007355Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356
7357static PyObject *
7358unicode_rfind(PyUnicodeObject *self, PyObject *args)
7359{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007360 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007361 Py_ssize_t start;
7362 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007363 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
Facundo Batista57d56692007-11-16 18:04:14 +00007365 if (!_ParseTupleFinds(args, &substring, &start, &end))
7366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007368 result = stringlib_rfind_slice(
7369 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7370 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7371 start, end
7372 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
7374 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007375
7376 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377}
7378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007379PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380"S.rindex(sub [,start [,end]]) -> int\n\
7381\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007382Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383
7384static PyObject *
7385unicode_rindex(PyUnicodeObject *self, PyObject *args)
7386{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007387 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007388 Py_ssize_t start;
7389 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007390 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391
Facundo Batista57d56692007-11-16 18:04:14 +00007392 if (!_ParseTupleFinds(args, &substring, &start, &end))
7393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007395 result = stringlib_rfind_slice(
7396 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7397 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7398 start, end
7399 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
7401 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007402
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 if (result < 0) {
7404 PyErr_SetString(PyExc_ValueError, "substring not found");
7405 return NULL;
7406 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007407 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408}
7409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007410PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007411"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412\n\
7413Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007414done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415
7416static PyObject *
7417unicode_rjust(PyUnicodeObject *self, PyObject *args)
7418{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007419 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007420 Py_UNICODE fillchar = ' ';
7421
Martin v. Löwis412fb672006-04-13 06:34:32 +00007422 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423 return NULL;
7424
Tim Peters7a29bd52001-09-12 03:03:31 +00007425 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 Py_INCREF(self);
7427 return (PyObject*) self;
7428 }
7429
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007430 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431}
7432
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007434unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435{
7436 /* standard clamping */
7437 if (start < 0)
7438 start = 0;
7439 if (end < 0)
7440 end = 0;
7441 if (end > self->length)
7442 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007443 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 /* full slice, return original string */
7445 Py_INCREF(self);
7446 return (PyObject*) self;
7447 }
7448 if (start > end)
7449 start = end;
7450 /* copy slice */
7451 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7452 end - start);
7453}
7454
7455PyObject *PyUnicode_Split(PyObject *s,
7456 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007457 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458{
7459 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007460
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 s = PyUnicode_FromObject(s);
7462 if (s == NULL)
7463 return NULL;
7464 if (sep != NULL) {
7465 sep = PyUnicode_FromObject(sep);
7466 if (sep == NULL) {
7467 Py_DECREF(s);
7468 return NULL;
7469 }
7470 }
7471
7472 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7473
7474 Py_DECREF(s);
7475 Py_XDECREF(sep);
7476 return result;
7477}
7478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007479PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480"S.split([sep [,maxsplit]]) -> list of strings\n\
7481\n\
7482Return a list of the words in S, using sep as the\n\
7483delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007484splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007485any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486
7487static PyObject*
7488unicode_split(PyUnicodeObject *self, PyObject *args)
7489{
7490 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007491 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492
Martin v. Löwis18e16552006-02-15 17:27:45 +00007493 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494 return NULL;
7495
7496 if (substring == Py_None)
7497 return split(self, NULL, maxcount);
7498 else if (PyUnicode_Check(substring))
7499 return split(self, (PyUnicodeObject *)substring, maxcount);
7500 else
7501 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7502}
7503
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007504PyObject *
7505PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7506{
7507 PyObject* str_obj;
7508 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007509 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007510
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007511 str_obj = PyUnicode_FromObject(str_in);
7512 if (!str_obj)
7513 return NULL;
7514 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007515 if (!sep_obj) {
7516 Py_DECREF(str_obj);
7517 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007518 }
7519
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007520 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007521 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7522 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7523 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007524
Fredrik Lundhb9479482006-05-26 17:22:38 +00007525 Py_DECREF(sep_obj);
7526 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007527
7528 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007529}
7530
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007531
7532PyObject *
7533PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7534{
7535 PyObject* str_obj;
7536 PyObject* sep_obj;
7537 PyObject* out;
7538
7539 str_obj = PyUnicode_FromObject(str_in);
7540 if (!str_obj)
7541 return NULL;
7542 sep_obj = PyUnicode_FromObject(sep_in);
7543 if (!sep_obj) {
7544 Py_DECREF(str_obj);
7545 return NULL;
7546 }
7547
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007548 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007549 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7550 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7551 );
7552
7553 Py_DECREF(sep_obj);
7554 Py_DECREF(str_obj);
7555
7556 return out;
7557}
7558
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007559PyDoc_STRVAR(partition__doc__,
7560"S.partition(sep) -> (head, sep, tail)\n\
7561\n\
7562Searches for the separator sep in S, and returns the part before it,\n\
7563the separator itself, and the part after it. If the separator is not\n\
7564found, returns S and two empty strings.");
7565
7566static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007567unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007568{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007569 return PyUnicode_Partition((PyObject *)self, separator);
7570}
7571
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007572PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007573"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007574\n\
7575Searches for the separator sep in S, starting at the end of S, and returns\n\
7576the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007577separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007578
7579static PyObject*
7580unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7581{
7582 return PyUnicode_RPartition((PyObject *)self, separator);
7583}
7584
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007585PyObject *PyUnicode_RSplit(PyObject *s,
7586 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007587 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007588{
7589 PyObject *result;
7590
7591 s = PyUnicode_FromObject(s);
7592 if (s == NULL)
7593 return NULL;
7594 if (sep != NULL) {
7595 sep = PyUnicode_FromObject(sep);
7596 if (sep == NULL) {
7597 Py_DECREF(s);
7598 return NULL;
7599 }
7600 }
7601
7602 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7603
7604 Py_DECREF(s);
7605 Py_XDECREF(sep);
7606 return result;
7607}
7608
7609PyDoc_STRVAR(rsplit__doc__,
7610"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7611\n\
7612Return a list of the words in S, using sep as the\n\
7613delimiter string, starting at the end of the string and\n\
7614working to the front. If maxsplit is given, at most maxsplit\n\
7615splits are done. If sep is not specified, any whitespace string\n\
7616is a separator.");
7617
7618static PyObject*
7619unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7620{
7621 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007622 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007623
Martin v. Löwis18e16552006-02-15 17:27:45 +00007624 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007625 return NULL;
7626
7627 if (substring == Py_None)
7628 return rsplit(self, NULL, maxcount);
7629 else if (PyUnicode_Check(substring))
7630 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7631 else
7632 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7633}
7634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007635PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007636"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637\n\
7638Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007639Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007640is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
7642static PyObject*
7643unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7644{
Guido van Rossum86662912000-04-11 15:38:46 +00007645 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646
Guido van Rossum86662912000-04-11 15:38:46 +00007647 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 return NULL;
7649
Guido van Rossum86662912000-04-11 15:38:46 +00007650 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651}
7652
7653static
7654PyObject *unicode_str(PyUnicodeObject *self)
7655{
Fred Drakee4315f52000-05-09 19:53:39 +00007656 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657}
7658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007659PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660"S.swapcase() -> unicode\n\
7661\n\
7662Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007663and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664
7665static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007666unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668 return fixup(self, fixswapcase);
7669}
7670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007671PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672"S.translate(table) -> unicode\n\
7673\n\
7674Return a copy of the string S, where all characters have been mapped\n\
7675through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007676Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7677Unmapped characters are left untouched. Characters mapped to None\n\
7678are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679
7680static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007681unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682{
Tim Petersced69f82003-09-16 20:30:58 +00007683 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007685 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 "ignore");
7687}
7688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007689PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690"S.upper() -> unicode\n\
7691\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007692Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
7694static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007695unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 return fixup(self, fixupper);
7698}
7699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007700PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701"S.zfill(width) -> unicode\n\
7702\n\
7703Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705
7706static PyObject *
7707unicode_zfill(PyUnicodeObject *self, PyObject *args)
7708{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007709 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 PyUnicodeObject *u;
7711
Martin v. Löwis18e16552006-02-15 17:27:45 +00007712 Py_ssize_t width;
7713 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714 return NULL;
7715
7716 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007717 if (PyUnicode_CheckExact(self)) {
7718 Py_INCREF(self);
7719 return (PyObject*) self;
7720 }
7721 else
7722 return PyUnicode_FromUnicode(
7723 PyUnicode_AS_UNICODE(self),
7724 PyUnicode_GET_SIZE(self)
7725 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 }
7727
7728 fill = width - self->length;
7729
7730 u = pad(self, fill, 0, '0');
7731
Walter Dörwald068325e2002-04-15 13:36:47 +00007732 if (u == NULL)
7733 return NULL;
7734
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 if (u->str[fill] == '+' || u->str[fill] == '-') {
7736 /* move sign to beginning of string */
7737 u->str[0] = u->str[fill];
7738 u->str[fill] = '0';
7739 }
7740
7741 return (PyObject*) u;
7742}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
#if 0
/* Compiled out.  Returns the value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    PyObject *count;

    count = PyInt_FromLong(numfree);
    return count;
}
#endif
7751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007752PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007753"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007755Return True if S starts with the specified prefix, False otherwise.\n\
7756With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007757With optional end, stop comparing S at that position.\n\
7758prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759
7760static PyObject *
7761unicode_startswith(PyUnicodeObject *self,
7762 PyObject *args)
7763{
Georg Brandl24250812006-06-09 18:45:48 +00007764 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007766 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007767 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007768 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769
Georg Brandl24250812006-06-09 18:45:48 +00007770 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007771 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007773 if (PyTuple_Check(subobj)) {
7774 Py_ssize_t i;
7775 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7776 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7777 PyTuple_GET_ITEM(subobj, i));
7778 if (substring == NULL)
7779 return NULL;
7780 result = tailmatch(self, substring, start, end, -1);
7781 Py_DECREF(substring);
7782 if (result) {
7783 Py_RETURN_TRUE;
7784 }
7785 }
7786 /* nothing matched */
7787 Py_RETURN_FALSE;
7788 }
7789 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007791 return NULL;
7792 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007794 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795}
7796
7797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007798PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007799"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007801Return True if S ends with the specified suffix, False otherwise.\n\
7802With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007803With optional end, stop comparing S at that position.\n\
7804suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805
7806static PyObject *
7807unicode_endswith(PyUnicodeObject *self,
7808 PyObject *args)
7809{
Georg Brandl24250812006-06-09 18:45:48 +00007810 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007812 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007813 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007814 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815
Georg Brandl24250812006-06-09 18:45:48 +00007816 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7817 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007819 if (PyTuple_Check(subobj)) {
7820 Py_ssize_t i;
7821 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7822 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7823 PyTuple_GET_ITEM(subobj, i));
7824 if (substring == NULL)
7825 return NULL;
7826 result = tailmatch(self, substring, start, end, +1);
7827 Py_DECREF(substring);
7828 if (result) {
7829 Py_RETURN_TRUE;
7830 }
7831 }
7832 Py_RETURN_FALSE;
7833 }
7834 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837
Georg Brandl24250812006-06-09 18:45:48 +00007838 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007840 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841}
7842
7843
Eric Smitha9f7d622008-02-17 19:46:49 +00007844/* Implements do_string_format, which is unicode because of stringlib */
7845#include "stringlib/string_format.h"
7846
7847PyDoc_STRVAR(format__doc__,
7848"S.format(*args, **kwargs) -> unicode\n\
7849\n\
7850");
7851
7852PyDoc_STRVAR(p_format__doc__,
7853"S.__format__(format_spec) -> unicode\n\
7854\n\
7855");
7856
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007857
7858static PyObject *
7859unicode_getnewargs(PyUnicodeObject *v)
7860{
7861 return Py_BuildValue("(u#)", v->str, v->length);
7862}
7863
7864
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865static PyMethodDef unicode_methods[] = {
7866
7867 /* Order is according to common usage: often used methods should
7868 appear first, since lookup is done sequentially. */
7869
Georg Brandlecdc0a92006-03-30 12:19:07 +00007870 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007871 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7872 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007873 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007874 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7875 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7876 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7877 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7878 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7879 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7880 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007881 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007882 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7883 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7884 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007885 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007886 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007887/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7888 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7889 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7890 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007891 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007892 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007893 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007894 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007895 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7896 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7897 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7898 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7899 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7900 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7901 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7902 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7903 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7904 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7905 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7906 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7907 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7908 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007910 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7911 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7912 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7913 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00007914#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007915 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916#endif
7917
7918#if 0
7919 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007920 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921#endif
7922
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007923 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924 {NULL, NULL}
7925};
7926
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007927static PyObject *
7928unicode_mod(PyObject *v, PyObject *w)
7929{
7930 if (!PyUnicode_Check(v)) {
7931 Py_INCREF(Py_NotImplemented);
7932 return Py_NotImplemented;
7933 }
7934 return PyUnicode_Format(v, w);
7935}
7936
7937static PyNumberMethods unicode_as_number = {
7938 0, /*nb_add*/
7939 0, /*nb_subtract*/
7940 0, /*nb_multiply*/
7941 0, /*nb_divide*/
7942 unicode_mod, /*nb_remainder*/
7943};
7944
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007946 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007947 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007948 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7949 (ssizeargfunc) unicode_getitem, /* sq_item */
7950 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 0, /* sq_ass_item */
7952 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007953 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954};
7955
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007956static PyObject*
7957unicode_subscript(PyUnicodeObject* self, PyObject* item)
7958{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007959 if (PyIndex_Check(item)) {
7960 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007961 if (i == -1 && PyErr_Occurred())
7962 return NULL;
7963 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007964 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007965 return unicode_getitem(self, i);
7966 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007967 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007968 Py_UNICODE* source_buf;
7969 Py_UNICODE* result_buf;
7970 PyObject* result;
7971
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007972 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007973 &start, &stop, &step, &slicelength) < 0) {
7974 return NULL;
7975 }
7976
7977 if (slicelength <= 0) {
7978 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007979 } else if (start == 0 && step == 1 && slicelength == self->length &&
7980 PyUnicode_CheckExact(self)) {
7981 Py_INCREF(self);
7982 return (PyObject *)self;
7983 } else if (step == 1) {
7984 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007985 } else {
7986 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007987 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7988 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007989
7990 if (result_buf == NULL)
7991 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007992
7993 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7994 result_buf[i] = source_buf[cur];
7995 }
Tim Petersced69f82003-09-16 20:30:58 +00007996
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007997 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007998 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007999 return result;
8000 }
8001 } else {
8002 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8003 return NULL;
8004 }
8005}
8006
8007static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008008 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008009 (binaryfunc)unicode_subscript, /* mp_subscript */
8010 (objobjargproc)0, /* mp_ass_subscript */
8011};
8012
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008015 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 const void **ptr)
8017{
8018 if (index != 0) {
8019 PyErr_SetString(PyExc_SystemError,
8020 "accessing non-existent unicode segment");
8021 return -1;
8022 }
8023 *ptr = (void *) self->str;
8024 return PyUnicode_GET_DATA_SIZE(self);
8025}
8026
Martin v. Löwis18e16552006-02-15 17:27:45 +00008027static Py_ssize_t
8028unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 const void **ptr)
8030{
8031 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008032 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 return -1;
8034}
8035
8036static int
8037unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008038 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039{
8040 if (lenp)
8041 *lenp = PyUnicode_GET_DATA_SIZE(self);
8042 return 1;
8043}
8044
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008045static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008047 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 const void **ptr)
8049{
8050 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008051
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 if (index != 0) {
8053 PyErr_SetString(PyExc_SystemError,
8054 "accessing non-existent unicode segment");
8055 return -1;
8056 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008057 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 if (str == NULL)
8059 return -1;
8060 *ptr = (void *) PyString_AS_STRING(str);
8061 return PyString_GET_SIZE(str);
8062}
8063
8064/* Helpers for PyUnicode_Format() */
8065
8066static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008067getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008069 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 if (argidx < arglen) {
8071 (*p_argidx)++;
8072 if (arglen < 0)
8073 return args;
8074 else
8075 return PyTuple_GetItem(args, argidx);
8076 }
8077 PyErr_SetString(PyExc_TypeError,
8078 "not enough arguments for format string");
8079 return NULL;
8080}
8081
/* Bit flags for the %-format conversion flag characters parsed in
   PyUnicode_Format(). */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
8087
Martin v. Löwis18e16552006-02-15 17:27:45 +00008088static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008089strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008091 register Py_ssize_t i;
8092 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093 for (i = len - 1; i >= 0; i--)
8094 buffer[i] = (Py_UNICODE) charbuffer[i];
8095
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096 return len;
8097}
8098
Neal Norwitzfc76d632006-01-10 06:03:13 +00008099static int
8100doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8101{
Tim Peters15231542006-02-16 01:08:01 +00008102 Py_ssize_t result;
8103
Neal Norwitzfc76d632006-01-10 06:03:13 +00008104 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008105 result = strtounicode(buffer, (char *)buffer);
8106 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008107}
8108
8109static int
8110longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8111{
Tim Peters15231542006-02-16 01:08:01 +00008112 Py_ssize_t result;
8113
Neal Norwitzfc76d632006-01-10 06:03:13 +00008114 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008115 result = strtounicode(buffer, (char *)buffer);
8116 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008117}
8118
Guido van Rossum078151d2002-08-11 04:24:12 +00008119/* XXX To save some code duplication, formatfloat/long/int could have been
8120 shared with stringobject.c, converting from 8-bit to Unicode after the
8121 formatting is done. */
8122
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123static int
8124formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008125 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 int flags,
8127 int prec,
8128 int type,
8129 PyObject *v)
8130{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008131 /* fmt = '%#.' + `prec` + `type`
8132 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 char fmt[20];
8134 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008135
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 x = PyFloat_AsDouble(v);
8137 if (x == -1.0 && PyErr_Occurred())
8138 return -1;
8139 if (prec < 0)
8140 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8142 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008143 /* Worst case length calc to ensure no buffer overrun:
8144
8145 'g' formats:
8146 fmt = %#.<prec>g
8147 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8148 for any double rep.)
8149 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8150
8151 'f' formats:
8152 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8153 len = 1 + 50 + 1 + prec = 52 + prec
8154
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008155 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008156 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008157
8158 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008159 if (((type == 'g' || type == 'G') &&
8160 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008161 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008162 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008163 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008164 return -1;
8165 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008166 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8167 (flags&F_ALT) ? "#" : "",
8168 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008169 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170}
8171
Tim Peters38fd5b62000-09-21 05:43:11 +00008172static PyObject*
8173formatlong(PyObject *val, int flags, int prec, int type)
8174{
8175 char *buf;
8176 int i, len;
8177 PyObject *str; /* temporary string object. */
8178 PyUnicodeObject *result;
8179
8180 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8181 if (!str)
8182 return NULL;
8183 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008184 if (!result) {
8185 Py_DECREF(str);
8186 return NULL;
8187 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008188 for (i = 0; i < len; i++)
8189 result->str[i] = buf[i];
8190 result->str[len] = 0;
8191 Py_DECREF(str);
8192 return (PyObject*)result;
8193}
8194
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195static int
8196formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008197 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 int flags,
8199 int prec,
8200 int type,
8201 PyObject *v)
8202{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008203 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008204 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8205 * + 1 + 1
8206 * = 24
8207 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008208 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008209 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 long x;
8211
8212 x = PyInt_AsLong(v);
8213 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008214 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008215 if (x < 0 && type == 'u') {
8216 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008217 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008218 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8219 sign = "-";
8220 else
8221 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008223 prec = 1;
8224
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008225 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8226 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008227 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008228 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008229 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008230 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008231 return -1;
8232 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008233
8234 if ((flags & F_ALT) &&
8235 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008236 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008237 * of issues that cause pain:
8238 * - when 0 is being converted, the C standard leaves off
8239 * the '0x' or '0X', which is inconsistent with other
8240 * %#x/%#X conversions and inconsistent with Python's
8241 * hex() function
8242 * - there are platforms that violate the standard and
8243 * convert 0 with the '0x' or '0X'
8244 * (Metrowerks, Compaq Tru64)
8245 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008246 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008247 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008248 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008249 * We can achieve the desired consistency by inserting our
8250 * own '0x' or '0X' prefix, and substituting %x/%X in place
8251 * of %#x/%#X.
8252 *
8253 * Note that this is the same approach as used in
8254 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008255 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008256 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8257 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008258 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008259 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008260 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8261 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008262 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008263 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008264 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008265 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008266 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008267 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268}
8269
8270static int
8271formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008272 size_t buflen,
8273 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008275 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008276 if (PyUnicode_Check(v)) {
8277 if (PyUnicode_GET_SIZE(v) != 1)
8278 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008282 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008283 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008284 goto onError;
8285 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287
8288 else {
8289 /* Integer input truncated to a character */
8290 long x;
8291 x = PyInt_AsLong(v);
8292 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008293 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008294#ifdef Py_UNICODE_WIDE
8295 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008296 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008297 "%c arg not in range(0x110000) "
8298 "(wide Python build)");
8299 return -1;
8300 }
8301#else
8302 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008303 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008304 "%c arg not in range(0x10000) "
8305 "(narrow Python build)");
8306 return -1;
8307 }
8308#endif
8309 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310 }
8311 buf[1] = '\0';
8312 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008313
8314 onError:
8315 PyErr_SetString(PyExc_TypeError,
8316 "%c requires int or char");
8317 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318}
8319
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008320/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8321
8322 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8323 chars are formatted. XXX This is a magic number. Each formatting
8324 routine does bounds checking to ensure no overflow, but a better
8325 solution may be to malloc a buffer of appropriate size for each
8326 format. For now, the current solution is sufficient.
8327*/
/* Fully parenthesized so the cast stays attached to the constant wherever
   the macro is expanded (for example as the operand of sizeof). */
#define FORMATBUFLEN ((size_t)120)
8329
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330PyObject *PyUnicode_Format(PyObject *format,
8331 PyObject *args)
8332{
8333 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008334 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335 int args_owned = 0;
8336 PyUnicodeObject *result = NULL;
8337 PyObject *dict = NULL;
8338 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008339
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 if (format == NULL || args == NULL) {
8341 PyErr_BadInternalCall();
8342 return NULL;
8343 }
8344 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008345 if (uformat == NULL)
8346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 fmt = PyUnicode_AS_UNICODE(uformat);
8348 fmtcnt = PyUnicode_GET_SIZE(uformat);
8349
8350 reslen = rescnt = fmtcnt + 100;
8351 result = _PyUnicode_New(reslen);
8352 if (result == NULL)
8353 goto onError;
8354 res = PyUnicode_AS_UNICODE(result);
8355
8356 if (PyTuple_Check(args)) {
8357 arglen = PyTuple_Size(args);
8358 argidx = 0;
8359 }
8360 else {
8361 arglen = -1;
8362 argidx = -2;
8363 }
Christian Heimese93237d2007-12-19 02:37:44 +00008364 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008365 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 dict = args;
8367
8368 while (--fmtcnt >= 0) {
8369 if (*fmt != '%') {
8370 if (--rescnt < 0) {
8371 rescnt = fmtcnt + 100;
8372 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008373 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8376 --rescnt;
8377 }
8378 *res++ = *fmt++;
8379 }
8380 else {
8381 /* Got a format specifier */
8382 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008383 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 Py_UNICODE c = '\0';
8386 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008387 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 PyObject *v = NULL;
8389 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008390 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008392 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008393 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394
8395 fmt++;
8396 if (*fmt == '(') {
8397 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 PyObject *key;
8400 int pcount = 1;
8401
8402 if (dict == NULL) {
8403 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008404 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 goto onError;
8406 }
8407 ++fmt;
8408 --fmtcnt;
8409 keystart = fmt;
8410 /* Skip over balanced parentheses */
8411 while (pcount > 0 && --fmtcnt >= 0) {
8412 if (*fmt == ')')
8413 --pcount;
8414 else if (*fmt == '(')
8415 ++pcount;
8416 fmt++;
8417 }
8418 keylen = fmt - keystart - 1;
8419 if (fmtcnt < 0 || pcount > 0) {
8420 PyErr_SetString(PyExc_ValueError,
8421 "incomplete format key");
8422 goto onError;
8423 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008424#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008425 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 then looked up since Python uses strings to hold
8427 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008428 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 key = PyUnicode_EncodeUTF8(keystart,
8430 keylen,
8431 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008432#else
8433 key = PyUnicode_FromUnicode(keystart, keylen);
8434#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 if (key == NULL)
8436 goto onError;
8437 if (args_owned) {
8438 Py_DECREF(args);
8439 args_owned = 0;
8440 }
8441 args = PyObject_GetItem(dict, key);
8442 Py_DECREF(key);
8443 if (args == NULL) {
8444 goto onError;
8445 }
8446 args_owned = 1;
8447 arglen = -1;
8448 argidx = -2;
8449 }
8450 while (--fmtcnt >= 0) {
8451 switch (c = *fmt++) {
8452 case '-': flags |= F_LJUST; continue;
8453 case '+': flags |= F_SIGN; continue;
8454 case ' ': flags |= F_BLANK; continue;
8455 case '#': flags |= F_ALT; continue;
8456 case '0': flags |= F_ZERO; continue;
8457 }
8458 break;
8459 }
8460 if (c == '*') {
8461 v = getnextarg(args, arglen, &argidx);
8462 if (v == NULL)
8463 goto onError;
8464 if (!PyInt_Check(v)) {
8465 PyErr_SetString(PyExc_TypeError,
8466 "* wants int");
8467 goto onError;
8468 }
8469 width = PyInt_AsLong(v);
8470 if (width < 0) {
8471 flags |= F_LJUST;
8472 width = -width;
8473 }
8474 if (--fmtcnt >= 0)
8475 c = *fmt++;
8476 }
8477 else if (c >= '0' && c <= '9') {
8478 width = c - '0';
8479 while (--fmtcnt >= 0) {
8480 c = *fmt++;
8481 if (c < '0' || c > '9')
8482 break;
8483 if ((width*10) / 10 != width) {
8484 PyErr_SetString(PyExc_ValueError,
8485 "width too big");
8486 goto onError;
8487 }
8488 width = width*10 + (c - '0');
8489 }
8490 }
8491 if (c == '.') {
8492 prec = 0;
8493 if (--fmtcnt >= 0)
8494 c = *fmt++;
8495 if (c == '*') {
8496 v = getnextarg(args, arglen, &argidx);
8497 if (v == NULL)
8498 goto onError;
8499 if (!PyInt_Check(v)) {
8500 PyErr_SetString(PyExc_TypeError,
8501 "* wants int");
8502 goto onError;
8503 }
8504 prec = PyInt_AsLong(v);
8505 if (prec < 0)
8506 prec = 0;
8507 if (--fmtcnt >= 0)
8508 c = *fmt++;
8509 }
8510 else if (c >= '0' && c <= '9') {
8511 prec = c - '0';
8512 while (--fmtcnt >= 0) {
8513 c = Py_CHARMASK(*fmt++);
8514 if (c < '0' || c > '9')
8515 break;
8516 if ((prec*10) / 10 != prec) {
8517 PyErr_SetString(PyExc_ValueError,
8518 "prec too big");
8519 goto onError;
8520 }
8521 prec = prec*10 + (c - '0');
8522 }
8523 }
8524 } /* prec */
8525 if (fmtcnt >= 0) {
8526 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527 if (--fmtcnt >= 0)
8528 c = *fmt++;
8529 }
8530 }
8531 if (fmtcnt < 0) {
8532 PyErr_SetString(PyExc_ValueError,
8533 "incomplete format");
8534 goto onError;
8535 }
8536 if (c != '%') {
8537 v = getnextarg(args, arglen, &argidx);
8538 if (v == NULL)
8539 goto onError;
8540 }
8541 sign = 0;
8542 fill = ' ';
8543 switch (c) {
8544
8545 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008546 pbuf = formatbuf;
8547 /* presume that buffer length is at least 1 */
8548 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 len = 1;
8550 break;
8551
8552 case 's':
8553 case 'r':
8554 if (PyUnicode_Check(v) && c == 's') {
8555 temp = v;
8556 Py_INCREF(temp);
8557 }
8558 else {
8559 PyObject *unicode;
8560 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008561 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 else
8563 temp = PyObject_Repr(v);
8564 if (temp == NULL)
8565 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008566 if (PyUnicode_Check(temp))
8567 /* nothing to do */;
8568 else if (PyString_Check(temp)) {
8569 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008570 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008572 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008574 Py_DECREF(temp);
8575 temp = unicode;
8576 if (temp == NULL)
8577 goto onError;
8578 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008579 else {
8580 Py_DECREF(temp);
8581 PyErr_SetString(PyExc_TypeError,
8582 "%s argument has non-string str()");
8583 goto onError;
8584 }
8585 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008586 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 len = PyUnicode_GET_SIZE(temp);
8588 if (prec >= 0 && len > prec)
8589 len = prec;
8590 break;
8591
8592 case 'i':
8593 case 'd':
8594 case 'u':
8595 case 'o':
8596 case 'x':
8597 case 'X':
8598 if (c == 'i')
8599 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008600 isnumok = 0;
8601 if (PyNumber_Check(v)) {
8602 PyObject *iobj=NULL;
8603
8604 if (PyInt_Check(v) || (PyLong_Check(v))) {
8605 iobj = v;
8606 Py_INCREF(iobj);
8607 }
8608 else {
8609 iobj = PyNumber_Int(v);
8610 if (iobj==NULL) iobj = PyNumber_Long(v);
8611 }
8612 if (iobj!=NULL) {
8613 if (PyInt_Check(iobj)) {
8614 isnumok = 1;
8615 pbuf = formatbuf;
8616 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8617 flags, prec, c, iobj);
8618 Py_DECREF(iobj);
8619 if (len < 0)
8620 goto onError;
8621 sign = 1;
8622 }
8623 else if (PyLong_Check(iobj)) {
8624 isnumok = 1;
8625 temp = formatlong(iobj, flags, prec, c);
8626 Py_DECREF(iobj);
8627 if (!temp)
8628 goto onError;
8629 pbuf = PyUnicode_AS_UNICODE(temp);
8630 len = PyUnicode_GET_SIZE(temp);
8631 sign = 1;
8632 }
8633 else {
8634 Py_DECREF(iobj);
8635 }
8636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008638 if (!isnumok) {
8639 PyErr_Format(PyExc_TypeError,
8640 "%%%c format: a number is required, "
8641 "not %.200s", c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008642 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008643 }
8644 if (flags & F_ZERO)
8645 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 break;
8647
8648 case 'e':
8649 case 'E':
8650 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008651 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 case 'g':
8653 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008654 if (c == 'F')
8655 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008656 pbuf = formatbuf;
8657 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8658 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 if (len < 0)
8660 goto onError;
8661 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008662 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 fill = '0';
8664 break;
8665
8666 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008667 pbuf = formatbuf;
8668 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 if (len < 0)
8670 goto onError;
8671 break;
8672
8673 default:
8674 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008675 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008676 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008677 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008678 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008679 (Py_ssize_t)(fmt - 1 -
8680 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 goto onError;
8682 }
8683 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008684 if (*pbuf == '-' || *pbuf == '+') {
8685 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 len--;
8687 }
8688 else if (flags & F_SIGN)
8689 sign = '+';
8690 else if (flags & F_BLANK)
8691 sign = ' ';
8692 else
8693 sign = 0;
8694 }
8695 if (width < len)
8696 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008697 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 reslen -= rescnt;
8699 rescnt = width + fmtcnt + 100;
8700 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008701 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008702 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008703 PyErr_NoMemory();
8704 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008705 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008706 if (_PyUnicode_Resize(&result, reslen) < 0) {
8707 Py_XDECREF(temp);
8708 goto onError;
8709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 res = PyUnicode_AS_UNICODE(result)
8711 + reslen - rescnt;
8712 }
8713 if (sign) {
8714 if (fill != ' ')
8715 *res++ = sign;
8716 rescnt--;
8717 if (width > len)
8718 width--;
8719 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008720 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8721 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008722 assert(pbuf[1] == c);
8723 if (fill != ' ') {
8724 *res++ = *pbuf++;
8725 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008726 }
Tim Petersfff53252001-04-12 18:38:48 +00008727 rescnt -= 2;
8728 width -= 2;
8729 if (width < 0)
8730 width = 0;
8731 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 if (width > len && !(flags & F_LJUST)) {
8734 do {
8735 --rescnt;
8736 *res++ = fill;
8737 } while (--width > len);
8738 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008739 if (fill == ' ') {
8740 if (sign)
8741 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008742 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008743 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008744 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008745 *res++ = *pbuf++;
8746 *res++ = *pbuf++;
8747 }
8748 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008749 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 res += len;
8751 rescnt -= len;
8752 while (--width >= len) {
8753 --rescnt;
8754 *res++ = ' ';
8755 }
8756 if (dict && (argidx < arglen) && c != '%') {
8757 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008758 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008759 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 goto onError;
8761 }
8762 Py_XDECREF(temp);
8763 } /* '%' */
8764 } /* until end */
8765 if (argidx < arglen && !dict) {
8766 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008767 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 goto onError;
8769 }
8770
Thomas Woutersa96affe2006-03-12 00:29:36 +00008771 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8772 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 if (args_owned) {
8774 Py_DECREF(args);
8775 }
8776 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 return (PyObject *)result;
8778
8779 onError:
8780 Py_XDECREF(result);
8781 Py_DECREF(uformat);
8782 if (args_owned) {
8783 Py_DECREF(args);
8784 }
8785 return NULL;
8786}
8787
8788static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008789 (readbufferproc) unicode_buffer_getreadbuf,
8790 (writebufferproc) unicode_buffer_getwritebuf,
8791 (segcountproc) unicode_buffer_getsegcount,
8792 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793};
8794
Jeremy Hylton938ace62002-07-17 16:30:39 +00008795static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008796unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8797
Tim Peters6d6c1a32001-08-02 04:15:00 +00008798static PyObject *
8799unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8800{
8801 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008802 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008803 char *encoding = NULL;
8804 char *errors = NULL;
8805
Guido van Rossume023fe02001-08-30 03:12:59 +00008806 if (type != &PyUnicode_Type)
8807 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008808 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8809 kwlist, &x, &encoding, &errors))
8810 return NULL;
8811 if (x == NULL)
8812 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008813 if (encoding == NULL && errors == NULL)
8814 return PyObject_Unicode(x);
8815 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008816 return PyUnicode_FromEncodedObject(x, encoding, errors);
8817}
8818
Guido van Rossume023fe02001-08-30 03:12:59 +00008819static PyObject *
8820unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8821{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008822 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008823 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008824
8825 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8826 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8827 if (tmp == NULL)
8828 return NULL;
8829 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008830 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008831 if (pnew == NULL) {
8832 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008833 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008834 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008835 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008836 if (pnew->str == NULL) {
8837 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008838 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008839 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008840 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008841 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008842 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8843 pnew->length = n;
8844 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008845 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008846 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008847}
8848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008849PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008850"unicode(string [, encoding[, errors]]) -> object\n\
8851\n\
8852Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008853encoding defaults to the current default string encoding.\n\
8854errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008855
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008857 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 "unicode", /* tp_name */
8859 sizeof(PyUnicodeObject), /* tp_size */
8860 0, /* tp_itemsize */
8861 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008862 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008864 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008866 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008867 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008868 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008870 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 (hashfunc) unicode_hash, /* tp_hash*/
8872 0, /* tp_call*/
8873 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008874 PyObject_GenericGetAttr, /* tp_getattro */
8875 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008877 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008878 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008879 unicode_doc, /* tp_doc */
8880 0, /* tp_traverse */
8881 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008882 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008883 0, /* tp_weaklistoffset */
8884 0, /* tp_iter */
8885 0, /* tp_iternext */
8886 unicode_methods, /* tp_methods */
8887 0, /* tp_members */
8888 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008889 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008890 0, /* tp_dict */
8891 0, /* tp_descr_get */
8892 0, /* tp_descr_set */
8893 0, /* tp_dictoffset */
8894 0, /* tp_init */
8895 0, /* tp_alloc */
8896 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008897 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898};
8899
8900/* Initialize the Unicode implementation */
8901
Thomas Wouters78890102000-07-22 19:25:51 +00008902void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008904 int i;
8905
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008906 /* XXX - move this array to unicodectype.c ? */
8907 Py_UNICODE linebreak[] = {
8908 0x000A, /* LINE FEED */
8909 0x000D, /* CARRIAGE RETURN */
8910 0x001C, /* FILE SEPARATOR */
8911 0x001D, /* GROUP SEPARATOR */
8912 0x001E, /* RECORD SEPARATOR */
8913 0x0085, /* NEXT LINE */
8914 0x2028, /* LINE SEPARATOR */
8915 0x2029, /* PARAGRAPH SEPARATOR */
8916 };
8917
Fred Drakee4315f52000-05-09 19:53:39 +00008918 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008919 free_list = NULL;
8920 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008922 if (!unicode_empty)
8923 return;
8924
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008925 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008926 for (i = 0; i < 256; i++)
8927 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008928 if (PyType_Ready(&PyUnicode_Type) < 0)
8929 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008930
8931 /* initialize the linebreak bloom filter */
8932 bloom_linebreak = make_bloom_mask(
8933 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8934 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008935
8936 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937}
8938
8939/* Finalize the Unicode implementation */
8940
Christian Heimes3b718a72008-02-14 12:47:33 +00008941int
8942PyUnicode_ClearFreeList(void)
8943{
8944 int freelist_size = numfree;
8945 PyUnicodeObject *u;
8946
8947 for (u = free_list; u != NULL;) {
8948 PyUnicodeObject *v = u;
8949 u = *(PyUnicodeObject **)u;
8950 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00008951 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00008952 Py_XDECREF(v->defenc);
8953 PyObject_Del(v);
8954 numfree--;
8955 }
8956 free_list = NULL;
8957 assert(numfree == 0);
8958 return freelist_size;
8959}
8960
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961void
Thomas Wouters78890102000-07-22 19:25:51 +00008962_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008964 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008966 Py_XDECREF(unicode_empty);
8967 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008968
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008969 for (i = 0; i < 256; i++) {
8970 if (unicode_latin1[i]) {
8971 Py_DECREF(unicode_latin1[i]);
8972 unicode_latin1[i] = NULL;
8973 }
8974 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008975 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008977
Anthony Baxterac6bd462006-04-13 02:06:09 +00008978#ifdef __cplusplus
8979}
8980#endif
8981
8982
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008983/*
8984Local variables:
8985c-basic-offset: 4
8986indent-tabs-mode: nil
8987End:
8988*/