blob: 0aed3ab1abffccbd27f1860be3632f3dec941937 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Eric Smitha9f7d622008-02-17 19:46:49 +000045#include "formatter_unicode.h"
46
Guido van Rossumd57fd912000-03-10 22:53:23 +000047#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
Christian Heimes5b970ad2008-02-06 13:33:44 +000056#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Christian Heimes5b970ad2008-02-06 13:33:44 +000064 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Anthony Baxterac6bd462006-04-13 02:06:09 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Guido van Rossumd57fd912000-03-10 22:53:23 +000097/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000098static PyUnicodeObject *free_list;
99static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000101/* The empty Unicode object is shared to improve performance. */
102static PyUnicodeObject *unicode_empty;
103
104/* Single character Unicode strings in the Latin-1 range are being
105 shared as well. */
106static PyUnicodeObject *unicode_latin1[256];
107
Fred Drakee4315f52000-05-09 19:53:39 +0000108/* Default encoding to use and assume when NULL is passed as encoding
109 parameter; it is initialized by _PyUnicode_Init().
110
111 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000112 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000113
114*/
Fred Drakee4315f52000-05-09 19:53:39 +0000115static char unicode_default_encoding[100];
116
/* Fast detection of the most frequent whitespace characters.
   Entry c is 1 when the 7-bit code point c counts as whitespace and 0
   otherwise.  Each row below covers sixteen consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
147
/* Fast detection of line break characters in the 7-bit range.
   Entry c is 1 when code point c ends a line and 0 otherwise.  Each row
   below covers sixteen consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000179 return 0x10FFFF;
180#else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
194
/* Integer type holding a bloom bitmask; only bits 0..31 are used. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters (filled in outside this
   section of the file). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of ch is set in
   mask.  The shifted constant is an unsigned long because the bit index
   can be 31, and (1 << 31) on a plain int is undefined behaviour. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero when ch is a line break: code points below 128 are looked up
   in the ascii_linebreak table, everything else must pass the bloom mask
   before the exact Py_UNICODE_ISLINEBREAK() test is consulted. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000205Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000206{
207 /* calculate simple bloom-style bitmask for a given unicode string */
208
209 long mask;
210 Py_ssize_t i;
211
212 mask = 0;
213 for (i = 0; i < len; i++)
214 mask |= (1 << (ptr[i] & 0x1F));
215
216 return mask;
217}
218
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000219Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000220{
221 Py_ssize_t i;
222
223 for (i = 0; i < setlen; i++)
224 if (set[i] == chr)
225 return 1;
226
Fredrik Lundh77633512006-05-23 19:47:35 +0000227 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228}
229
/* Non-zero when chr is one of the setlen characters in set.  The cheap
   bloom test on mask runs first, so unicode_member() only scans the set
   for characters that pass it.  The expansion and its arguments are
   parenthesised so the macro behaves as one operand inside a larger
   expression (for example after `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233/* --- Unicode Object ----------------------------------------------------- */
234
235static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000236int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000240
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245 /* Resizing shared object (unicode_empty or single character
246 objects) in-place is not allowed. Use PyUnicode_Resize()
247 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000248
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000249 if (unicode == unicode_empty ||
250 (unicode->length == 1 &&
251 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 return -1;
256 }
257
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000258 /* We allocate one more byte to make sure the string is Ux0000 terminated.
259 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000260 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000261 it contains). */
262
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 oldstr = unicode->str;
264 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
265 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000266 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_NoMemory();
268 return -1;
269 }
270 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000271 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000273 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000275 if (unicode->defenc) {
276 Py_DECREF(unicode->defenc);
277 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000280
Guido van Rossumd57fd912000-03-10 22:53:23 +0000281 return 0;
282}
283
284/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000285 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286
287 XXX This allocator could further be enhanced by assuring that the
288 free list never reduces its size below 1.
289
290*/
291
292static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000293PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
295 register PyUnicodeObject *unicode;
296
Andrew Dalkee0df7622006-05-27 11:04:36 +0000297 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 if (length == 0 && unicode_empty != NULL) {
299 Py_INCREF(unicode_empty);
300 return unicode_empty;
301 }
302
303 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000304 if (free_list) {
305 unicode = free_list;
306 free_list = *(PyUnicodeObject **)unicode;
307 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000309 /* Keep-Alive optimization: we only upsize the buffer,
310 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000311 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000312 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000314 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 }
316 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000317 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000319 }
320 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 }
322 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000323 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode == NULL)
325 return NULL;
326 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
327 }
328
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000329 if (!unicode->str) {
330 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000331 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000332 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000333 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000334 * the caller fails before initializing str -- unicode_resize()
335 * reads str[0], and the Keep-Alive optimization can keep memory
336 * allocated for str alive across a call to unicode_dealloc(unicode).
337 * We don't want unicode_resize to read uninitialized memory in
338 * that case.
339 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000342 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000344 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000346
347 onError:
348 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000349 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351}
352
353static
Guido van Rossum9475a232001-10-05 20:51:39 +0000354void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000356 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000357 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000358 /* Keep-Alive optimization */
359 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000360 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->str = NULL;
362 unicode->length = 0;
363 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 if (unicode->defenc) {
365 Py_DECREF(unicode->defenc);
366 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 }
368 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000369 *(PyUnicodeObject **)unicode = free_list;
370 free_list = unicode;
371 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 }
373 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000374 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000376 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 }
378}
379
Martin v. Löwis18e16552006-02-15 17:27:45 +0000380int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000381{
382 register PyUnicodeObject *v;
383
384 /* Argument checks */
385 if (unicode == NULL) {
386 PyErr_BadInternalCall();
387 return -1;
388 }
389 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000390 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 PyErr_BadInternalCall();
392 return -1;
393 }
394
395 /* Resizing unicode_empty and single character objects is not
396 possible since these are being shared. We simply return a fresh
397 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000398 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000399 (v == unicode_empty || v->length == 1)) {
400 PyUnicodeObject *w = _PyUnicode_New(length);
401 if (w == NULL)
402 return -1;
403 Py_UNICODE_COPY(w->str, v->str,
404 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000405 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000406 *unicode = (PyObject *)w;
407 return 0;
408 }
409
410 /* Note that we don't have to modify *unicode for unshared Unicode
411 objects, since we can modify them in-place. */
412 return unicode_resize(v, length);
413}
414
/* Internal API for use in unicodeobject.c only !
   Shorthand for PyUnicode_Resize() that accepts the address of any
   object pointer variable and casts it to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
418
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000420 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421{
422 PyUnicodeObject *unicode;
423
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 /* If the Unicode data is known at construction time, we can apply
425 some optimizations which share commonly used objects. */
426 if (u != NULL) {
427
428 /* Optimization for empty strings */
429 if (size == 0 && unicode_empty != NULL) {
430 Py_INCREF(unicode_empty);
431 return (PyObject *)unicode_empty;
432 }
433
434 /* Single character Unicode objects in the Latin-1 range are
435 shared when using this constructor */
436 if (size == 1 && *u < 256) {
437 unicode = unicode_latin1[*u];
438 if (!unicode) {
439 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 if (!unicode)
441 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000442 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 unicode_latin1[*u] = unicode;
444 }
445 Py_INCREF(unicode);
446 return (PyObject *)unicode;
447 }
448 }
Tim Petersced69f82003-09-16 20:30:58 +0000449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450 unicode = _PyUnicode_New(size);
451 if (!unicode)
452 return NULL;
453
454 /* Copy the Unicode data into the new object */
455 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000456 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 return (PyObject *)unicode;
459}
460
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000461PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
462{
463 PyUnicodeObject *unicode;
464 /* If the Unicode data is known at construction time, we can apply
465 some optimizations which share commonly used objects.
466 Also, this means the input must be UTF-8, so fall back to the
467 UTF-8 decoder at the end. */
468 if (u != NULL) {
469
470 /* Optimization for empty strings */
471 if (size == 0 && unicode_empty != NULL) {
472 Py_INCREF(unicode_empty);
473 return (PyObject *)unicode_empty;
474 }
475
476 /* Single characters are shared when using this constructor.
477 Restrict to ASCII, since the input must be UTF-8. */
478 if (size == 1 && Py_CHARMASK(*u) < 128) {
479 unicode = unicode_latin1[Py_CHARMASK(*u)];
480 if (!unicode) {
481 unicode = _PyUnicode_New(1);
482 if (!unicode)
483 return NULL;
484 unicode->str[0] = Py_CHARMASK(*u);
485 unicode_latin1[Py_CHARMASK(*u)] = unicode;
486 }
487 Py_INCREF(unicode);
488 return (PyObject *)unicode;
489 }
490
491 return PyUnicode_DecodeUTF8(u, size, NULL);
492 }
493
494 unicode = _PyUnicode_New(size);
495 if (!unicode)
496 return NULL;
497
498 return (PyObject *)unicode;
499}
500
501PyObject *PyUnicode_FromString(const char *u)
502{
503 size_t size = strlen(u);
504 if (size > PY_SSIZE_T_MAX) {
505 PyErr_SetString(PyExc_OverflowError, "input too long");
506 return NULL;
507 }
508
509 return PyUnicode_FromStringAndSize(u, size);
510}
511
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512#ifdef HAVE_WCHAR_H
513
514PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000515 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516{
517 PyUnicodeObject *unicode;
518
519 if (w == NULL) {
520 PyErr_BadInternalCall();
521 return NULL;
522 }
523
524 unicode = _PyUnicode_New(size);
525 if (!unicode)
526 return NULL;
527
528 /* Copy the wchar_t data into the new object */
529#ifdef HAVE_USABLE_WCHAR_T
530 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000531#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532 {
533 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000534 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000536 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537 *u++ = *w++;
538 }
539#endif
540
541 return (PyObject *)unicode;
542}
543
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000544static void
545makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
546{
547 *fmt++ = '%';
548 if (width) {
549 if (zeropad)
550 *fmt++ = '0';
551 fmt += sprintf(fmt, "%d", width);
552 }
553 if (precision)
554 fmt += sprintf(fmt, ".%d", precision);
555 if (longflag)
556 *fmt++ = 'l';
557 else if (size_tflag) {
558 char *f = PY_FORMAT_SIZE_T;
559 while (*f)
560 *fmt++ = *f++;
561 }
562 *fmt++ = c;
563 *fmt = '\0';
564}
565
566#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
567
568PyObject *
569PyUnicode_FromFormatV(const char *format, va_list vargs)
570{
571 va_list count;
572 Py_ssize_t callcount = 0;
573 PyObject **callresults = NULL;
574 PyObject **callresult = NULL;
575 Py_ssize_t n = 0;
576 int width = 0;
577 int precision = 0;
578 int zeropad;
579 const char* f;
580 Py_UNICODE *s;
581 PyObject *string;
582 /* used by sprintf */
583 char buffer[21];
584 /* use abuffer instead of buffer, if we need more space
585 * (which can happen if there's a format specifier with width). */
586 char *abuffer = NULL;
587 char *realbuffer;
588 Py_ssize_t abuffersize = 0;
589 char fmt[60]; /* should be enough for %0width.precisionld */
590 const char *copy;
591
592#ifdef VA_LIST_IS_ARRAY
593 Py_MEMCPY(count, vargs, sizeof(va_list));
594#else
595#ifdef __va_copy
596 __va_copy(count, vargs);
597#else
598 count = vargs;
599#endif
600#endif
601 /* step 1: count the number of %S/%R format specifications
602 * (we call PyObject_Str()/PyObject_Repr() for these objects
603 * once during step 3 and put the result in an array) */
604 for (f = format; *f; f++) {
605 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
606 ++callcount;
607 }
608 /* step 2: allocate memory for the results of
609 * PyObject_Str()/PyObject_Repr() calls */
610 if (callcount) {
611 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
612 if (!callresults) {
613 PyErr_NoMemory();
614 return NULL;
615 }
616 callresult = callresults;
617 }
618 /* step 3: figure out how large a buffer we need */
619 for (f = format; *f; f++) {
620 if (*f == '%') {
621 const char* p = f;
622 width = 0;
623 while (isdigit(*f))
624 width = (width*10) + *f++ - '0';
625 while (*++f && *f != '%' && !isalpha(*f))
626 ;
627
628 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
629 * they don't affect the amount of space we reserve.
630 */
631 if ((*f == 'l' || *f == 'z') &&
632 (f[1] == 'd' || f[1] == 'u'))
633 ++f;
634
635 switch (*f) {
636 case 'c':
637 (void)va_arg(count, int);
638 /* fall through... */
639 case '%':
640 n++;
641 break;
642 case 'd': case 'u': case 'i': case 'x':
643 (void) va_arg(count, int);
644 /* 20 bytes is enough to hold a 64-bit
645 integer. Decimal takes the most space.
646 This isn't enough for octal.
647 If a width is specified we need more
648 (which we allocate later). */
649 if (width < 20)
650 width = 20;
651 n += width;
652 if (abuffersize < width)
653 abuffersize = width;
654 break;
655 case 's':
656 {
657 /* UTF-8 */
658 unsigned char*s;
659 s = va_arg(count, unsigned char*);
660 while (*s) {
661 if (*s < 128) {
662 n++; s++;
663 } else if (*s < 0xc0) {
664 /* invalid UTF-8 */
665 n++; s++;
666 } else if (*s < 0xc0) {
667 n++;
668 s++; if(!*s)break;
669 s++;
670 } else if (*s < 0xe0) {
671 n++;
672 s++; if(!*s)break;
673 s++; if(!*s)break;
674 s++;
675 } else {
676 #ifdef Py_UNICODE_WIDE
677 n++;
678 #else
679 n+=2;
680 #endif
681 s++; if(!*s)break;
682 s++; if(!*s)break;
683 s++; if(!*s)break;
684 s++;
685 }
686 }
687 break;
688 }
689 case 'U':
690 {
691 PyObject *obj = va_arg(count, PyObject *);
692 assert(obj && PyUnicode_Check(obj));
693 n += PyUnicode_GET_SIZE(obj);
694 break;
695 }
696 case 'V':
697 {
698 PyObject *obj = va_arg(count, PyObject *);
699 const char *str = va_arg(count, const char *);
700 assert(obj || str);
701 assert(!obj || PyUnicode_Check(obj));
702 if (obj)
703 n += PyUnicode_GET_SIZE(obj);
704 else
705 n += strlen(str);
706 break;
707 }
708 case 'S':
709 {
710 PyObject *obj = va_arg(count, PyObject *);
711 PyObject *str;
712 assert(obj);
713 str = PyObject_Str(obj);
714 if (!str)
715 goto fail;
716 n += PyUnicode_GET_SIZE(str);
717 /* Remember the str and switch to the next slot */
718 *callresult++ = str;
719 break;
720 }
721 case 'R':
722 {
723 PyObject *obj = va_arg(count, PyObject *);
724 PyObject *repr;
725 assert(obj);
726 repr = PyObject_Repr(obj);
727 if (!repr)
728 goto fail;
729 n += PyUnicode_GET_SIZE(repr);
730 /* Remember the repr and switch to the next slot */
731 *callresult++ = repr;
732 break;
733 }
734 case 'p':
735 (void) va_arg(count, int);
736 /* maximum 64-bit pointer representation:
737 * 0xffffffffffffffff
738 * so 19 characters is enough.
739 * XXX I count 18 -- what's the extra for?
740 */
741 n += 19;
742 break;
743 default:
744 /* if we stumble upon an unknown
745 formatting code, copy the rest of
746 the format string to the output
747 string. (we cannot just skip the
748 code, since there's no way to know
749 what's in the argument list) */
750 n += strlen(p);
751 goto expand;
752 }
753 } else
754 n++;
755 }
756 expand:
757 if (abuffersize > 20) {
758 abuffer = PyMem_Malloc(abuffersize);
759 if (!abuffer) {
760 PyErr_NoMemory();
761 goto fail;
762 }
763 realbuffer = abuffer;
764 }
765 else
766 realbuffer = buffer;
767 /* step 4: fill the buffer */
768 /* Since we've analyzed how much space we need for the worst case,
769 we don't have to resize the string.
770 There can be no errors beyond this point. */
771 string = PyUnicode_FromUnicode(NULL, n);
772 if (!string)
773 goto fail;
774
775 s = PyUnicode_AS_UNICODE(string);
776 callresult = callresults;
777
778 for (f = format; *f; f++) {
779 if (*f == '%') {
780 const char* p = f++;
781 int longflag = 0;
782 int size_tflag = 0;
783 zeropad = (*f == '0');
784 /* parse the width.precision part */
785 width = 0;
786 while (isdigit(*f))
787 width = (width*10) + *f++ - '0';
788 precision = 0;
789 if (*f == '.') {
790 f++;
791 while (isdigit(*f))
792 precision = (precision*10) + *f++ - '0';
793 }
794 /* handle the long flag, but only for %ld and %lu.
795 others can be added when necessary. */
796 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
797 longflag = 1;
798 ++f;
799 }
800 /* handle the size_t flag. */
801 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
802 size_tflag = 1;
803 ++f;
804 }
805
806 switch (*f) {
807 case 'c':
808 *s++ = va_arg(vargs, int);
809 break;
810 case 'd':
811 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
812 if (longflag)
813 sprintf(realbuffer, fmt, va_arg(vargs, long));
814 else if (size_tflag)
815 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
816 else
817 sprintf(realbuffer, fmt, va_arg(vargs, int));
818 appendstring(realbuffer);
819 break;
820 case 'u':
821 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
822 if (longflag)
823 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
824 else if (size_tflag)
825 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
826 else
827 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
828 appendstring(realbuffer);
829 break;
830 case 'i':
831 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
832 sprintf(realbuffer, fmt, va_arg(vargs, int));
833 appendstring(realbuffer);
834 break;
835 case 'x':
836 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
837 sprintf(realbuffer, fmt, va_arg(vargs, int));
838 appendstring(realbuffer);
839 break;
840 case 's':
841 {
842 /* Parameter must be UTF-8 encoded.
843 In case of encoding errors, use
844 the replacement character. */
845 PyObject *u;
846 p = va_arg(vargs, char*);
847 u = PyUnicode_DecodeUTF8(p, strlen(p),
848 "replace");
849 if (!u)
850 goto fail;
851 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
852 PyUnicode_GET_SIZE(u));
853 s += PyUnicode_GET_SIZE(u);
854 Py_DECREF(u);
855 break;
856 }
857 case 'U':
858 {
859 PyObject *obj = va_arg(vargs, PyObject *);
860 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
861 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
862 s += size;
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(vargs, PyObject *);
868 const char *str = va_arg(vargs, const char *);
869 if (obj) {
870 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
871 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
872 s += size;
873 } else {
874 appendstring(str);
875 }
876 break;
877 }
878 case 'S':
879 case 'R':
880 {
881 Py_UNICODE *ucopy;
882 Py_ssize_t usize;
883 Py_ssize_t upos;
884 /* unused, since we already have the result */
885 (void) va_arg(vargs, PyObject *);
886 ucopy = PyUnicode_AS_UNICODE(*callresult);
887 usize = PyUnicode_GET_SIZE(*callresult);
888 for (upos = 0; upos<usize;)
889 *s++ = ucopy[upos++];
890 /* We're done with the unicode()/repr() => forget it */
891 Py_DECREF(*callresult);
892 /* switch to next unicode()/repr() result */
893 ++callresult;
894 break;
895 }
896 case 'p':
897 sprintf(buffer, "%p", va_arg(vargs, void*));
898 /* %p is ill-defined: ensure leading 0x. */
899 if (buffer[1] == 'X')
900 buffer[1] = 'x';
901 else if (buffer[1] != 'x') {
902 memmove(buffer+2, buffer, strlen(buffer)+1);
903 buffer[0] = '0';
904 buffer[1] = 'x';
905 }
906 appendstring(buffer);
907 break;
908 case '%':
909 *s++ = '%';
910 break;
911 default:
912 appendstring(p);
913 goto end;
914 }
915 } else
916 *s++ = *f;
917 }
918
919 end:
920 if (callresults)
921 PyMem_Free(callresults);
922 if (abuffer)
923 PyMem_Free(abuffer);
924 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
925 return string;
926 fail:
927 if (callresults) {
928 PyObject **callresult2 = callresults;
929 while (callresult2 < callresult) {
930 Py_DECREF(*callresult2);
931 ++callresult2;
932 }
933 PyMem_Free(callresults);
934 }
935 if (abuffer)
936 PyMem_Free(abuffer);
937 return NULL;
938}
939
940#undef appendstring
941
942PyObject *
943PyUnicode_FromFormat(const char *format, ...)
944{
945 PyObject* ret;
946 va_list vargs;
947
948#ifdef HAVE_STDARG_PROTOTYPES
949 va_start(vargs, format);
950#else
951 va_start(vargs);
952#endif
953 ret = PyUnicode_FromFormatV(format, vargs);
954 va_end(vargs);
955 return ret;
956}
957
Martin v. Löwis18e16552006-02-15 17:27:45 +0000958Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
959 wchar_t *w,
960 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961{
962 if (unicode == NULL) {
963 PyErr_BadInternalCall();
964 return -1;
965 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000966
967 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000968 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000969 size = PyUnicode_GET_SIZE(unicode) + 1;
970
Guido van Rossumd57fd912000-03-10 22:53:23 +0000971#ifdef HAVE_USABLE_WCHAR_T
972 memcpy(w, unicode->str, size * sizeof(wchar_t));
973#else
974 {
975 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000976 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000977 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000978 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979 *w++ = *u++;
980 }
981#endif
982
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000983 if (size > PyUnicode_GET_SIZE(unicode))
984 return PyUnicode_GET_SIZE(unicode);
985 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986 return size;
987}
988
989#endif
990
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000991PyObject *PyUnicode_FromOrdinal(int ordinal)
992{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000993 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000994
995#ifdef Py_UNICODE_WIDE
996 if (ordinal < 0 || ordinal > 0x10ffff) {
997 PyErr_SetString(PyExc_ValueError,
998 "unichr() arg not in range(0x110000) "
999 "(wide Python build)");
1000 return NULL;
1001 }
1002#else
1003 if (ordinal < 0 || ordinal > 0xffff) {
1004 PyErr_SetString(PyExc_ValueError,
1005 "unichr() arg not in range(0x10000) "
1006 "(narrow Python build)");
1007 return NULL;
1008 }
1009#endif
1010
Hye-Shik Chang40574832004-04-06 07:24:51 +00001011 s[0] = (Py_UNICODE)ordinal;
1012 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001013}
1014
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015PyObject *PyUnicode_FromObject(register PyObject *obj)
1016{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017 /* XXX Perhaps we should make this API an alias of
1018 PyObject_Unicode() instead ?! */
1019 if (PyUnicode_CheckExact(obj)) {
1020 Py_INCREF(obj);
1021 return obj;
1022 }
1023 if (PyUnicode_Check(obj)) {
1024 /* For a Unicode subtype that's not a Unicode object,
1025 return a true Unicode object with the same data. */
1026 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1027 PyUnicode_GET_SIZE(obj));
1028 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001029 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1030}
1031
1032PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1033 const char *encoding,
1034 const char *errors)
1035{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001036 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001037 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001038 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001039
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 if (obj == NULL) {
1041 PyErr_BadInternalCall();
1042 return NULL;
1043 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001044
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001045#if 0
1046 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001047 that no encodings is given and then redirect to
1048 PyObject_Unicode() which then applies the additional logic for
1049 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001050
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001051 NOTE: This API should really only be used for object which
1052 represent *encoded* Unicode !
1053
1054 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001055 if (PyUnicode_Check(obj)) {
1056 if (encoding) {
1057 PyErr_SetString(PyExc_TypeError,
1058 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001059 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001060 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001061 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001062 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063#else
1064 if (PyUnicode_Check(obj)) {
1065 PyErr_SetString(PyExc_TypeError,
1066 "decoding Unicode is not supported");
1067 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001068 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001069#endif
1070
1071 /* Coerce object */
1072 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001073 s = PyString_AS_STRING(obj);
1074 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001075 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001076 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1077 /* Overwrite the error message with something more useful in
1078 case of a TypeError. */
1079 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001080 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001081 "coercing to Unicode: need string or buffer, "
1082 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001083 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001084 goto onError;
1085 }
Tim Petersced69f82003-09-16 20:30:58 +00001086
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001087 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (len == 0) {
1089 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001090 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001094
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095 return v;
1096
1097 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099}
1100
1101PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001102 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 const char *encoding,
1104 const char *errors)
1105{
1106 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001107
1108 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001109 encoding = PyUnicode_GetDefaultEncoding();
1110
1111 /* Shortcuts for common default encodings */
1112 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001114 else if (strcmp(encoding, "latin-1") == 0)
1115 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001116#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1117 else if (strcmp(encoding, "mbcs") == 0)
1118 return PyUnicode_DecodeMBCS(s, size, errors);
1119#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001120 else if (strcmp(encoding, "ascii") == 0)
1121 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122
1123 /* Decode via the codec registry */
1124 buffer = PyBuffer_FromMemory((void *)s, size);
1125 if (buffer == NULL)
1126 goto onError;
1127 unicode = PyCodec_Decode(buffer, encoding, errors);
1128 if (unicode == NULL)
1129 goto onError;
1130 if (!PyUnicode_Check(unicode)) {
1131 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001132 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001133 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 Py_DECREF(unicode);
1135 goto onError;
1136 }
1137 Py_DECREF(buffer);
1138 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001139
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 onError:
1141 Py_XDECREF(buffer);
1142 return NULL;
1143}
1144
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001145PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1146 const char *encoding,
1147 const char *errors)
1148{
1149 PyObject *v;
1150
1151 if (!PyUnicode_Check(unicode)) {
1152 PyErr_BadArgument();
1153 goto onError;
1154 }
1155
1156 if (encoding == NULL)
1157 encoding = PyUnicode_GetDefaultEncoding();
1158
1159 /* Decode via the codec registry */
1160 v = PyCodec_Decode(unicode, encoding, errors);
1161 if (v == NULL)
1162 goto onError;
1163 return v;
1164
1165 onError:
1166 return NULL;
1167}
1168
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001170 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 const char *encoding,
1172 const char *errors)
1173{
1174 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001175
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 unicode = PyUnicode_FromUnicode(s, size);
1177 if (unicode == NULL)
1178 return NULL;
1179 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1180 Py_DECREF(unicode);
1181 return v;
1182}
1183
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001184PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1185 const char *encoding,
1186 const char *errors)
1187{
1188 PyObject *v;
1189
1190 if (!PyUnicode_Check(unicode)) {
1191 PyErr_BadArgument();
1192 goto onError;
1193 }
1194
1195 if (encoding == NULL)
1196 encoding = PyUnicode_GetDefaultEncoding();
1197
1198 /* Encode via the codec registry */
1199 v = PyCodec_Encode(unicode, encoding, errors);
1200 if (v == NULL)
1201 goto onError;
1202 return v;
1203
1204 onError:
1205 return NULL;
1206}
1207
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1209 const char *encoding,
1210 const char *errors)
1211{
1212 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_BadArgument();
1216 goto onError;
1217 }
Fred Drakee4315f52000-05-09 19:53:39 +00001218
Tim Petersced69f82003-09-16 20:30:58 +00001219 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001220 encoding = PyUnicode_GetDefaultEncoding();
1221
1222 /* Shortcuts for common default encodings */
1223 if (errors == NULL) {
1224 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001225 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001226 else if (strcmp(encoding, "latin-1") == 0)
1227 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001228#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1229 else if (strcmp(encoding, "mbcs") == 0)
1230 return PyUnicode_AsMBCSString(unicode);
1231#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001232 else if (strcmp(encoding, "ascii") == 0)
1233 return PyUnicode_AsASCIIString(unicode);
1234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235
1236 /* Encode via the codec registry */
1237 v = PyCodec_Encode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 if (!PyString_Check(v)) {
1241 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001242 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001243 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 Py_DECREF(v);
1245 goto onError;
1246 }
1247 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001248
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 onError:
1250 return NULL;
1251}
1252
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001253PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1254 const char *errors)
1255{
1256 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1257
1258 if (v)
1259 return v;
1260 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1261 if (v && errors == NULL)
1262 ((PyUnicodeObject *)unicode)->defenc = v;
1263 return v;
1264}
1265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1267{
1268 if (!PyUnicode_Check(unicode)) {
1269 PyErr_BadArgument();
1270 goto onError;
1271 }
1272 return PyUnicode_AS_UNICODE(unicode);
1273
1274 onError:
1275 return NULL;
1276}
1277
Martin v. Löwis18e16552006-02-15 17:27:45 +00001278Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279{
1280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_BadArgument();
1282 goto onError;
1283 }
1284 return PyUnicode_GET_SIZE(unicode);
1285
1286 onError:
1287 return -1;
1288}
1289
/* Return the current default encoding name.  The pointer refers to the
   file-level unicode_default_encoding buffer itself, not to a copy. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
1294
1295int PyUnicode_SetDefaultEncoding(const char *encoding)
1296{
1297 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001298
Fred Drakee4315f52000-05-09 19:53:39 +00001299 /* Make sure the encoding is valid. As side effect, this also
1300 loads the encoding into the codec registry cache. */
1301 v = _PyCodec_Lookup(encoding);
1302 if (v == NULL)
1303 goto onError;
1304 Py_DECREF(v);
1305 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001306 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001307 sizeof(unicode_default_encoding));
1308 return 0;
1309
1310 onError:
1311 return -1;
1312}
1313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001314/* error handling callback helper:
1315 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001316 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 and adjust various state variables.
1318 return 0 on success, -1 on error
1319*/
1320
1321static
1322int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1323 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001324 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1325 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001326 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001328 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329
1330 PyObject *restuple = NULL;
1331 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001332 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1333 Py_ssize_t requiredsize;
1334 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001336 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 int res = -1;
1338
1339 if (*errorHandler == NULL) {
1340 *errorHandler = PyCodec_LookupError(errors);
1341 if (*errorHandler == NULL)
1342 goto onError;
1343 }
1344
1345 if (*exceptionObject == NULL) {
1346 *exceptionObject = PyUnicodeDecodeError_Create(
1347 encoding, input, insize, *startinpos, *endinpos, reason);
1348 if (*exceptionObject == NULL)
1349 goto onError;
1350 }
1351 else {
1352 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1353 goto onError;
1354 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1355 goto onError;
1356 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1357 goto onError;
1358 }
1359
1360 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1361 if (restuple == NULL)
1362 goto onError;
1363 if (!PyTuple_Check(restuple)) {
1364 PyErr_Format(PyExc_TypeError, &argparse[4]);
1365 goto onError;
1366 }
1367 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1368 goto onError;
1369 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001370 newpos = insize+newpos;
1371 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001372 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001373 goto onError;
1374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375
1376 /* need more space? (at least enough for what we
1377 have+the replacement+the rest of the string (starting
1378 at the new input position), so we won't have to check space
1379 when there are no errors in the rest of the string) */
1380 repptr = PyUnicode_AS_UNICODE(repunicode);
1381 repsize = PyUnicode_GET_SIZE(repunicode);
1382 requiredsize = *outpos + repsize + insize-newpos;
1383 if (requiredsize > outsize) {
1384 if (requiredsize<2*outsize)
1385 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001386 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 goto onError;
1388 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1389 }
1390 *endinpos = newpos;
1391 *inptr = input + newpos;
1392 Py_UNICODE_COPY(*outptr, repptr, repsize);
1393 *outptr += repsize;
1394 *outpos += repsize;
1395 /* we made it! */
1396 res = 0;
1397
1398 onError:
1399 Py_XDECREF(restuple);
1400 return res;
1401}
1402
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001403/* --- UTF-7 Codec -------------------------------------------------------- */
1404
1405/* see RFC2152 for details */
1406
/* Classification of the 128 ASCII code points for the UTF-7 codec.  The
   table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1425
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001426/* Note: The comparison (c) <= 0 is a trick to work-around gcc
1427 warnings about the comparison always being false; since
1428 utf7_special[0] is 1, we can safely make that one comparison
1429 true */
1430
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be written
   directly.  That is the case when c is outside 1..127, when its
   utf7_special class is 1, when it is class 2 (whitespace) and encodeWS is
   set, or when it is class 3 (Set O) and encodeO is set.  The range tests
   come first so the table is never indexed out of bounds.  c is evaluated
   several times, so it must be free of side effects. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))
/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c belongs to the base64 alphabet.  Explicit range
   tests are used instead of isalnum(): they are independent of the locale
   and well defined for every integer value, whereas isalnum() is undefined
   for arguments outside the unsigned char range (plus EOF).  Like UB64()
   below this assumes an ASCII-compatible execution character set.  c is
   evaluated several times, so it must be free of side effects. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
/* UB64(c): six-bit value of the base64 character c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001443
/* ENCODE(out, ch, bits): while at least six bits are pending in ch, write
   the base64 character for the topmost six pending bits through out
   (advancing it) and reduce bits by six.  Expands to a bare while
   statement; out and bits are modified in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }
1449
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, extract the topmost 16 pending bits as one Py_UNICODE and reduce bits
   by 16.  A value in 0xDC00..0xDFFF sets surrogate, stores a message in
   errmsg and jumps to utf7Error; the 16-bit unit that follows such an
   error is dropped and surrogate is cleared again.  Any other value is
   written through out.  The expansion site must provide the errmsg
   variable and the utf7Error label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001468
/* Decode `size` bytes of UTF-7 data at `s`.  Equivalent to calling
   PyUnicode_DecodeUTF7Stateful() with a NULL `consumed` pointer. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
1475
1476PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1477 Py_ssize_t size,
1478 const char *errors,
1479 Py_ssize_t *consumed)
1480{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001481 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001482 Py_ssize_t startinpos;
1483 Py_ssize_t endinpos;
1484 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001485 const char *e;
1486 PyUnicodeObject *unicode;
1487 Py_UNICODE *p;
1488 const char *errmsg = "";
1489 int inShift = 0;
1490 unsigned int bitsleft = 0;
1491 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001492 int surrogate = 0;
1493 PyObject *errorHandler = NULL;
1494 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001495
1496 unicode = _PyUnicode_New(size);
1497 if (!unicode)
1498 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001499 if (size == 0) {
1500 if (consumed)
1501 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001502 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001503 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001504
1505 p = unicode->str;
1506 e = s + size;
1507
1508 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001509 Py_UNICODE ch;
1510 restart:
1511 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512
1513 if (inShift) {
1514 if ((ch == '-') || !B64CHAR(ch)) {
1515 inShift = 0;
1516 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001517
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001518 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1519 if (bitsleft >= 6) {
1520 /* The shift sequence has a partial character in it. If
1521 bitsleft < 6 then we could just classify it as padding
1522 but that is not the case here */
1523
1524 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001525 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 }
1527 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001528 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001529 here so indicate the potential of a misencoded character. */
1530
1531 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1532 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1533 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001534 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535 }
1536
1537 if (ch == '-') {
1538 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001539 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001540 inShift = 1;
1541 }
1542 } else if (SPECIAL(ch,0,0)) {
1543 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001544 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001545 } else {
1546 *p++ = ch;
1547 }
1548 } else {
1549 charsleft = (charsleft << 6) | UB64(ch);
1550 bitsleft += 6;
1551 s++;
1552 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1553 }
1554 }
1555 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557 s++;
1558 if (s < e && *s == '-') {
1559 s++;
1560 *p++ = '+';
1561 } else
1562 {
1563 inShift = 1;
1564 bitsleft = 0;
1565 }
1566 }
1567 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001568 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001569 errmsg = "unexpected special character";
1570 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001571 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 }
1573 else {
1574 *p++ = ch;
1575 s++;
1576 }
1577 continue;
1578 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 outpos = p-PyUnicode_AS_UNICODE(unicode);
1580 endinpos = s-starts;
1581 if (unicode_decode_call_errorhandler(
1582 errors, &errorHandler,
1583 "utf7", errmsg,
1584 starts, size, &startinpos, &endinpos, &exc, &s,
1585 (PyObject **)&unicode, &outpos, &p))
1586 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 }
1588
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001589 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 outpos = p-PyUnicode_AS_UNICODE(unicode);
1591 endinpos = size;
1592 if (unicode_decode_call_errorhandler(
1593 errors, &errorHandler,
1594 "utf7", "unterminated shift sequence",
1595 starts, size, &startinpos, &endinpos, &exc, &s,
1596 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001598 if (s < e)
1599 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001600 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001601 if (consumed) {
1602 if(inShift)
1603 *consumed = startinpos;
1604 else
1605 *consumed = s-starts;
1606 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001608 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001609 goto onError;
1610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001611 Py_XDECREF(errorHandler);
1612 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613 return (PyObject *)unicode;
1614
1615onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001616 Py_XDECREF(errorHandler);
1617 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001618 Py_DECREF(unicode);
1619 return NULL;
1620}
1621
1622
1623PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001624 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625 int encodeSetO,
1626 int encodeWhiteSpace,
1627 const char *errors)
1628{
1629 PyObject *v;
1630 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001631 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001632 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001633 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001634 unsigned int bitsleft = 0;
1635 unsigned long charsleft = 0;
1636 char * out;
1637 char * start;
1638
1639 if (size == 0)
1640 return PyString_FromStringAndSize(NULL, 0);
1641
1642 v = PyString_FromStringAndSize(NULL, cbAllocated);
1643 if (v == NULL)
1644 return NULL;
1645
1646 start = out = PyString_AS_STRING(v);
1647 for (;i < size; ++i) {
1648 Py_UNICODE ch = s[i];
1649
1650 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001651 if (ch == '+') {
1652 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 *out++ = '-';
1654 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1655 charsleft = ch;
1656 bitsleft = 16;
1657 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001658 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001660 } else {
1661 *out++ = (char) ch;
1662 }
1663 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1665 *out++ = B64(charsleft << (6-bitsleft));
1666 charsleft = 0;
1667 bitsleft = 0;
1668 /* Characters not in the BASE64 set implicitly unshift the sequence
1669 so no '-' is required, except if the character is itself a '-' */
1670 if (B64CHAR(ch) || ch == '-') {
1671 *out++ = '-';
1672 }
1673 inShift = 0;
1674 *out++ = (char) ch;
1675 } else {
1676 bitsleft += 16;
1677 charsleft = (charsleft << 16) | ch;
1678 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1679
1680 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001681 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 or '-' then the shift sequence will be terminated implicitly and we
1683 don't have to insert a '-'. */
1684
1685 if (bitsleft == 0) {
1686 if (i + 1 < size) {
1687 Py_UNICODE ch2 = s[i+1];
1688
1689 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001690
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 } else if (B64CHAR(ch2) || ch2 == '-') {
1692 *out++ = '-';
1693 inShift = 0;
1694 } else {
1695 inShift = 0;
1696 }
1697
1698 }
1699 else {
1700 *out++ = '-';
1701 inShift = 0;
1702 }
1703 }
Tim Petersced69f82003-09-16 20:30:58 +00001704 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001705 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001707 if (bitsleft) {
1708 *out++= B64(charsleft << (6-bitsleft) );
1709 *out++ = '-';
1710 }
1711
Tim Peters5de98422002-04-27 18:44:32 +00001712 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001713 return v;
1714}
1715
1716#undef SPECIAL
1717#undef B64
1718#undef B64CHAR
1719#undef UB64
1720#undef ENCODE
1721#undef DECODE
1722
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723/* --- UTF-8 Codec -------------------------------------------------------- */
1724
/* Length of the UTF-8 sequence introduced by each possible first byte
   (see RFC 2279 for details).  A zero entry marks a byte that is not a
   legal prefix. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4,4,4,4,4,4,4,4,  5,5,5,5,  6,6,  0,0
};
1746
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001748 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 const char *errors)
1750{
Walter Dörwald69652032004-09-07 20:24:22 +00001751 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1752}
1753
1754PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001755 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001756 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001757 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001761 Py_ssize_t startinpos;
1762 Py_ssize_t endinpos;
1763 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 const char *e;
1765 PyUnicodeObject *unicode;
1766 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 PyObject *errorHandler = NULL;
1769 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
1771 /* Note: size will always be longer than the resulting Unicode
1772 character count */
1773 unicode = _PyUnicode_New(size);
1774 if (!unicode)
1775 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001776 if (size == 0) {
1777 if (consumed)
1778 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781
1782 /* Unpack UTF-8 encoded data */
1783 p = unicode->str;
1784 e = s + size;
1785
1786 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001787 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788
1789 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001790 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791 s++;
1792 continue;
1793 }
1794
1795 n = utf8_code_length[ch];
1796
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001797 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001798 if (consumed)
1799 break;
1800 else {
1801 errmsg = "unexpected end of data";
1802 startinpos = s-starts;
1803 endinpos = size;
1804 goto utf8Error;
1805 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807
1808 switch (n) {
1809
1810 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001811 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 startinpos = s-starts;
1813 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001814 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815
1816 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
1819 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821
1822 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001823 if ((s[1] & 0xc0) != 0x80) {
1824 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 startinpos = s-starts;
1826 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001827 goto utf8Error;
1828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001830 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 startinpos = s-starts;
1832 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001833 errmsg = "illegal encoding";
1834 goto utf8Error;
1835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 break;
1839
1840 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001841 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001842 (s[2] & 0xc0) != 0x80) {
1843 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 startinpos = s-starts;
1845 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 goto utf8Error;
1847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001849 if (ch < 0x0800) {
1850 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001851 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001852
1853 XXX For wide builds (UCS-4) we should probably try
1854 to recombine the surrogates into a single code
1855 unit.
1856 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 startinpos = s-starts;
1859 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 goto utf8Error;
1861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001863 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001864 break;
1865
1866 case 4:
1867 if ((s[1] & 0xc0) != 0x80 ||
1868 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001869 (s[3] & 0xc0) != 0x80) {
1870 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 startinpos = s-starts;
1872 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001873 goto utf8Error;
1874 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001875 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1876 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1877 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001878 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001879 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001880 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001881 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001882 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001883 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 startinpos = s-starts;
1885 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001886 goto utf8Error;
1887 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001888#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001889 *p++ = (Py_UNICODE)ch;
1890#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001891 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001892
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001893 /* translate from 10000..10FFFF to 0..FFFF */
1894 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001895
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001896 /* high surrogate = top 10 bits added to D800 */
1897 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001898
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001899 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001900 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 break;
1903
1904 default:
1905 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 startinpos = s-starts;
1908 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001909 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910 }
1911 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001912 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001913
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001914 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 outpos = p-PyUnicode_AS_UNICODE(unicode);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "utf8", errmsg,
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&unicode, &outpos, &p))
1921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 }
Walter Dörwald69652032004-09-07 20:24:22 +00001923 if (consumed)
1924 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925
1926 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001927 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928 goto onError;
1929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001930 Py_XDECREF(errorHandler);
1931 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 return (PyObject *)unicode;
1933
1934onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 Py_XDECREF(errorHandler);
1936 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 Py_DECREF(unicode);
1938 return NULL;
1939}
1940
Tim Peters602f7402002-04-27 18:03:26 +00001941/* Allocation strategy: if the string is short, convert into a stack buffer
1942 and allocate exactly as much space needed at the end. Else allocate the
1943 maximum possible needed (4 result bytes per Unicode character), and return
1944 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001945*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001946PyObject *
1947PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001948 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001949 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950{
Tim Peters602f7402002-04-27 18:03:26 +00001951#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001952
Martin v. Löwis18e16552006-02-15 17:27:45 +00001953 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001954 PyObject *v; /* result string object */
1955 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001956 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001957 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001958 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001959
Tim Peters602f7402002-04-27 18:03:26 +00001960 assert(s != NULL);
1961 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962
Tim Peters602f7402002-04-27 18:03:26 +00001963 if (size <= MAX_SHORT_UNICHARS) {
1964 /* Write into the stack buffer; nallocated can't overflow.
1965 * At the end, we'll allocate exactly as much heap space as it
1966 * turns out we need.
1967 */
1968 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1969 v = NULL; /* will allocate after we're done */
1970 p = stackbuf;
1971 }
1972 else {
1973 /* Overallocate on the heap, and give the excess back at the end. */
1974 nallocated = size * 4;
1975 if (nallocated / 4 != size) /* overflow! */
1976 return PyErr_NoMemory();
1977 v = PyString_FromStringAndSize(NULL, nallocated);
1978 if (v == NULL)
1979 return NULL;
1980 p = PyString_AS_STRING(v);
1981 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001982
Tim Peters602f7402002-04-27 18:03:26 +00001983 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001984 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001985
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001986 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001987 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001989
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001991 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001992 *p++ = (char)(0xc0 | (ch >> 6));
1993 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001994 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001995 else {
Tim Peters602f7402002-04-27 18:03:26 +00001996 /* Encode UCS2 Unicode ordinals */
1997 if (ch < 0x10000) {
1998 /* Special case: check for high surrogate */
1999 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2000 Py_UCS4 ch2 = s[i];
2001 /* Check for low surrogate and combine the two to
2002 form a UCS4 value */
2003 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002004 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002005 i++;
2006 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002007 }
Tim Peters602f7402002-04-27 18:03:26 +00002008 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002010 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002011 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2012 *p++ = (char)(0x80 | (ch & 0x3f));
2013 continue;
2014 }
2015encodeUCS4:
2016 /* Encode UCS4 Unicode ordinals */
2017 *p++ = (char)(0xf0 | (ch >> 18));
2018 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2019 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2020 *p++ = (char)(0x80 | (ch & 0x3f));
2021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002023
Tim Peters602f7402002-04-27 18:03:26 +00002024 if (v == NULL) {
2025 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002026 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002027 assert(nneeded <= nallocated);
2028 v = PyString_FromStringAndSize(stackbuf, nneeded);
2029 }
2030 else {
2031 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002032 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002033 assert(nneeded <= nallocated);
2034 _PyString_Resize(&v, nneeded);
2035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002037
Tim Peters602f7402002-04-27 18:03:26 +00002038#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039}
2040
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2042{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 if (!PyUnicode_Check(unicode)) {
2044 PyErr_BadArgument();
2045 return NULL;
2046 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002047 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2048 PyUnicode_GET_SIZE(unicode),
2049 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050}
2051
Walter Dörwald6e390802007-08-17 16:41:28 +00002052/* --- UTF-32 Codec ------------------------------------------------------- */
2053
2054PyObject *
2055PyUnicode_DecodeUTF32(const char *s,
2056 Py_ssize_t size,
2057 const char *errors,
2058 int *byteorder)
2059{
2060 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2061}
2062
2063PyObject *
2064PyUnicode_DecodeUTF32Stateful(const char *s,
2065 Py_ssize_t size,
2066 const char *errors,
2067 int *byteorder,
2068 Py_ssize_t *consumed)
2069{
2070 const char *starts = s;
2071 Py_ssize_t startinpos;
2072 Py_ssize_t endinpos;
2073 Py_ssize_t outpos;
2074 PyUnicodeObject *unicode;
2075 Py_UNICODE *p;
2076#ifndef Py_UNICODE_WIDE
2077 int i, pairs;
2078#else
2079 const int pairs = 0;
2080#endif
2081 const unsigned char *q, *e;
2082 int bo = 0; /* assume native ordering by default */
2083 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002084 /* Offsets from q for retrieving bytes in the right order. */
2085#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2086 int iorder[] = {0, 1, 2, 3};
2087#else
2088 int iorder[] = {3, 2, 1, 0};
2089#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002090 PyObject *errorHandler = NULL;
2091 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002092 /* On narrow builds we split characters outside the BMP into two
2093 codepoints => count how much extra space we need. */
2094#ifndef Py_UNICODE_WIDE
2095 for (i = pairs = 0; i < size/4; i++)
2096 if (((Py_UCS4 *)s)[i] >= 0x10000)
2097 pairs++;
2098#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002099
2100 /* This might be one to much, because of a BOM */
2101 unicode = _PyUnicode_New((size+3)/4+pairs);
2102 if (!unicode)
2103 return NULL;
2104 if (size == 0)
2105 return (PyObject *)unicode;
2106
2107 /* Unpack UTF-32 encoded data */
2108 p = unicode->str;
2109 q = (unsigned char *)s;
2110 e = q + size;
2111
2112 if (byteorder)
2113 bo = *byteorder;
2114
2115 /* Check for BOM marks (U+FEFF) in the input and adjust current
2116 byte order setting accordingly. In native mode, the leading BOM
2117 mark is skipped, in all other modes, it is copied to the output
2118 stream as-is (giving a ZWNBSP character). */
2119 if (bo == 0) {
2120 if (size >= 4) {
2121 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2122 (q[iorder[1]] << 8) | q[iorder[0]];
2123#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2124 if (bom == 0x0000FEFF) {
2125 q += 4;
2126 bo = -1;
2127 }
2128 else if (bom == 0xFFFE0000) {
2129 q += 4;
2130 bo = 1;
2131 }
2132#else
2133 if (bom == 0x0000FEFF) {
2134 q += 4;
2135 bo = 1;
2136 }
2137 else if (bom == 0xFFFE0000) {
2138 q += 4;
2139 bo = -1;
2140 }
2141#endif
2142 }
2143 }
2144
2145 if (bo == -1) {
2146 /* force LE */
2147 iorder[0] = 0;
2148 iorder[1] = 1;
2149 iorder[2] = 2;
2150 iorder[3] = 3;
2151 }
2152 else if (bo == 1) {
2153 /* force BE */
2154 iorder[0] = 3;
2155 iorder[1] = 2;
2156 iorder[2] = 1;
2157 iorder[3] = 0;
2158 }
2159
2160 while (q < e) {
2161 Py_UCS4 ch;
2162 /* remaining bytes at the end? (size should be divisible by 4) */
2163 if (e-q<4) {
2164 if (consumed)
2165 break;
2166 errmsg = "truncated data";
2167 startinpos = ((const char *)q)-starts;
2168 endinpos = ((const char *)e)-starts;
2169 goto utf32Error;
2170 /* The remaining input chars are ignored if the callback
2171 chooses to skip the input */
2172 }
2173 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2174 (q[iorder[1]] << 8) | q[iorder[0]];
2175
2176 if (ch >= 0x110000)
2177 {
2178 errmsg = "codepoint not in range(0x110000)";
2179 startinpos = ((const char *)q)-starts;
2180 endinpos = startinpos+4;
2181 goto utf32Error;
2182 }
2183#ifndef Py_UNICODE_WIDE
2184 if (ch >= 0x10000)
2185 {
2186 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2187 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2188 }
2189 else
2190#endif
2191 *p++ = ch;
2192 q += 4;
2193 continue;
2194 utf32Error:
2195 outpos = p-PyUnicode_AS_UNICODE(unicode);
2196 if (unicode_decode_call_errorhandler(
2197 errors, &errorHandler,
2198 "utf32", errmsg,
2199 starts, size, &startinpos, &endinpos, &exc, &s,
2200 (PyObject **)&unicode, &outpos, &p))
2201 goto onError;
2202 }
2203
2204 if (byteorder)
2205 *byteorder = bo;
2206
2207 if (consumed)
2208 *consumed = (const char *)q-starts;
2209
2210 /* Adjust length */
2211 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2212 goto onError;
2213
2214 Py_XDECREF(errorHandler);
2215 Py_XDECREF(exc);
2216 return (PyObject *)unicode;
2217
2218onError:
2219 Py_DECREF(unicode);
2220 Py_XDECREF(errorHandler);
2221 Py_XDECREF(exc);
2222 return NULL;
2223}
2224
2225PyObject *
2226PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2227 Py_ssize_t size,
2228 const char *errors,
2229 int byteorder)
2230{
2231 PyObject *v;
2232 unsigned char *p;
2233#ifndef Py_UNICODE_WIDE
2234 int i, pairs;
2235#else
2236 const int pairs = 0;
2237#endif
2238 /* Offsets from p for storing byte pairs in the right order. */
2239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2240 int iorder[] = {0, 1, 2, 3};
2241#else
2242 int iorder[] = {3, 2, 1, 0};
2243#endif
2244
2245#define STORECHAR(CH) \
2246 do { \
2247 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2248 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2249 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2250 p[iorder[0]] = (CH) & 0xff; \
2251 p += 4; \
2252 } while(0)
2253
2254 /* In narrow builds we can output surrogate pairs as one codepoint,
2255 so we need less space. */
2256#ifndef Py_UNICODE_WIDE
2257 for (i = pairs = 0; i < size-1; i++)
2258 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2259 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2260 pairs++;
2261#endif
2262 v = PyString_FromStringAndSize(NULL,
2263 4 * (size - pairs + (byteorder == 0)));
2264 if (v == NULL)
2265 return NULL;
2266
2267 p = (unsigned char *)PyString_AS_STRING(v);
2268 if (byteorder == 0)
2269 STORECHAR(0xFEFF);
2270 if (size == 0)
2271 return v;
2272
2273 if (byteorder == -1) {
2274 /* force LE */
2275 iorder[0] = 0;
2276 iorder[1] = 1;
2277 iorder[2] = 2;
2278 iorder[3] = 3;
2279 }
2280 else if (byteorder == 1) {
2281 /* force BE */
2282 iorder[0] = 3;
2283 iorder[1] = 2;
2284 iorder[2] = 1;
2285 iorder[3] = 0;
2286 }
2287
2288 while (size-- > 0) {
2289 Py_UCS4 ch = *s++;
2290#ifndef Py_UNICODE_WIDE
2291 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2292 Py_UCS4 ch2 = *s;
2293 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2294 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2295 s++;
2296 size--;
2297 }
2298 }
2299#endif
2300 STORECHAR(ch);
2301 }
2302 return v;
2303#undef STORECHAR
2304}
2305
2306PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2307{
2308 if (!PyUnicode_Check(unicode)) {
2309 PyErr_BadArgument();
2310 return NULL;
2311 }
2312 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2313 PyUnicode_GET_SIZE(unicode),
2314 NULL,
2315 0);
2316}
2317
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318/* --- UTF-16 Codec ------------------------------------------------------- */
2319
Tim Peters772747b2001-08-09 22:21:55 +00002320PyObject *
2321PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002322 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002323 const char *errors,
2324 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325{
Walter Dörwald69652032004-09-07 20:24:22 +00002326 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2327}
2328
2329PyObject *
2330PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002331 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002332 const char *errors,
2333 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002334 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002335{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002336 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002337 Py_ssize_t startinpos;
2338 Py_ssize_t endinpos;
2339 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340 PyUnicodeObject *unicode;
2341 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002342 const unsigned char *q, *e;
2343 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002344 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002345 /* Offsets from q for retrieving byte pairs in the right order. */
2346#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2347 int ihi = 1, ilo = 0;
2348#else
2349 int ihi = 0, ilo = 1;
2350#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 PyObject *errorHandler = NULL;
2352 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353
2354 /* Note: size will always be longer than the resulting Unicode
2355 character count */
2356 unicode = _PyUnicode_New(size);
2357 if (!unicode)
2358 return NULL;
2359 if (size == 0)
2360 return (PyObject *)unicode;
2361
2362 /* Unpack UTF-16 encoded data */
2363 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002364 q = (unsigned char *)s;
2365 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366
2367 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002368 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002370 /* Check for BOM marks (U+FEFF) in the input and adjust current
2371 byte order setting accordingly. In native mode, the leading BOM
2372 mark is skipped, in all other modes, it is copied to the output
2373 stream as-is (giving a ZWNBSP character). */
2374 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002375 if (size >= 2) {
2376 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002377#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002378 if (bom == 0xFEFF) {
2379 q += 2;
2380 bo = -1;
2381 }
2382 else if (bom == 0xFFFE) {
2383 q += 2;
2384 bo = 1;
2385 }
Tim Petersced69f82003-09-16 20:30:58 +00002386#else
Walter Dörwald69652032004-09-07 20:24:22 +00002387 if (bom == 0xFEFF) {
2388 q += 2;
2389 bo = 1;
2390 }
2391 else if (bom == 0xFFFE) {
2392 q += 2;
2393 bo = -1;
2394 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002395#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002396 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398
Tim Peters772747b2001-08-09 22:21:55 +00002399 if (bo == -1) {
2400 /* force LE */
2401 ihi = 1;
2402 ilo = 0;
2403 }
2404 else if (bo == 1) {
2405 /* force BE */
2406 ihi = 0;
2407 ilo = 1;
2408 }
2409
2410 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002411 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002412 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002413 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002414 if (consumed)
2415 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002416 errmsg = "truncated data";
2417 startinpos = ((const char *)q)-starts;
2418 endinpos = ((const char *)e)-starts;
2419 goto utf16Error;
2420 /* The remaining input chars are ignored if the callback
2421 chooses to skip the input */
2422 }
2423 ch = (q[ihi] << 8) | q[ilo];
2424
Tim Peters772747b2001-08-09 22:21:55 +00002425 q += 2;
2426
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427 if (ch < 0xD800 || ch > 0xDFFF) {
2428 *p++ = ch;
2429 continue;
2430 }
2431
2432 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002433 if (q >= e) {
2434 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002435 startinpos = (((const char *)q)-2)-starts;
2436 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002437 goto utf16Error;
2438 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002439 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002440 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2441 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002442 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002443#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002444 *p++ = ch;
2445 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002446#else
2447 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002448#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002449 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002450 }
2451 else {
2452 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002453 startinpos = (((const char *)q)-4)-starts;
2454 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002455 goto utf16Error;
2456 }
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002459 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002460 startinpos = (((const char *)q)-2)-starts;
2461 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002462 /* Fall through to report the error */
2463
2464 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002465 outpos = p-PyUnicode_AS_UNICODE(unicode);
2466 if (unicode_decode_call_errorhandler(
2467 errors, &errorHandler,
2468 "utf16", errmsg,
2469 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2470 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002471 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 }
2473
2474 if (byteorder)
2475 *byteorder = bo;
2476
Walter Dörwald69652032004-09-07 20:24:22 +00002477 if (consumed)
2478 *consumed = (const char *)q-starts;
2479
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002481 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 goto onError;
2483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 Py_XDECREF(errorHandler);
2485 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 return (PyObject *)unicode;
2487
2488onError:
2489 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 Py_XDECREF(errorHandler);
2491 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 return NULL;
2493}
2494
Tim Peters772747b2001-08-09 22:21:55 +00002495PyObject *
2496PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002497 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002498 const char *errors,
2499 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500{
2501 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002502 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002503#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002504 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002505#else
2506 const int pairs = 0;
2507#endif
Tim Peters772747b2001-08-09 22:21:55 +00002508 /* Offsets from p for storing byte pairs in the right order. */
2509#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2510 int ihi = 1, ilo = 0;
2511#else
2512 int ihi = 0, ilo = 1;
2513#endif
2514
2515#define STORECHAR(CH) \
2516 do { \
2517 p[ihi] = ((CH) >> 8) & 0xff; \
2518 p[ilo] = (CH) & 0xff; \
2519 p += 2; \
2520 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002522#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002523 for (i = pairs = 0; i < size; i++)
2524 if (s[i] >= 0x10000)
2525 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002526#endif
Tim Petersced69f82003-09-16 20:30:58 +00002527 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002528 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 if (v == NULL)
2530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531
Tim Peters772747b2001-08-09 22:21:55 +00002532 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002534 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002535 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002536 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002537
2538 if (byteorder == -1) {
2539 /* force LE */
2540 ihi = 1;
2541 ilo = 0;
2542 }
2543 else if (byteorder == 1) {
2544 /* force BE */
2545 ihi = 0;
2546 ilo = 1;
2547 }
2548
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002549 while (size-- > 0) {
2550 Py_UNICODE ch = *s++;
2551 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002552#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002553 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002554 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2555 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002557#endif
Tim Peters772747b2001-08-09 22:21:55 +00002558 STORECHAR(ch);
2559 if (ch2)
2560 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002563#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564}
2565
2566PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2567{
2568 if (!PyUnicode_Check(unicode)) {
2569 PyErr_BadArgument();
2570 return NULL;
2571 }
2572 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2573 PyUnicode_GET_SIZE(unicode),
2574 NULL,
2575 0);
2576}
2577
2578/* --- Unicode Escape Codec ----------------------------------------------- */
2579
Fredrik Lundh06d12682001-01-24 07:59:11 +00002580static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002581
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002583 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 const char *errors)
2585{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002586 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002587 Py_ssize_t startinpos;
2588 Py_ssize_t endinpos;
2589 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002594 char* message;
2595 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 PyObject *errorHandler = NULL;
2597 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002598
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 /* Escaped strings will always be longer than the resulting
2600 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002601 length after conversion to the true value.
2602 (but if the error callback returns a long replacement string
2603 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 v = _PyUnicode_New(size);
2605 if (v == NULL)
2606 goto onError;
2607 if (size == 0)
2608 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002612
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 while (s < end) {
2614 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002615 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617
2618 /* Non-escape characters are interpreted as Unicode ordinals */
2619 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002620 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 continue;
2622 }
2623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 /* \ - Escapes */
2626 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002627 c = *s++;
2628 if (s > end)
2629 c = '\0'; /* Invalid after \ */
2630 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631
2632 /* \x escapes */
2633 case '\n': break;
2634 case '\\': *p++ = '\\'; break;
2635 case '\'': *p++ = '\''; break;
2636 case '\"': *p++ = '\"'; break;
2637 case 'b': *p++ = '\b'; break;
2638 case 'f': *p++ = '\014'; break; /* FF */
2639 case 't': *p++ = '\t'; break;
2640 case 'n': *p++ = '\n'; break;
2641 case 'r': *p++ = '\r'; break;
2642 case 'v': *p++ = '\013'; break; /* VT */
2643 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2644
2645 /* \OOO (octal) escapes */
2646 case '0': case '1': case '2': case '3':
2647 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002648 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002649 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002650 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002651 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002652 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002654 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655 break;
2656
Fredrik Lundhccc74732001-02-18 22:13:49 +00002657 /* hex escapes */
2658 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002660 digits = 2;
2661 message = "truncated \\xXX escape";
2662 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663
Fredrik Lundhccc74732001-02-18 22:13:49 +00002664 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002666 digits = 4;
2667 message = "truncated \\uXXXX escape";
2668 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669
Fredrik Lundhccc74732001-02-18 22:13:49 +00002670 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002671 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002672 digits = 8;
2673 message = "truncated \\UXXXXXXXX escape";
2674 hexescape:
2675 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002676 outpos = p-PyUnicode_AS_UNICODE(v);
2677 if (s+digits>end) {
2678 endinpos = size;
2679 if (unicode_decode_call_errorhandler(
2680 errors, &errorHandler,
2681 "unicodeescape", "end of string in escape sequence",
2682 starts, size, &startinpos, &endinpos, &exc, &s,
2683 (PyObject **)&v, &outpos, &p))
2684 goto onError;
2685 goto nextByte;
2686 }
2687 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002688 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002689 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 endinpos = (s+i+1)-starts;
2691 if (unicode_decode_call_errorhandler(
2692 errors, &errorHandler,
2693 "unicodeescape", message,
2694 starts, size, &startinpos, &endinpos, &exc, &s,
2695 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002696 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002697 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002698 }
2699 chr = (chr<<4) & ~0xF;
2700 if (c >= '0' && c <= '9')
2701 chr += c - '0';
2702 else if (c >= 'a' && c <= 'f')
2703 chr += 10 + c - 'a';
2704 else
2705 chr += 10 + c - 'A';
2706 }
2707 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002708 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 /* _decoding_error will have already written into the
2710 target buffer. */
2711 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002712 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002713 /* when we get here, chr is a 32-bit unicode character */
2714 if (chr <= 0xffff)
2715 /* UCS-2 character */
2716 *p++ = (Py_UNICODE) chr;
2717 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002718 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002719 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002720#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002721 *p++ = chr;
2722#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002723 chr -= 0x10000L;
2724 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002725 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002726#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002727 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 endinpos = s-starts;
2729 outpos = p-PyUnicode_AS_UNICODE(v);
2730 if (unicode_decode_call_errorhandler(
2731 errors, &errorHandler,
2732 "unicodeescape", "illegal Unicode character",
2733 starts, size, &startinpos, &endinpos, &exc, &s,
2734 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002735 goto onError;
2736 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002737 break;
2738
2739 /* \N{name} */
2740 case 'N':
2741 message = "malformed \\N character escape";
2742 if (ucnhash_CAPI == NULL) {
2743 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002744 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002745 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746 if (m == NULL)
2747 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002748 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002749 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002750 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002751 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002752 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002753 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754 if (ucnhash_CAPI == NULL)
2755 goto ucnhashError;
2756 }
2757 if (*s == '{') {
2758 const char *start = s+1;
2759 /* look for the closing brace */
2760 while (*s != '}' && s < end)
2761 s++;
2762 if (s > start && s < end && *s == '}') {
2763 /* found a name. look it up in the unicode database */
2764 message = "unknown Unicode character name";
2765 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002766 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002767 goto store;
2768 }
2769 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 endinpos = s-starts;
2771 outpos = p-PyUnicode_AS_UNICODE(v);
2772 if (unicode_decode_call_errorhandler(
2773 errors, &errorHandler,
2774 "unicodeescape", message,
2775 starts, size, &startinpos, &endinpos, &exc, &s,
2776 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002777 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002778 break;
2779
2780 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002781 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 message = "\\ at end of string";
2783 s--;
2784 endinpos = s-starts;
2785 outpos = p-PyUnicode_AS_UNICODE(v);
2786 if (unicode_decode_call_errorhandler(
2787 errors, &errorHandler,
2788 "unicodeescape", message,
2789 starts, size, &startinpos, &endinpos, &exc, &s,
2790 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002791 goto onError;
2792 }
2793 else {
2794 *p++ = '\\';
2795 *p++ = (unsigned char)s[-1];
2796 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002797 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 nextByte:
2800 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002802 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002804 Py_XDECREF(errorHandler);
2805 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002807
Fredrik Lundhccc74732001-02-18 22:13:49 +00002808ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002809 PyErr_SetString(
2810 PyExc_UnicodeError,
2811 "\\N escapes not supported (can't load unicodedata module)"
2812 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002813 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 Py_XDECREF(errorHandler);
2815 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002816 return NULL;
2817
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 Py_XDECREF(errorHandler);
2821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 return NULL;
2823}
2824
2825/* Return a Unicode-Escape string version of the Unicode object.
2826
2827 If quotes is true, the string is enclosed in u"" or u'' quotes as
2828 appropriate.
2829
2830*/
2831
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002832Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002833 Py_ssize_t size,
2834 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002835{
2836 /* like wcschr, but doesn't stop at NULL characters */
2837
2838 while (size-- > 0) {
2839 if (*s == ch)
2840 return s;
2841 s++;
2842 }
2843
2844 return NULL;
2845}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847static
2848PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 int quotes)
2851{
2852 PyObject *repr;
2853 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002855 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856
Neal Norwitz17753ec2006-08-21 22:21:19 +00002857 /* XXX(nnorwitz): rather than over-allocating, it would be
2858 better to choose a different scheme. Perhaps scan the
2859 first N-chars of the string and allocate based on that size.
2860 */
2861 /* Initial allocation is based on the longest-possible unichr
2862 escape.
2863
2864 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2865 unichr, so in this case it's the longest unichr escape. In
2866 narrow (UTF-16) builds this is five chars per source unichr
2867 since there are two unichrs in the surrogate pair, so in narrow
2868 (UTF-16) builds it's not the longest unichr escape.
2869
2870 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2871 so in the narrow (UTF-16) build case it's the longest unichr
2872 escape.
2873 */
2874
2875 repr = PyString_FromStringAndSize(NULL,
2876 2
2877#ifdef Py_UNICODE_WIDE
2878 + 10*size
2879#else
2880 + 6*size
2881#endif
2882 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 if (repr == NULL)
2884 return NULL;
2885
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002886 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887
2888 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002890 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 !findchar(s, size, '"')) ? '"' : '\'';
2892 }
2893 while (size-- > 0) {
2894 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002895
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002896 /* Escape quotes and backslashes */
2897 if ((quotes &&
2898 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899 *p++ = '\\';
2900 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002901 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002902 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002903
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002904#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002905 /* Map 21-bit characters to '\U00xxxxxx' */
2906 else if (ch >= 0x10000) {
2907 *p++ = '\\';
2908 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002909 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2910 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2911 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2912 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2913 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2914 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2915 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002916 *p++ = hexdigit[ch & 0x0000000F];
2917 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002918 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002919#else
2920 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002921 else if (ch >= 0xD800 && ch < 0xDC00) {
2922 Py_UNICODE ch2;
2923 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002924
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002925 ch2 = *s++;
2926 size--;
2927 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2928 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2929 *p++ = '\\';
2930 *p++ = 'U';
2931 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2932 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2933 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2934 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2935 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2936 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2937 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2938 *p++ = hexdigit[ucs & 0x0000000F];
2939 continue;
2940 }
2941 /* Fall through: isolated surrogates are copied as-is */
2942 s--;
2943 size++;
2944 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002945#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002946
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002948 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 *p++ = '\\';
2950 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002951 *p++ = hexdigit[(ch >> 12) & 0x000F];
2952 *p++ = hexdigit[(ch >> 8) & 0x000F];
2953 *p++ = hexdigit[(ch >> 4) & 0x000F];
2954 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002956
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002957 /* Map special whitespace to '\t', \n', '\r' */
2958 else if (ch == '\t') {
2959 *p++ = '\\';
2960 *p++ = 't';
2961 }
2962 else if (ch == '\n') {
2963 *p++ = '\\';
2964 *p++ = 'n';
2965 }
2966 else if (ch == '\r') {
2967 *p++ = '\\';
2968 *p++ = 'r';
2969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002970
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002971 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002972 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002974 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002975 *p++ = hexdigit[(ch >> 4) & 0x000F];
2976 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002977 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002978
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 /* Copy everything else as-is */
2980 else
2981 *p++ = (char) ch;
2982 }
2983 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002984 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985
2986 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002987 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 return repr;
2989}
2990
2991PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993{
2994 return unicodeescape_string(s, size, 0);
2995}
2996
2997PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2998{
2999 if (!PyUnicode_Check(unicode)) {
3000 PyErr_BadArgument();
3001 return NULL;
3002 }
3003 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3004 PyUnicode_GET_SIZE(unicode));
3005}
3006
3007/* --- Raw Unicode Escape Codec ------------------------------------------- */
3008
3009PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 const char *errors)
3012{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003014 Py_ssize_t startinpos;
3015 Py_ssize_t endinpos;
3016 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 const char *end;
3020 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 PyObject *errorHandler = NULL;
3022 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003023
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 /* Escaped strings will always be longer than the resulting
3025 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 length after conversion to the true value. (But decoding error
3027 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 v = _PyUnicode_New(size);
3029 if (v == NULL)
3030 goto onError;
3031 if (size == 0)
3032 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 end = s + size;
3035 while (s < end) {
3036 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003037 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003039 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040
3041 /* Non-escape characters are interpreted as Unicode ordinals */
3042 if (*s != '\\') {
3043 *p++ = (unsigned char)*s++;
3044 continue;
3045 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047
3048 /* \u-escapes are only interpreted iff the number of leading
3049 backslashes if odd */
3050 bs = s;
3051 for (;s < end;) {
3052 if (*s != '\\')
3053 break;
3054 *p++ = (unsigned char)*s++;
3055 }
3056 if (((s - bs) & 1) == 0 ||
3057 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003058 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 continue;
3060 }
3061 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 s++;
3064
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003065 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003067 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 endinpos = s-starts;
3071 if (unicode_decode_call_errorhandler(
3072 errors, &errorHandler,
3073 "rawunicodeescape", "truncated \\uXXXX",
3074 starts, size, &startinpos, &endinpos, &exc, &s,
3075 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
3079 x = (x<<4) & ~0xF;
3080 if (c >= '0' && c <= '9')
3081 x += c - '0';
3082 else if (c >= 'a' && c <= 'f')
3083 x += 10 + c - 'a';
3084 else
3085 x += 10 + c - 'A';
3086 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003087#ifndef Py_UNICODE_WIDE
3088 if (x > 0x10000) {
3089 if (unicode_decode_call_errorhandler(
3090 errors, &errorHandler,
3091 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3092 starts, size, &startinpos, &endinpos, &exc, &s,
3093 (PyObject **)&v, &outpos, &p))
3094 goto onError;
3095 }
3096#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 *p++ = x;
3098 nextByte:
3099 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003101 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 Py_XDECREF(errorHandler);
3104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003106
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 onError:
3108 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 Py_XDECREF(errorHandler);
3110 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 return NULL;
3112}
3113
3114PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003115 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116{
3117 PyObject *repr;
3118 char *p;
3119 char *q;
3120
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003121 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003123#ifdef Py_UNICODE_WIDE
3124 repr = PyString_FromStringAndSize(NULL, 10 * size);
3125#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003127#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 if (repr == NULL)
3129 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003130 if (size == 0)
3131 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132
3133 p = q = PyString_AS_STRING(repr);
3134 while (size-- > 0) {
3135 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003136#ifdef Py_UNICODE_WIDE
3137 /* Map 32-bit characters to '\Uxxxxxxxx' */
3138 if (ch >= 0x10000) {
3139 *p++ = '\\';
3140 *p++ = 'U';
3141 *p++ = hexdigit[(ch >> 28) & 0xf];
3142 *p++ = hexdigit[(ch >> 24) & 0xf];
3143 *p++ = hexdigit[(ch >> 20) & 0xf];
3144 *p++ = hexdigit[(ch >> 16) & 0xf];
3145 *p++ = hexdigit[(ch >> 12) & 0xf];
3146 *p++ = hexdigit[(ch >> 8) & 0xf];
3147 *p++ = hexdigit[(ch >> 4) & 0xf];
3148 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003149 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003150 else
3151#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 /* Map 16-bit characters to '\uxxxx' */
3153 if (ch >= 256) {
3154 *p++ = '\\';
3155 *p++ = 'u';
3156 *p++ = hexdigit[(ch >> 12) & 0xf];
3157 *p++ = hexdigit[(ch >> 8) & 0xf];
3158 *p++ = hexdigit[(ch >> 4) & 0xf];
3159 *p++ = hexdigit[ch & 15];
3160 }
3161 /* Copy everything else as-is */
3162 else
3163 *p++ = (char) ch;
3164 }
3165 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003166 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 return repr;
3168}
3169
3170PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3171{
3172 if (!PyUnicode_Check(unicode)) {
3173 PyErr_BadArgument();
3174 return NULL;
3175 }
3176 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3177 PyUnicode_GET_SIZE(unicode));
3178}
3179
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003180/* --- Unicode Internal Codec ------------------------------------------- */
3181
3182PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003183 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003184 const char *errors)
3185{
3186 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003187 Py_ssize_t startinpos;
3188 Py_ssize_t endinpos;
3189 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003190 PyUnicodeObject *v;
3191 Py_UNICODE *p;
3192 const char *end;
3193 const char *reason;
3194 PyObject *errorHandler = NULL;
3195 PyObject *exc = NULL;
3196
Neal Norwitzd43069c2006-01-08 01:12:10 +00003197#ifdef Py_UNICODE_WIDE
3198 Py_UNICODE unimax = PyUnicode_GetMax();
3199#endif
3200
Armin Rigo7ccbca92006-10-04 12:17:45 +00003201 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003202 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3203 if (v == NULL)
3204 goto onError;
3205 if (PyUnicode_GetSize((PyObject *)v) == 0)
3206 return (PyObject *)v;
3207 p = PyUnicode_AS_UNICODE(v);
3208 end = s + size;
3209
3210 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003211 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003212 /* We have to sanity check the raw data, otherwise doom looms for
3213 some malformed UCS-4 data. */
3214 if (
3215 #ifdef Py_UNICODE_WIDE
3216 *p > unimax || *p < 0 ||
3217 #endif
3218 end-s < Py_UNICODE_SIZE
3219 )
3220 {
3221 startinpos = s - starts;
3222 if (end-s < Py_UNICODE_SIZE) {
3223 endinpos = end-starts;
3224 reason = "truncated input";
3225 }
3226 else {
3227 endinpos = s - starts + Py_UNICODE_SIZE;
3228 reason = "illegal code point (> 0x10FFFF)";
3229 }
3230 outpos = p - PyUnicode_AS_UNICODE(v);
3231 if (unicode_decode_call_errorhandler(
3232 errors, &errorHandler,
3233 "unicode_internal", reason,
3234 starts, size, &startinpos, &endinpos, &exc, &s,
3235 (PyObject **)&v, &outpos, &p)) {
3236 goto onError;
3237 }
3238 }
3239 else {
3240 p++;
3241 s += Py_UNICODE_SIZE;
3242 }
3243 }
3244
Martin v. Löwis412fb672006-04-13 06:34:32 +00003245 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003246 goto onError;
3247 Py_XDECREF(errorHandler);
3248 Py_XDECREF(exc);
3249 return (PyObject *)v;
3250
3251 onError:
3252 Py_XDECREF(v);
3253 Py_XDECREF(errorHandler);
3254 Py_XDECREF(exc);
3255 return NULL;
3256}
3257
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258/* --- Latin-1 Codec ------------------------------------------------------ */
3259
3260PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003261 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 const char *errors)
3263{
3264 PyUnicodeObject *v;
3265 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003266
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003268 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003269 Py_UNICODE r = *(unsigned char*)s;
3270 return PyUnicode_FromUnicode(&r, 1);
3271 }
3272
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 v = _PyUnicode_New(size);
3274 if (v == NULL)
3275 goto onError;
3276 if (size == 0)
3277 return (PyObject *)v;
3278 p = PyUnicode_AS_UNICODE(v);
3279 while (size-- > 0)
3280 *p++ = (unsigned char)*s++;
3281 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003282
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 onError:
3284 Py_XDECREF(v);
3285 return NULL;
3286}
3287
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288/* create or adjust a UnicodeEncodeError */
3289static void make_encode_exception(PyObject **exceptionObject,
3290 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003291 const Py_UNICODE *unicode, Py_ssize_t size,
3292 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 if (*exceptionObject == NULL) {
3296 *exceptionObject = PyUnicodeEncodeError_Create(
3297 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 }
3299 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3301 goto onError;
3302 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3303 goto onError;
3304 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3305 goto onError;
3306 return;
3307 onError:
3308 Py_DECREF(*exceptionObject);
3309 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 }
3311}
3312
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313/* raises a UnicodeEncodeError */
3314static void raise_encode_exception(PyObject **exceptionObject,
3315 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003316 const Py_UNICODE *unicode, Py_ssize_t size,
3317 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 const char *reason)
3319{
3320 make_encode_exception(exceptionObject,
3321 encoding, unicode, size, startpos, endpos, reason);
3322 if (*exceptionObject != NULL)
3323 PyCodec_StrictErrors(*exceptionObject);
3324}
3325
3326/* error handling callback helper:
3327 build arguments, call the callback and check the arguments,
3328 put the result into newpos and return the replacement string, which
3329 has to be freed by the caller */
3330static PyObject *unicode_encode_call_errorhandler(const char *errors,
3331 PyObject **errorHandler,
3332 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3334 Py_ssize_t startpos, Py_ssize_t endpos,
3335 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003337 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338
3339 PyObject *restuple;
3340 PyObject *resunicode;
3341
3342 if (*errorHandler == NULL) {
3343 *errorHandler = PyCodec_LookupError(errors);
3344 if (*errorHandler == NULL)
3345 return NULL;
3346 }
3347
3348 make_encode_exception(exceptionObject,
3349 encoding, unicode, size, startpos, endpos, reason);
3350 if (*exceptionObject == NULL)
3351 return NULL;
3352
3353 restuple = PyObject_CallFunctionObjArgs(
3354 *errorHandler, *exceptionObject, NULL);
3355 if (restuple == NULL)
3356 return NULL;
3357 if (!PyTuple_Check(restuple)) {
3358 PyErr_Format(PyExc_TypeError, &argparse[4]);
3359 Py_DECREF(restuple);
3360 return NULL;
3361 }
3362 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3363 &resunicode, newpos)) {
3364 Py_DECREF(restuple);
3365 return NULL;
3366 }
3367 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003368 *newpos = size+*newpos;
3369 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003370 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003371 Py_DECREF(restuple);
3372 return NULL;
3373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 Py_INCREF(resunicode);
3375 Py_DECREF(restuple);
3376 return resunicode;
3377}
3378
3379static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003380 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 const char *errors,
3382 int limit)
3383{
3384 /* output object */
3385 PyObject *res;
3386 /* pointers to the beginning and end+1 of input */
3387 const Py_UNICODE *startp = p;
3388 const Py_UNICODE *endp = p + size;
3389 /* pointer to the beginning of the unencodable characters */
3390 /* const Py_UNICODE *badp = NULL; */
3391 /* pointer into the output */
3392 char *str;
3393 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003394 Py_ssize_t respos = 0;
3395 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003396 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3397 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 PyObject *errorHandler = NULL;
3399 PyObject *exc = NULL;
3400 /* the following variable is used for caching string comparisons
3401 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3402 int known_errorHandler = -1;
3403
3404 /* allocate enough for a simple encoding without
3405 replacements, if we need more, we'll resize */
3406 res = PyString_FromStringAndSize(NULL, size);
3407 if (res == NULL)
3408 goto onError;
3409 if (size == 0)
3410 return res;
3411 str = PyString_AS_STRING(res);
3412 ressize = size;
3413
3414 while (p<endp) {
3415 Py_UNICODE c = *p;
3416
3417 /* can we encode this? */
3418 if (c<limit) {
3419 /* no overflow check, because we know that the space is enough */
3420 *str++ = (char)c;
3421 ++p;
3422 }
3423 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003424 Py_ssize_t unicodepos = p-startp;
3425 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003427 Py_ssize_t repsize;
3428 Py_ssize_t newpos;
3429 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 Py_UNICODE *uni2;
3431 /* startpos for collecting unencodable chars */
3432 const Py_UNICODE *collstart = p;
3433 const Py_UNICODE *collend = p;
3434 /* find all unecodable characters */
3435 while ((collend < endp) && ((*collend)>=limit))
3436 ++collend;
3437 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3438 if (known_errorHandler==-1) {
3439 if ((errors==NULL) || (!strcmp(errors, "strict")))
3440 known_errorHandler = 1;
3441 else if (!strcmp(errors, "replace"))
3442 known_errorHandler = 2;
3443 else if (!strcmp(errors, "ignore"))
3444 known_errorHandler = 3;
3445 else if (!strcmp(errors, "xmlcharrefreplace"))
3446 known_errorHandler = 4;
3447 else
3448 known_errorHandler = 0;
3449 }
3450 switch (known_errorHandler) {
3451 case 1: /* strict */
3452 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3453 goto onError;
3454 case 2: /* replace */
3455 while (collstart++<collend)
3456 *str++ = '?'; /* fall through */
3457 case 3: /* ignore */
3458 p = collend;
3459 break;
3460 case 4: /* xmlcharrefreplace */
3461 respos = str-PyString_AS_STRING(res);
3462 /* determine replacement size (temporarily (mis)uses p) */
3463 for (p = collstart, repsize = 0; p < collend; ++p) {
3464 if (*p<10)
3465 repsize += 2+1+1;
3466 else if (*p<100)
3467 repsize += 2+2+1;
3468 else if (*p<1000)
3469 repsize += 2+3+1;
3470 else if (*p<10000)
3471 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003472#ifndef Py_UNICODE_WIDE
3473 else
3474 repsize += 2+5+1;
3475#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 else if (*p<100000)
3477 repsize += 2+5+1;
3478 else if (*p<1000000)
3479 repsize += 2+6+1;
3480 else
3481 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003482#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 }
3484 requiredsize = respos+repsize+(endp-collend);
3485 if (requiredsize > ressize) {
3486 if (requiredsize<2*ressize)
3487 requiredsize = 2*ressize;
3488 if (_PyString_Resize(&res, requiredsize))
3489 goto onError;
3490 str = PyString_AS_STRING(res) + respos;
3491 ressize = requiredsize;
3492 }
3493 /* generate replacement (temporarily (mis)uses p) */
3494 for (p = collstart; p < collend; ++p) {
3495 str += sprintf(str, "&#%d;", (int)*p);
3496 }
3497 p = collend;
3498 break;
3499 default:
3500 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3501 encoding, reason, startp, size, &exc,
3502 collstart-startp, collend-startp, &newpos);
3503 if (repunicode == NULL)
3504 goto onError;
3505 /* need more space? (at least enough for what we
3506 have+the replacement+the rest of the string, so
3507 we won't have to check space for encodable characters) */
3508 respos = str-PyString_AS_STRING(res);
3509 repsize = PyUnicode_GET_SIZE(repunicode);
3510 requiredsize = respos+repsize+(endp-collend);
3511 if (requiredsize > ressize) {
3512 if (requiredsize<2*ressize)
3513 requiredsize = 2*ressize;
3514 if (_PyString_Resize(&res, requiredsize)) {
3515 Py_DECREF(repunicode);
3516 goto onError;
3517 }
3518 str = PyString_AS_STRING(res) + respos;
3519 ressize = requiredsize;
3520 }
3521 /* check if there is anything unencodable in the replacement
3522 and copy it to the output */
3523 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3524 c = *uni2;
3525 if (c >= limit) {
3526 raise_encode_exception(&exc, encoding, startp, size,
3527 unicodepos, unicodepos+1, reason);
3528 Py_DECREF(repunicode);
3529 goto onError;
3530 }
3531 *str = (char)c;
3532 }
3533 p = startp + newpos;
3534 Py_DECREF(repunicode);
3535 }
3536 }
3537 }
3538 /* Resize if we allocated to much */
3539 respos = str-PyString_AS_STRING(res);
3540 if (respos<ressize)
3541 /* If this falls res will be NULL */
3542 _PyString_Resize(&res, respos);
3543 Py_XDECREF(errorHandler);
3544 Py_XDECREF(exc);
3545 return res;
3546
3547 onError:
3548 Py_XDECREF(res);
3549 Py_XDECREF(errorHandler);
3550 Py_XDECREF(exc);
3551 return NULL;
3552}
3553
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003555 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 const char *errors)
3557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559}
3560
3561PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3562{
3563 if (!PyUnicode_Check(unicode)) {
3564 PyErr_BadArgument();
3565 return NULL;
3566 }
3567 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3568 PyUnicode_GET_SIZE(unicode),
3569 NULL);
3570}
3571
/* --- 7-bit ASCII Codec -------------------------------------------------- */
3573
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 const char *errors)
3577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 PyUnicodeObject *v;
3580 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003581 Py_ssize_t startinpos;
3582 Py_ssize_t endinpos;
3583 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 const char *e;
3585 PyObject *errorHandler = NULL;
3586 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003587
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003589 if (size == 1 && *(unsigned char*)s < 128) {
3590 Py_UNICODE r = *(unsigned char*)s;
3591 return PyUnicode_FromUnicode(&r, 1);
3592 }
Tim Petersced69f82003-09-16 20:30:58 +00003593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 v = _PyUnicode_New(size);
3595 if (v == NULL)
3596 goto onError;
3597 if (size == 0)
3598 return (PyObject *)v;
3599 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 e = s + size;
3601 while (s < e) {
3602 register unsigned char c = (unsigned char)*s;
3603 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 ++s;
3606 }
3607 else {
3608 startinpos = s-starts;
3609 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003610 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 if (unicode_decode_call_errorhandler(
3612 errors, &errorHandler,
3613 "ascii", "ordinal not in range(128)",
3614 starts, size, &startinpos, &endinpos, &exc, &s,
3615 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003619 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003620 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003621 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_XDECREF(errorHandler);
3623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003625
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 onError:
3627 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 Py_XDECREF(errorHandler);
3629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 return NULL;
3631}
3632
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003634 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 const char *errors)
3636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638}
3639
3640PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3641{
3642 if (!PyUnicode_Check(unicode)) {
3643 PyErr_BadArgument();
3644 return NULL;
3645 }
3646 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3647 PyUnicode_GET_SIZE(unicode),
3648 NULL);
3649}
3650
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003651#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003652
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003654
/* The MBCS helpers below take `int` sizes.  When Py_ssize_t is wider than
   int, the public entry points have to process their input in several
   chunks; NEED_RETRY enables that code. */
#if SIZEOF_SSIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif
3658
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see IsDBCSLeadByteEx documentation. */

/* Return nonzero if the byte at s[offset] is a DBCS lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only reported as a lead byte if
   CharPrev() returns the byte itself, or the character before it is not a
   lead byte, or that previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
3674
3675/*
3676 * Decode MBCS string into unicode object. If 'final' is set, converts
3677 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3678 */
3679static int decode_mbcs(PyUnicodeObject **v,
3680 const char *s, /* MBCS string */
3681 int size, /* sizeof MBCS string */
3682 int final)
3683{
3684 Py_UNICODE *p;
3685 Py_ssize_t n = 0;
3686 int usize = 0;
3687
3688 assert(size >= 0);
3689
3690 /* Skip trailing lead-byte unless 'final' is set */
3691 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3692 --size;
3693
3694 /* First get the size of the result */
3695 if (size > 0) {
3696 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3697 if (usize == 0) {
3698 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3699 return -1;
3700 }
3701 }
3702
3703 if (*v == NULL) {
3704 /* Create unicode object */
3705 *v = _PyUnicode_New(usize);
3706 if (*v == NULL)
3707 return -1;
3708 }
3709 else {
3710 /* Extend unicode object */
3711 n = PyUnicode_GET_SIZE(*v);
3712 if (_PyUnicode_Resize(v, n + usize) < 0)
3713 return -1;
3714 }
3715
3716 /* Do the conversion */
3717 if (size > 0) {
3718 p = PyUnicode_AS_UNICODE(*v) + n;
3719 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3720 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3721 return -1;
3722 }
3723 }
3724
3725 return size;
3726}
3727
3728PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3729 Py_ssize_t size,
3730 const char *errors,
3731 Py_ssize_t *consumed)
3732{
3733 PyUnicodeObject *v = NULL;
3734 int done;
3735
3736 if (consumed)
3737 *consumed = 0;
3738
3739#ifdef NEED_RETRY
3740 retry:
3741 if (size > INT_MAX)
3742 done = decode_mbcs(&v, s, INT_MAX, 0);
3743 else
3744#endif
3745 done = decode_mbcs(&v, s, (int)size, !consumed);
3746
3747 if (done < 0) {
3748 Py_XDECREF(v);
3749 return NULL;
3750 }
3751
3752 if (consumed)
3753 *consumed += done;
3754
3755#ifdef NEED_RETRY
3756 if (size > INT_MAX) {
3757 s += done;
3758 size -= done;
3759 goto retry;
3760 }
3761#endif
3762
3763 return (PyObject *)v;
3764}
3765
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003766PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003767 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003768 const char *errors)
3769{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003770 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3771}
3772
3773/*
3774 * Convert unicode into string object (MBCS).
3775 * Returns 0 if succeed, -1 otherwise.
3776 */
3777static int encode_mbcs(PyObject **repr,
3778 const Py_UNICODE *p, /* unicode */
3779 int size) /* size of unicode */
3780{
3781 int mbcssize = 0;
3782 Py_ssize_t n = 0;
3783
3784 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003785
3786 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003787 if (size > 0) {
3788 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3789 if (mbcssize == 0) {
3790 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3791 return -1;
3792 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003793 }
3794
Martin v. Löwisd8251432006-06-14 05:21:04 +00003795 if (*repr == NULL) {
3796 /* Create string object */
3797 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3798 if (*repr == NULL)
3799 return -1;
3800 }
3801 else {
3802 /* Extend string object */
3803 n = PyString_Size(*repr);
3804 if (_PyString_Resize(repr, n + mbcssize) < 0)
3805 return -1;
3806 }
3807
3808 /* Do the conversion */
3809 if (size > 0) {
3810 char *s = PyString_AS_STRING(*repr) + n;
3811 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3812 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3813 return -1;
3814 }
3815 }
3816
3817 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003818}
3819
3820PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003821 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003822 const char *errors)
3823{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003824 PyObject *repr = NULL;
3825 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003826
Martin v. Löwisd8251432006-06-14 05:21:04 +00003827#ifdef NEED_RETRY
3828 retry:
3829 if (size > INT_MAX)
3830 ret = encode_mbcs(&repr, p, INT_MAX);
3831 else
3832#endif
3833 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003834
Martin v. Löwisd8251432006-06-14 05:21:04 +00003835 if (ret < 0) {
3836 Py_XDECREF(repr);
3837 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003839
3840#ifdef NEED_RETRY
3841 if (size > INT_MAX) {
3842 p += INT_MAX;
3843 size -= INT_MAX;
3844 goto retry;
3845 }
3846#endif
3847
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003848 return repr;
3849}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003850
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003851PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3852{
3853 if (!PyUnicode_Check(unicode)) {
3854 PyErr_BadArgument();
3855 return NULL;
3856 }
3857 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3858 PyUnicode_GET_SIZE(unicode),
3859 NULL);
3860}
3861
Martin v. Löwisd8251432006-06-14 05:21:04 +00003862#undef NEED_RETRY
3863
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003864#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003865
/* --- Character Mapping Codec -------------------------------------------- */
3867
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003869 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 PyObject *mapping,
3871 const char *errors)
3872{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003874 Py_ssize_t startinpos;
3875 Py_ssize_t endinpos;
3876 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878 PyUnicodeObject *v;
3879 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003880 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 PyObject *errorHandler = NULL;
3882 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003883 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003884 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 /* Default to Latin-1 */
3887 if (mapping == NULL)
3888 return PyUnicode_DecodeLatin1(s, size, errors);
3889
3890 v = _PyUnicode_New(size);
3891 if (v == NULL)
3892 goto onError;
3893 if (size == 0)
3894 return (PyObject *)v;
3895 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003897 if (PyUnicode_CheckExact(mapping)) {
3898 mapstring = PyUnicode_AS_UNICODE(mapping);
3899 maplen = PyUnicode_GET_SIZE(mapping);
3900 while (s < e) {
3901 unsigned char ch = *s;
3902 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003904 if (ch < maplen)
3905 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003907 if (x == 0xfffe) {
3908 /* undefined mapping */
3909 outpos = p-PyUnicode_AS_UNICODE(v);
3910 startinpos = s-starts;
3911 endinpos = startinpos+1;
3912 if (unicode_decode_call_errorhandler(
3913 errors, &errorHandler,
3914 "charmap", "character maps to <undefined>",
3915 starts, size, &startinpos, &endinpos, &exc, &s,
3916 (PyObject **)&v, &outpos, &p)) {
3917 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003918 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003919 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003920 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003921 *p++ = x;
3922 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003924 }
3925 else {
3926 while (s < e) {
3927 unsigned char ch = *s;
3928 PyObject *w, *x;
3929
3930 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3931 w = PyInt_FromLong((long)ch);
3932 if (w == NULL)
3933 goto onError;
3934 x = PyObject_GetItem(mapping, w);
3935 Py_DECREF(w);
3936 if (x == NULL) {
3937 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3938 /* No mapping found means: mapping is undefined. */
3939 PyErr_Clear();
3940 x = Py_None;
3941 Py_INCREF(x);
3942 } else
3943 goto onError;
3944 }
3945
3946 /* Apply mapping */
3947 if (PyInt_Check(x)) {
3948 long value = PyInt_AS_LONG(x);
3949 if (value < 0 || value > 65535) {
3950 PyErr_SetString(PyExc_TypeError,
3951 "character mapping must be in range(65536)");
3952 Py_DECREF(x);
3953 goto onError;
3954 }
3955 *p++ = (Py_UNICODE)value;
3956 }
3957 else if (x == Py_None) {
3958 /* undefined mapping */
3959 outpos = p-PyUnicode_AS_UNICODE(v);
3960 startinpos = s-starts;
3961 endinpos = startinpos+1;
3962 if (unicode_decode_call_errorhandler(
3963 errors, &errorHandler,
3964 "charmap", "character maps to <undefined>",
3965 starts, size, &startinpos, &endinpos, &exc, &s,
3966 (PyObject **)&v, &outpos, &p)) {
3967 Py_DECREF(x);
3968 goto onError;
3969 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003970 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003971 continue;
3972 }
3973 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003974 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003975
3976 if (targetsize == 1)
3977 /* 1-1 mapping */
3978 *p++ = *PyUnicode_AS_UNICODE(x);
3979
3980 else if (targetsize > 1) {
3981 /* 1-n mapping */
3982 if (targetsize > extrachars) {
3983 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3985 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003986 (targetsize << 2);
3987 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003988 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003989 if (_PyUnicode_Resize(&v,
3990 PyUnicode_GET_SIZE(v) + needed) < 0) {
3991 Py_DECREF(x);
3992 goto onError;
3993 }
3994 p = PyUnicode_AS_UNICODE(v) + oldpos;
3995 }
3996 Py_UNICODE_COPY(p,
3997 PyUnicode_AS_UNICODE(x),
3998 targetsize);
3999 p += targetsize;
4000 extrachars -= targetsize;
4001 }
4002 /* 1-0 mapping: skip the character */
4003 }
4004 else {
4005 /* wrong return value */
4006 PyErr_SetString(PyExc_TypeError,
4007 "character mapping must return integer, None or unicode");
4008 Py_DECREF(x);
4009 goto onError;
4010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004012 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 }
4015 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004016 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 Py_XDECREF(errorHandler);
4019 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004021
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 Py_XDECREF(errorHandler);
4024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 Py_XDECREF(v);
4026 return NULL;
4027}
4028
Martin v. Löwis3f767792006-06-04 19:36:28 +00004029/* Charmap encoding: the lookup table */
4030
4031struct encoding_map{
4032 PyObject_HEAD
4033 unsigned char level1[32];
4034 int count2, count3;
4035 unsigned char level23[1];
4036};
4037
4038static PyObject*
4039encoding_map_size(PyObject *obj, PyObject* args)
4040{
4041 struct encoding_map *map = (struct encoding_map*)obj;
4042 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4043 128*map->count3);
4044}
4045
4046static PyMethodDef encoding_map_methods[] = {
4047 {"size", encoding_map_size, METH_NOARGS,
4048 PyDoc_STR("Return the size (in bytes) of this object") },
4049 { 0 }
4050};
4051
4052static void
4053encoding_map_dealloc(PyObject* o)
4054{
4055 PyObject_FREE(o);
4056}
4057
4058static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004059 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004060 "EncodingMap", /*tp_name*/
4061 sizeof(struct encoding_map), /*tp_basicsize*/
4062 0, /*tp_itemsize*/
4063 /* methods */
4064 encoding_map_dealloc, /*tp_dealloc*/
4065 0, /*tp_print*/
4066 0, /*tp_getattr*/
4067 0, /*tp_setattr*/
4068 0, /*tp_compare*/
4069 0, /*tp_repr*/
4070 0, /*tp_as_number*/
4071 0, /*tp_as_sequence*/
4072 0, /*tp_as_mapping*/
4073 0, /*tp_hash*/
4074 0, /*tp_call*/
4075 0, /*tp_str*/
4076 0, /*tp_getattro*/
4077 0, /*tp_setattro*/
4078 0, /*tp_as_buffer*/
4079 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4080 0, /*tp_doc*/
4081 0, /*tp_traverse*/
4082 0, /*tp_clear*/
4083 0, /*tp_richcompare*/
4084 0, /*tp_weaklistoffset*/
4085 0, /*tp_iter*/
4086 0, /*tp_iternext*/
4087 encoding_map_methods, /*tp_methods*/
4088 0, /*tp_members*/
4089 0, /*tp_getset*/
4090 0, /*tp_base*/
4091 0, /*tp_dict*/
4092 0, /*tp_descr_get*/
4093 0, /*tp_descr_set*/
4094 0, /*tp_dictoffset*/
4095 0, /*tp_init*/
4096 0, /*tp_alloc*/
4097 0, /*tp_new*/
4098 0, /*tp_free*/
4099 0, /*tp_is_gc*/
4100};
4101
4102PyObject*
4103PyUnicode_BuildEncodingMap(PyObject* string)
4104{
4105 Py_UNICODE *decode;
4106 PyObject *result;
4107 struct encoding_map *mresult;
4108 int i;
4109 int need_dict = 0;
4110 unsigned char level1[32];
4111 unsigned char level2[512];
4112 unsigned char *mlevel1, *mlevel2, *mlevel3;
4113 int count2 = 0, count3 = 0;
4114
4115 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4116 PyErr_BadArgument();
4117 return NULL;
4118 }
4119 decode = PyUnicode_AS_UNICODE(string);
4120 memset(level1, 0xFF, sizeof level1);
4121 memset(level2, 0xFF, sizeof level2);
4122
4123 /* If there isn't a one-to-one mapping of NULL to \0,
4124 or if there are non-BMP characters, we need to use
4125 a mapping dictionary. */
4126 if (decode[0] != 0)
4127 need_dict = 1;
4128 for (i = 1; i < 256; i++) {
4129 int l1, l2;
4130 if (decode[i] == 0
4131 #ifdef Py_UNICODE_WIDE
4132 || decode[i] > 0xFFFF
4133 #endif
4134 ) {
4135 need_dict = 1;
4136 break;
4137 }
4138 if (decode[i] == 0xFFFE)
4139 /* unmapped character */
4140 continue;
4141 l1 = decode[i] >> 11;
4142 l2 = decode[i] >> 7;
4143 if (level1[l1] == 0xFF)
4144 level1[l1] = count2++;
4145 if (level2[l2] == 0xFF)
4146 level2[l2] = count3++;
4147 }
4148
4149 if (count2 >= 0xFF || count3 >= 0xFF)
4150 need_dict = 1;
4151
4152 if (need_dict) {
4153 PyObject *result = PyDict_New();
4154 PyObject *key, *value;
4155 if (!result)
4156 return NULL;
4157 for (i = 0; i < 256; i++) {
4158 key = value = NULL;
4159 key = PyInt_FromLong(decode[i]);
4160 value = PyInt_FromLong(i);
4161 if (!key || !value)
4162 goto failed1;
4163 if (PyDict_SetItem(result, key, value) == -1)
4164 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004165 Py_DECREF(key);
4166 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004167 }
4168 return result;
4169 failed1:
4170 Py_XDECREF(key);
4171 Py_XDECREF(value);
4172 Py_DECREF(result);
4173 return NULL;
4174 }
4175
4176 /* Create a three-level trie */
4177 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4178 16*count2 + 128*count3 - 1);
4179 if (!result)
4180 return PyErr_NoMemory();
4181 PyObject_Init(result, &EncodingMapType);
4182 mresult = (struct encoding_map*)result;
4183 mresult->count2 = count2;
4184 mresult->count3 = count3;
4185 mlevel1 = mresult->level1;
4186 mlevel2 = mresult->level23;
4187 mlevel3 = mresult->level23 + 16*count2;
4188 memcpy(mlevel1, level1, 32);
4189 memset(mlevel2, 0xFF, 16*count2);
4190 memset(mlevel3, 0, 128*count3);
4191 count3 = 0;
4192 for (i = 1; i < 256; i++) {
4193 int o1, o2, o3, i2, i3;
4194 if (decode[i] == 0xFFFE)
4195 /* unmapped character */
4196 continue;
4197 o1 = decode[i]>>11;
4198 o2 = (decode[i]>>7) & 0xF;
4199 i2 = 16*mlevel1[o1] + o2;
4200 if (mlevel2[i2] == 0xFF)
4201 mlevel2[i2] = count3++;
4202 o3 = decode[i] & 0x7F;
4203 i3 = 128*mlevel2[i2] + o3;
4204 mlevel3[i3] = i;
4205 }
4206 return result;
4207}
4208
4209static int
4210encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4211{
4212 struct encoding_map *map = (struct encoding_map*)mapping;
4213 int l1 = c>>11;
4214 int l2 = (c>>7) & 0xF;
4215 int l3 = c & 0x7F;
4216 int i;
4217
4218#ifdef Py_UNICODE_WIDE
4219 if (c > 0xFFFF) {
4220 return -1;
4221 }
4222#endif
4223 if (c == 0)
4224 return 0;
4225 /* level 1*/
4226 i = map->level1[l1];
4227 if (i == 0xFF) {
4228 return -1;
4229 }
4230 /* level 2*/
4231 i = map->level23[16*i+l2];
4232 if (i == 0xFF) {
4233 return -1;
4234 }
4235 /* level 3 */
4236 i = map->level23[16*map->count2 + 128*i + l3];
4237 if (i == 0) {
4238 return -1;
4239 }
4240 return i;
4241}
4242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243/* Lookup the character ch in the mapping. If the character
4244 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004245 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 PyObject *w = PyInt_FromLong((long)c);
4249 PyObject *x;
4250
4251 if (w == NULL)
4252 return NULL;
4253 x = PyObject_GetItem(mapping, w);
4254 Py_DECREF(w);
4255 if (x == NULL) {
4256 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4257 /* No mapping found means: mapping is undefined. */
4258 PyErr_Clear();
4259 x = Py_None;
4260 Py_INCREF(x);
4261 return x;
4262 } else
4263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004265 else if (x == Py_None)
4266 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 else if (PyInt_Check(x)) {
4268 long value = PyInt_AS_LONG(x);
4269 if (value < 0 || value > 255) {
4270 PyErr_SetString(PyExc_TypeError,
4271 "character mapping must be in range(256)");
4272 Py_DECREF(x);
4273 return NULL;
4274 }
4275 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 else if (PyString_Check(x))
4278 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 /* wrong return value */
4281 PyErr_SetString(PyExc_TypeError,
4282 "character mapping must return integer, None or str");
4283 Py_DECREF(x);
4284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 }
4286}
4287
Martin v. Löwis3f767792006-06-04 19:36:28 +00004288static int
4289charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4290{
4291 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4292 /* exponentially overallocate to minimize reallocations */
4293 if (requiredsize < 2*outsize)
4294 requiredsize = 2*outsize;
4295 if (_PyString_Resize(outobj, requiredsize)) {
4296 return 0;
4297 }
4298 return 1;
4299}
4300
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304/* lookup the character, put the result in the output string and adjust
4305 various state variables. Reallocate the output string if not enough
4306 space is available. Return a new reference to the object that
4307 was put in the output buffer, or Py_None, if the mapping was undefined
4308 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004309 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004311charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004312 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004314 PyObject *rep;
4315 char *outstart;
4316 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317
Christian Heimese93237d2007-12-19 02:37:44 +00004318 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004319 int res = encoding_map_lookup(c, mapping);
4320 Py_ssize_t requiredsize = *outpos+1;
4321 if (res == -1)
4322 return enc_FAILED;
4323 if (outsize<requiredsize)
4324 if (!charmapencode_resize(outobj, outpos, requiredsize))
4325 return enc_EXCEPTION;
4326 outstart = PyString_AS_STRING(*outobj);
4327 outstart[(*outpos)++] = (char)res;
4328 return enc_SUCCESS;
4329 }
4330
4331 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004333 return enc_EXCEPTION;
4334 else if (rep==Py_None) {
4335 Py_DECREF(rep);
4336 return enc_FAILED;
4337 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004339 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004340 if (outsize<requiredsize)
4341 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004343 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004345 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4347 }
4348 else {
4349 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4351 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004352 if (outsize<requiredsize)
4353 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004355 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004357 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 memcpy(outstart + *outpos, repchars, repsize);
4359 *outpos += repsize;
4360 }
4361 }
Georg Brandl9f167602006-06-04 21:46:16 +00004362 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004363 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364}
4365
4366/* handle an error in PyUnicode_EncodeCharmap
4367 Return 0 on success, -1 on error */
4368static
4369int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004370 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004372 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004373 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374{
4375 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004376 Py_ssize_t repsize;
4377 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 Py_UNICODE *uni2;
4379 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004380 Py_ssize_t collstartpos = *inpos;
4381 Py_ssize_t collendpos = *inpos+1;
4382 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 char *encoding = "charmap";
4384 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004385 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 /* find all unencodable characters */
4388 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004389 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004390 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004391 int res = encoding_map_lookup(p[collendpos], mapping);
4392 if (res != -1)
4393 break;
4394 ++collendpos;
4395 continue;
4396 }
4397
4398 rep = charmapencode_lookup(p[collendpos], mapping);
4399 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004401 else if (rep!=Py_None) {
4402 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 break;
4404 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 ++collendpos;
4407 }
4408 /* cache callback name lookup
4409 * (if not done yet, i.e. it's the first error) */
4410 if (*known_errorHandler==-1) {
4411 if ((errors==NULL) || (!strcmp(errors, "strict")))
4412 *known_errorHandler = 1;
4413 else if (!strcmp(errors, "replace"))
4414 *known_errorHandler = 2;
4415 else if (!strcmp(errors, "ignore"))
4416 *known_errorHandler = 3;
4417 else if (!strcmp(errors, "xmlcharrefreplace"))
4418 *known_errorHandler = 4;
4419 else
4420 *known_errorHandler = 0;
4421 }
4422 switch (*known_errorHandler) {
4423 case 1: /* strict */
4424 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4425 return -1;
4426 case 2: /* replace */
4427 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4428 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004429 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 return -1;
4431 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004432 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4434 return -1;
4435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 }
4437 /* fall through */
4438 case 3: /* ignore */
4439 *inpos = collendpos;
4440 break;
4441 case 4: /* xmlcharrefreplace */
4442 /* generate replacement (temporarily (mis)uses p) */
4443 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4444 char buffer[2+29+1+1];
4445 char *cp;
4446 sprintf(buffer, "&#%d;", (int)p[collpos]);
4447 for (cp = buffer; *cp; ++cp) {
4448 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004449 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004451 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4453 return -1;
4454 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 }
4456 }
4457 *inpos = collendpos;
4458 break;
4459 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004460 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 encoding, reason, p, size, exceptionObject,
4462 collstartpos, collendpos, &newpos);
4463 if (repunicode == NULL)
4464 return -1;
4465 /* generate replacement */
4466 repsize = PyUnicode_GET_SIZE(repunicode);
4467 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4468 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004469 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 return -1;
4471 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004472 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4475 return -1;
4476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 }
4478 *inpos = newpos;
4479 Py_DECREF(repunicode);
4480 }
4481 return 0;
4482}
4483
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004485 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 PyObject *mapping,
4487 const char *errors)
4488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 /* output object */
4490 PyObject *res = NULL;
4491 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004492 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004494 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 PyObject *errorHandler = NULL;
4496 PyObject *exc = NULL;
4497 /* the following variable is used for caching string comparisons
4498 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4499 * 3=ignore, 4=xmlcharrefreplace */
4500 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501
4502 /* Default to Latin-1 */
4503 if (mapping == NULL)
4504 return PyUnicode_EncodeLatin1(p, size, errors);
4505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 /* allocate enough for a simple encoding without
4507 replacements, if we need more, we'll resize */
4508 res = PyString_FromStringAndSize(NULL, size);
4509 if (res == NULL)
4510 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004511 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 while (inpos<size) {
4515 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004516 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4517 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004519 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 if (charmap_encoding_error(p, size, &inpos, mapping,
4521 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004522 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004523 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004524 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 else
4528 /* done with this character => adjust input position */
4529 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 /* Resize if we allocated to much */
4533 if (respos<PyString_GET_SIZE(res)) {
4534 if (_PyString_Resize(&res, respos))
4535 goto onError;
4536 }
4537 Py_XDECREF(exc);
4538 Py_XDECREF(errorHandler);
4539 return res;
4540
4541 onError:
4542 Py_XDECREF(res);
4543 Py_XDECREF(exc);
4544 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 return NULL;
4546}
4547
4548PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4549 PyObject *mapping)
4550{
4551 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4552 PyErr_BadArgument();
4553 return NULL;
4554 }
4555 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4556 PyUnicode_GET_SIZE(unicode),
4557 mapping,
4558 NULL);
4559}
4560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561/* create or adjust a UnicodeTranslateError */
4562static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 const Py_UNICODE *unicode, Py_ssize_t size,
4564 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 if (*exceptionObject == NULL) {
4568 *exceptionObject = PyUnicodeTranslateError_Create(
4569 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
4571 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4573 goto onError;
4574 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4575 goto onError;
4576 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4577 goto onError;
4578 return;
4579 onError:
4580 Py_DECREF(*exceptionObject);
4581 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 }
4583}
4584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585/* raises a UnicodeTranslateError */
4586static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004587 const Py_UNICODE *unicode, Py_ssize_t size,
4588 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 const char *reason)
4590{
4591 make_translate_exception(exceptionObject,
4592 unicode, size, startpos, endpos, reason);
4593 if (*exceptionObject != NULL)
4594 PyCodec_StrictErrors(*exceptionObject);
4595}
4596
4597/* error handling callback helper:
4598 build arguments, call the callback and check the arguments,
4599 put the result into newpos and return the replacement string, which
4600 has to be freed by the caller */
4601static PyObject *unicode_translate_call_errorhandler(const char *errors,
4602 PyObject **errorHandler,
4603 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004604 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4605 Py_ssize_t startpos, Py_ssize_t endpos,
4606 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004608 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609
Martin v. Löwis412fb672006-04-13 06:34:32 +00004610 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 PyObject *restuple;
4612 PyObject *resunicode;
4613
4614 if (*errorHandler == NULL) {
4615 *errorHandler = PyCodec_LookupError(errors);
4616 if (*errorHandler == NULL)
4617 return NULL;
4618 }
4619
4620 make_translate_exception(exceptionObject,
4621 unicode, size, startpos, endpos, reason);
4622 if (*exceptionObject == NULL)
4623 return NULL;
4624
4625 restuple = PyObject_CallFunctionObjArgs(
4626 *errorHandler, *exceptionObject, NULL);
4627 if (restuple == NULL)
4628 return NULL;
4629 if (!PyTuple_Check(restuple)) {
4630 PyErr_Format(PyExc_TypeError, &argparse[4]);
4631 Py_DECREF(restuple);
4632 return NULL;
4633 }
4634 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004635 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 Py_DECREF(restuple);
4637 return NULL;
4638 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004639 if (i_newpos<0)
4640 *newpos = size+i_newpos;
4641 else
4642 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004643 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004644 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004645 Py_DECREF(restuple);
4646 return NULL;
4647 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 Py_INCREF(resunicode);
4649 Py_DECREF(restuple);
4650 return resunicode;
4651}
4652
4653/* Lookup the character ch in the mapping and put the result in result,
4654 which must be decrefed by the caller.
4655 Return 0 on success, -1 on error */
4656static
4657int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4658{
4659 PyObject *w = PyInt_FromLong((long)c);
4660 PyObject *x;
4661
4662 if (w == NULL)
4663 return -1;
4664 x = PyObject_GetItem(mapping, w);
4665 Py_DECREF(w);
4666 if (x == NULL) {
4667 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4668 /* No mapping found means: use 1:1 mapping. */
4669 PyErr_Clear();
4670 *result = NULL;
4671 return 0;
4672 } else
4673 return -1;
4674 }
4675 else if (x == Py_None) {
4676 *result = x;
4677 return 0;
4678 }
4679 else if (PyInt_Check(x)) {
4680 long value = PyInt_AS_LONG(x);
4681 long max = PyUnicode_GetMax();
4682 if (value < 0 || value > max) {
4683 PyErr_Format(PyExc_TypeError,
4684 "character mapping must be in range(0x%lx)", max+1);
4685 Py_DECREF(x);
4686 return -1;
4687 }
4688 *result = x;
4689 return 0;
4690 }
4691 else if (PyUnicode_Check(x)) {
4692 *result = x;
4693 return 0;
4694 }
4695 else {
4696 /* wrong return value */
4697 PyErr_SetString(PyExc_TypeError,
4698 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004699 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 return -1;
4701 }
4702}
4703/* ensure that *outobj is at least requiredsize characters long,
4704if not reallocate and adjust various state variables.
4705Return 0 on success, -1 on error */
4706static
Walter Dörwald4894c302003-10-24 14:25:28 +00004707int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004708 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004710 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004711 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004713 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004715 if (requiredsize < 2 * oldsize)
4716 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004717 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 return -1;
4719 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 }
4721 return 0;
4722}
4723/* lookup the character, put the result in the output string and adjust
4724 various state variables. Return a new reference to the object that
4725 was put in the output buffer in *result, or Py_None, if the mapping was
4726 undefined (in which case no character was written).
4727 The called must decref result.
4728 Return 0 on success, -1 on error. */
4729static
Walter Dörwald4894c302003-10-24 14:25:28 +00004730int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004731 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004732 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733{
Walter Dörwald4894c302003-10-24 14:25:28 +00004734 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 return -1;
4736 if (*res==NULL) {
4737 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004738 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 }
4740 else if (*res==Py_None)
4741 ;
4742 else if (PyInt_Check(*res)) {
4743 /* no overflow check, because we know that the space is enough */
4744 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4745 }
4746 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 if (repsize==1) {
4749 /* no overflow check, because we know that the space is enough */
4750 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4751 }
4752 else if (repsize!=0) {
4753 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004754 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004755 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004756 repsize - 1;
4757 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 return -1;
4759 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4760 *outp += repsize;
4761 }
4762 }
4763 else
4764 return -1;
4765 return 0;
4766}
4767
4768PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 PyObject *mapping,
4771 const char *errors)
4772{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 /* output object */
4774 PyObject *res = NULL;
4775 /* pointers to the beginning and end+1 of input */
4776 const Py_UNICODE *startp = p;
4777 const Py_UNICODE *endp = p + size;
4778 /* pointer into the output */
4779 Py_UNICODE *str;
4780 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 char *reason = "character maps to <undefined>";
4783 PyObject *errorHandler = NULL;
4784 PyObject *exc = NULL;
4785 /* the following variable is used for caching string comparisons
4786 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4787 * 3=ignore, 4=xmlcharrefreplace */
4788 int known_errorHandler = -1;
4789
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 if (mapping == NULL) {
4791 PyErr_BadArgument();
4792 return NULL;
4793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794
4795 /* allocate enough for a simple 1:1 translation without
4796 replacements, if we need more, we'll resize */
4797 res = PyUnicode_FromUnicode(NULL, size);
4798 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004799 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 return res;
4802 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 while (p<endp) {
4805 /* try to encode it */
4806 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004807 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 goto onError;
4810 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004811 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 if (x!=Py_None) /* it worked => adjust input pointer */
4813 ++p;
4814 else { /* untranslatable character */
4815 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004816 Py_ssize_t repsize;
4817 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 Py_UNICODE *uni2;
4819 /* startpos for collecting untranslatable chars */
4820 const Py_UNICODE *collstart = p;
4821 const Py_UNICODE *collend = p+1;
4822 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 /* find all untranslatable characters */
4825 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004826 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 goto onError;
4828 Py_XDECREF(x);
4829 if (x!=Py_None)
4830 break;
4831 ++collend;
4832 }
4833 /* cache callback name lookup
4834 * (if not done yet, i.e. it's the first error) */
4835 if (known_errorHandler==-1) {
4836 if ((errors==NULL) || (!strcmp(errors, "strict")))
4837 known_errorHandler = 1;
4838 else if (!strcmp(errors, "replace"))
4839 known_errorHandler = 2;
4840 else if (!strcmp(errors, "ignore"))
4841 known_errorHandler = 3;
4842 else if (!strcmp(errors, "xmlcharrefreplace"))
4843 known_errorHandler = 4;
4844 else
4845 known_errorHandler = 0;
4846 }
4847 switch (known_errorHandler) {
4848 case 1: /* strict */
4849 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4850 goto onError;
4851 case 2: /* replace */
4852 /* No need to check for space, this is a 1:1 replacement */
4853 for (coll = collstart; coll<collend; ++coll)
4854 *str++ = '?';
4855 /* fall through */
4856 case 3: /* ignore */
4857 p = collend;
4858 break;
4859 case 4: /* xmlcharrefreplace */
4860 /* generate replacement (temporarily (mis)uses p) */
4861 for (p = collstart; p < collend; ++p) {
4862 char buffer[2+29+1+1];
4863 char *cp;
4864 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004865 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4867 goto onError;
4868 for (cp = buffer; *cp; ++cp)
4869 *str++ = *cp;
4870 }
4871 p = collend;
4872 break;
4873 default:
4874 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4875 reason, startp, size, &exc,
4876 collstart-startp, collend-startp, &newpos);
4877 if (repunicode == NULL)
4878 goto onError;
4879 /* generate replacement */
4880 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004881 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4883 Py_DECREF(repunicode);
4884 goto onError;
4885 }
4886 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4887 *str++ = *uni2;
4888 p = startp + newpos;
4889 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 }
4891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 /* Resize if we allocated to much */
4894 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004895 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004896 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004897 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 }
4899 Py_XDECREF(exc);
4900 Py_XDECREF(errorHandler);
4901 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 onError:
4904 Py_XDECREF(res);
4905 Py_XDECREF(exc);
4906 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 return NULL;
4908}
4909
4910PyObject *PyUnicode_Translate(PyObject *str,
4911 PyObject *mapping,
4912 const char *errors)
4913{
4914 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004915
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 str = PyUnicode_FromObject(str);
4917 if (str == NULL)
4918 goto onError;
4919 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4920 PyUnicode_GET_SIZE(str),
4921 mapping,
4922 errors);
4923 Py_DECREF(str);
4924 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004925
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 onError:
4927 Py_XDECREF(str);
4928 return NULL;
4929}
Tim Petersced69f82003-09-16 20:30:58 +00004930
Guido van Rossum9e896b32000-04-05 20:11:21 +00004931/* --- Decimal Encoder ---------------------------------------------------- */
4932
4933int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004934 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004935 char *output,
4936 const char *errors)
4937{
4938 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 PyObject *errorHandler = NULL;
4940 PyObject *exc = NULL;
4941 const char *encoding = "decimal";
4942 const char *reason = "invalid decimal Unicode string";
4943 /* the following variable is used for caching string comparisons
4944 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4945 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004946
4947 if (output == NULL) {
4948 PyErr_BadArgument();
4949 return -1;
4950 }
4951
4952 p = s;
4953 end = s + length;
4954 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004956 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004958 Py_ssize_t repsize;
4959 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004960 Py_UNICODE *uni2;
4961 Py_UNICODE *collstart;
4962 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004963
Guido van Rossum9e896b32000-04-05 20:11:21 +00004964 if (Py_UNICODE_ISSPACE(ch)) {
4965 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004967 continue;
4968 }
4969 decimal = Py_UNICODE_TODECIMAL(ch);
4970 if (decimal >= 0) {
4971 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004972 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004973 continue;
4974 }
Guido van Rossumba477042000-04-06 18:18:10 +00004975 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004976 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004978 continue;
4979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980 /* All other characters are considered unencodable */
4981 collstart = p;
4982 collend = p+1;
4983 while (collend < end) {
4984 if ((0 < *collend && *collend < 256) ||
4985 !Py_UNICODE_ISSPACE(*collend) ||
4986 Py_UNICODE_TODECIMAL(*collend))
4987 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004988 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 /* cache callback name lookup
4990 * (if not done yet, i.e. it's the first error) */
4991 if (known_errorHandler==-1) {
4992 if ((errors==NULL) || (!strcmp(errors, "strict")))
4993 known_errorHandler = 1;
4994 else if (!strcmp(errors, "replace"))
4995 known_errorHandler = 2;
4996 else if (!strcmp(errors, "ignore"))
4997 known_errorHandler = 3;
4998 else if (!strcmp(errors, "xmlcharrefreplace"))
4999 known_errorHandler = 4;
5000 else
5001 known_errorHandler = 0;
5002 }
5003 switch (known_errorHandler) {
5004 case 1: /* strict */
5005 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5006 goto onError;
5007 case 2: /* replace */
5008 for (p = collstart; p < collend; ++p)
5009 *output++ = '?';
5010 /* fall through */
5011 case 3: /* ignore */
5012 p = collend;
5013 break;
5014 case 4: /* xmlcharrefreplace */
5015 /* generate replacement (temporarily (mis)uses p) */
5016 for (p = collstart; p < collend; ++p)
5017 output += sprintf(output, "&#%d;", (int)*p);
5018 p = collend;
5019 break;
5020 default:
5021 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5022 encoding, reason, s, length, &exc,
5023 collstart-s, collend-s, &newpos);
5024 if (repunicode == NULL)
5025 goto onError;
5026 /* generate replacement */
5027 repsize = PyUnicode_GET_SIZE(repunicode);
5028 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5029 Py_UNICODE ch = *uni2;
5030 if (Py_UNICODE_ISSPACE(ch))
5031 *output++ = ' ';
5032 else {
5033 decimal = Py_UNICODE_TODECIMAL(ch);
5034 if (decimal >= 0)
5035 *output++ = '0' + decimal;
5036 else if (0 < ch && ch < 256)
5037 *output++ = (char)ch;
5038 else {
5039 Py_DECREF(repunicode);
5040 raise_encode_exception(&exc, encoding,
5041 s, length, collstart-s, collend-s, reason);
5042 goto onError;
5043 }
5044 }
5045 }
5046 p = s + newpos;
5047 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005048 }
5049 }
5050 /* 0-terminate the output string */
5051 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005052 Py_XDECREF(exc);
5053 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005054 return 0;
5055
5056 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 Py_XDECREF(exc);
5058 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005059 return -1;
5060}
5061
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062/* --- Helpers ------------------------------------------------------------ */
5063
Eric Smitha9f7d622008-02-17 19:46:49 +00005064#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005065
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005066#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005067
Fredrik Lundha50d2012006-05-26 17:04:58 +00005068#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005069
5070#include "stringlib/count.h"
5071#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005072#include "stringlib/partition.h"
5073
/* Helper macro that normalises the slice bounds held in the local
   variables `start` and `end` against (obj)->length: a negative value is
   taken relative to the length and then clamped to 0, and `end` is
   additionally clamped to the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5086
Martin v. Löwis18e16552006-02-15 17:27:45 +00005087Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005088 PyObject *substr,
5089 Py_ssize_t start,
5090 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005092 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005093 PyUnicodeObject* str_obj;
5094 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005095
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005096 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5097 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005099 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5100 if (!sub_obj) {
5101 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 return -1;
5103 }
Tim Petersced69f82003-09-16 20:30:58 +00005104
Fredrik Lundhc8162812006-05-26 19:33:03 +00005105 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005106
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005107 result = stringlib_count(
5108 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5109 );
5110
5111 Py_DECREF(sub_obj);
5112 Py_DECREF(str_obj);
5113
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 return result;
5115}
5116
Martin v. Löwis18e16552006-02-15 17:27:45 +00005117Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005118 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005119 Py_ssize_t start,
5120 Py_ssize_t end,
5121 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005123 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005124
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005125 str = PyUnicode_FromObject(str);
5126 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005127 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005128 sub = PyUnicode_FromObject(sub);
5129 if (!sub) {
5130 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005131 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 }
Tim Petersced69f82003-09-16 20:30:58 +00005133
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005134 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005135 result = stringlib_find_slice(
5136 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5137 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5138 start, end
5139 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005140 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005141 result = stringlib_rfind_slice(
5142 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5143 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5144 start, end
5145 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005146
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005147 Py_DECREF(str);
5148 Py_DECREF(sub);
5149
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 return result;
5151}
5152
Tim Petersced69f82003-09-16 20:30:58 +00005153static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154int tailmatch(PyUnicodeObject *self,
5155 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005156 Py_ssize_t start,
5157 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 int direction)
5159{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 if (substring->length == 0)
5161 return 1;
5162
Fredrik Lundhc8162812006-05-26 19:33:03 +00005163 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164
5165 end -= substring->length;
5166 if (end < start)
5167 return 0;
5168
5169 if (direction > 0) {
5170 if (Py_UNICODE_MATCH(self, end, substring))
5171 return 1;
5172 } else {
5173 if (Py_UNICODE_MATCH(self, start, substring))
5174 return 1;
5175 }
5176
5177 return 0;
5178}
5179
Martin v. Löwis18e16552006-02-15 17:27:45 +00005180Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005182 Py_ssize_t start,
5183 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 int direction)
5185{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005186 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005187
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 str = PyUnicode_FromObject(str);
5189 if (str == NULL)
5190 return -1;
5191 substr = PyUnicode_FromObject(substr);
5192 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005193 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 return -1;
5195 }
Tim Petersced69f82003-09-16 20:30:58 +00005196
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 result = tailmatch((PyUnicodeObject *)str,
5198 (PyUnicodeObject *)substr,
5199 start, end, direction);
5200 Py_DECREF(str);
5201 Py_DECREF(substr);
5202 return result;
5203}
5204
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205/* Apply fixfct filter to the Unicode object self and return a
5206 reference to the modified object */
5207
Tim Petersced69f82003-09-16 20:30:58 +00005208static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209PyObject *fixup(PyUnicodeObject *self,
5210 int (*fixfct)(PyUnicodeObject *s))
5211{
5212
5213 PyUnicodeObject *u;
5214
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005215 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 if (u == NULL)
5217 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005218
5219 Py_UNICODE_COPY(u->str, self->str, self->length);
5220
Tim Peters7a29bd52001-09-12 03:03:31 +00005221 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 /* fixfct should return TRUE if it modified the buffer. If
5223 FALSE, return a reference to the original buffer instead
5224 (to save space, not time) */
5225 Py_INCREF(self);
5226 Py_DECREF(u);
5227 return (PyObject*) self;
5228 }
5229 return (PyObject*) u;
5230}
5231
Tim Petersced69f82003-09-16 20:30:58 +00005232static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233int fixupper(PyUnicodeObject *self)
5234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 Py_UNICODE *s = self->str;
5237 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005238
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 while (len-- > 0) {
5240 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005241
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 ch = Py_UNICODE_TOUPPER(*s);
5243 if (ch != *s) {
5244 status = 1;
5245 *s = ch;
5246 }
5247 s++;
5248 }
5249
5250 return status;
5251}
5252
Tim Petersced69f82003-09-16 20:30:58 +00005253static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254int fixlower(PyUnicodeObject *self)
5255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005256 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 Py_UNICODE *s = self->str;
5258 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005259
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 while (len-- > 0) {
5261 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005262
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 ch = Py_UNICODE_TOLOWER(*s);
5264 if (ch != *s) {
5265 status = 1;
5266 *s = ch;
5267 }
5268 s++;
5269 }
5270
5271 return status;
5272}
5273
Tim Petersced69f82003-09-16 20:30:58 +00005274static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275int fixswapcase(PyUnicodeObject *self)
5276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005277 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 Py_UNICODE *s = self->str;
5279 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 while (len-- > 0) {
5282 if (Py_UNICODE_ISUPPER(*s)) {
5283 *s = Py_UNICODE_TOLOWER(*s);
5284 status = 1;
5285 } else if (Py_UNICODE_ISLOWER(*s)) {
5286 *s = Py_UNICODE_TOUPPER(*s);
5287 status = 1;
5288 }
5289 s++;
5290 }
5291
5292 return status;
5293}
5294
Tim Petersced69f82003-09-16 20:30:58 +00005295static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296int fixcapitalize(PyUnicodeObject *self)
5297{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005298 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005299 Py_UNICODE *s = self->str;
5300 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005301
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005302 if (len == 0)
5303 return 0;
5304 if (Py_UNICODE_ISLOWER(*s)) {
5305 *s = Py_UNICODE_TOUPPER(*s);
5306 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005308 s++;
5309 while (--len > 0) {
5310 if (Py_UNICODE_ISUPPER(*s)) {
5311 *s = Py_UNICODE_TOLOWER(*s);
5312 status = 1;
5313 }
5314 s++;
5315 }
5316 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317}
5318
5319static
5320int fixtitle(PyUnicodeObject *self)
5321{
5322 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5323 register Py_UNICODE *e;
5324 int previous_is_cased;
5325
5326 /* Shortcut for single character strings */
5327 if (PyUnicode_GET_SIZE(self) == 1) {
5328 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5329 if (*p != ch) {
5330 *p = ch;
5331 return 1;
5332 }
5333 else
5334 return 0;
5335 }
Tim Petersced69f82003-09-16 20:30:58 +00005336
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 e = p + PyUnicode_GET_SIZE(self);
5338 previous_is_cased = 0;
5339 for (; p < e; p++) {
5340 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005341
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 if (previous_is_cased)
5343 *p = Py_UNICODE_TOLOWER(ch);
5344 else
5345 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005346
5347 if (Py_UNICODE_ISLOWER(ch) ||
5348 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 Py_UNICODE_ISTITLE(ch))
5350 previous_is_cased = 1;
5351 else
5352 previous_is_cased = 0;
5353 }
5354 return 1;
5355}
5356
Tim Peters8ce9f162004-08-27 01:49:32 +00005357PyObject *
5358PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359{
Tim Peters8ce9f162004-08-27 01:49:32 +00005360 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005361 const Py_UNICODE blank = ' ';
5362 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005363 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005364 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005365 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5366 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005367 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5368 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005370 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005371 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372
Tim Peters05eba1f2004-08-27 21:32:02 +00005373 fseq = PySequence_Fast(seq, "");
5374 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005375 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005376 }
5377
Tim Peters91879ab2004-08-27 22:35:44 +00005378 /* Grrrr. A codec may be invoked to convert str objects to
5379 * Unicode, and so it's possible to call back into Python code
5380 * during PyUnicode_FromObject(), and so it's possible for a sick
5381 * codec to change the size of fseq (if seq is a list). Therefore
5382 * we have to keep refetching the size -- can't assume seqlen
5383 * is invariant.
5384 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005385 seqlen = PySequence_Fast_GET_SIZE(fseq);
5386 /* If empty sequence, return u"". */
5387 if (seqlen == 0) {
5388 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5389 goto Done;
5390 }
5391 /* If singleton sequence with an exact Unicode, return that. */
5392 if (seqlen == 1) {
5393 item = PySequence_Fast_GET_ITEM(fseq, 0);
5394 if (PyUnicode_CheckExact(item)) {
5395 Py_INCREF(item);
5396 res = (PyUnicodeObject *)item;
5397 goto Done;
5398 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005399 }
5400
Tim Peters05eba1f2004-08-27 21:32:02 +00005401 /* At least two items to join, or one that isn't exact Unicode. */
5402 if (seqlen > 1) {
5403 /* Set up sep and seplen -- they're needed. */
5404 if (separator == NULL) {
5405 sep = &blank;
5406 seplen = 1;
5407 }
5408 else {
5409 internal_separator = PyUnicode_FromObject(separator);
5410 if (internal_separator == NULL)
5411 goto onError;
5412 sep = PyUnicode_AS_UNICODE(internal_separator);
5413 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005414 /* In case PyUnicode_FromObject() mutated seq. */
5415 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005416 }
5417 }
5418
5419 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005420 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005422 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005423 res_p = PyUnicode_AS_UNICODE(res);
5424 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005425
Tim Peters05eba1f2004-08-27 21:32:02 +00005426 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005427 Py_ssize_t itemlen;
5428 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005429
5430 item = PySequence_Fast_GET_ITEM(fseq, i);
5431 /* Convert item to Unicode. */
5432 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5433 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005434 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005435 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005436 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005437 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005438 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005439 item = PyUnicode_FromObject(item);
5440 if (item == NULL)
5441 goto onError;
5442 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005443
Tim Peters91879ab2004-08-27 22:35:44 +00005444 /* In case PyUnicode_FromObject() mutated seq. */
5445 seqlen = PySequence_Fast_GET_SIZE(fseq);
5446
Tim Peters8ce9f162004-08-27 01:49:32 +00005447 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005449 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005450 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005451 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005452 if (i < seqlen - 1) {
5453 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005454 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005455 goto Overflow;
5456 }
5457 if (new_res_used > res_alloc) {
5458 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005459 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005460 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005461 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005462 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005463 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005464 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005465 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005467 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005468 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005470
5471 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005472 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005473 res_p += itemlen;
5474 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005475 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005476 res_p += seplen;
5477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005479 res_used = new_res_used;
5480 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005481
Tim Peters05eba1f2004-08-27 21:32:02 +00005482 /* Shrink res to match the used area; this probably can't fail,
5483 * but it's cheap to check.
5484 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005485 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005486 goto onError;
5487
5488 Done:
5489 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005490 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 return (PyObject *)res;
5492
Tim Peters8ce9f162004-08-27 01:49:32 +00005493 Overflow:
5494 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005495 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005496 Py_DECREF(item);
5497 /* fall through */
5498
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005500 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005501 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005502 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 return NULL;
5504}
5505
Tim Petersced69f82003-09-16 20:30:58 +00005506static
5507PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005508 Py_ssize_t left,
5509 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 Py_UNICODE fill)
5511{
5512 PyUnicodeObject *u;
5513
5514 if (left < 0)
5515 left = 0;
5516 if (right < 0)
5517 right = 0;
5518
Tim Peters7a29bd52001-09-12 03:03:31 +00005519 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 Py_INCREF(self);
5521 return self;
5522 }
5523
5524 u = _PyUnicode_New(left + self->length + right);
5525 if (u) {
5526 if (left)
5527 Py_UNICODE_FILL(u->str, fill, left);
5528 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5529 if (right)
5530 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5531 }
5532
5533 return u;
5534}
5535
/* Append data[left:right] as a new unicode object to `list`.
   Relies on the locals `str` and `list` and on an `onError` label in the
   expanding function; jumps there if the object cannot be created or
   appended. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5546
5547static
5548PyObject *split_whitespace(PyUnicodeObject *self,
5549 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005550 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 register Py_ssize_t i;
5553 register Py_ssize_t j;
5554 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005556 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
5558 for (i = j = 0; i < len; ) {
5559 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005560 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 i++;
5562 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005563 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 i++;
5565 if (j < i) {
5566 if (maxcount-- <= 0)
5567 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005568 SPLIT_APPEND(buf, j, i);
5569 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 i++;
5571 j = i;
5572 }
5573 }
5574 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005575 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 }
5577 return list;
5578
5579 onError:
5580 Py_DECREF(list);
5581 return NULL;
5582}
5583
5584PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005585 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005587 register Py_ssize_t i;
5588 register Py_ssize_t j;
5589 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 PyObject *list;
5591 PyObject *str;
5592 Py_UNICODE *data;
5593
5594 string = PyUnicode_FromObject(string);
5595 if (string == NULL)
5596 return NULL;
5597 data = PyUnicode_AS_UNICODE(string);
5598 len = PyUnicode_GET_SIZE(string);
5599
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 list = PyList_New(0);
5601 if (!list)
5602 goto onError;
5603
5604 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005605 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005608 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
5611 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005612 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 if (i < len) {
5614 if (data[i] == '\r' && i + 1 < len &&
5615 data[i+1] == '\n')
5616 i += 2;
5617 else
5618 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005619 if (keepends)
5620 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 }
Guido van Rossum86662912000-04-11 15:38:46 +00005622 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 j = i;
5624 }
5625 if (j < len) {
5626 SPLIT_APPEND(data, j, len);
5627 }
5628
5629 Py_DECREF(string);
5630 return list;
5631
5632 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005633 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 Py_DECREF(string);
5635 return NULL;
5636}
5637
Tim Petersced69f82003-09-16 20:30:58 +00005638static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639PyObject *split_char(PyUnicodeObject *self,
5640 PyObject *list,
5641 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005644 register Py_ssize_t i;
5645 register Py_ssize_t j;
5646 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005648 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
5650 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005651 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 if (maxcount-- <= 0)
5653 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005654 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 i = j = i + 1;
5656 } else
5657 i++;
5658 }
5659 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005660 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 }
5662 return list;
5663
5664 onError:
5665 Py_DECREF(list);
5666 return NULL;
5667}
5668
Tim Petersced69f82003-09-16 20:30:58 +00005669static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670PyObject *split_substring(PyUnicodeObject *self,
5671 PyObject *list,
5672 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005673 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005675 register Py_ssize_t i;
5676 register Py_ssize_t j;
5677 Py_ssize_t len = self->length;
5678 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 PyObject *str;
5680
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005681 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 if (Py_UNICODE_MATCH(self, i, substring)) {
5683 if (maxcount-- <= 0)
5684 break;
5685 SPLIT_APPEND(self->str, j, i);
5686 i = j = i + sublen;
5687 } else
5688 i++;
5689 }
5690 if (j <= len) {
5691 SPLIT_APPEND(self->str, j, len);
5692 }
5693 return list;
5694
5695 onError:
5696 Py_DECREF(list);
5697 return NULL;
5698}
5699
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005700static
5701PyObject *rsplit_whitespace(PyUnicodeObject *self,
5702 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005704{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005705 register Py_ssize_t i;
5706 register Py_ssize_t j;
5707 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005708 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005709 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005710
5711 for (i = j = len - 1; i >= 0; ) {
5712 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005713 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005714 i--;
5715 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005716 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005717 i--;
5718 if (j > i) {
5719 if (maxcount-- <= 0)
5720 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005721 SPLIT_APPEND(buf, i + 1, j + 1);
5722 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005723 i--;
5724 j = i;
5725 }
5726 }
5727 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005728 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005729 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005730 if (PyList_Reverse(list) < 0)
5731 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005732 return list;
5733
5734 onError:
5735 Py_DECREF(list);
5736 return NULL;
5737}
5738
5739static
5740PyObject *rsplit_char(PyUnicodeObject *self,
5741 PyObject *list,
5742 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005743 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005744{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005745 register Py_ssize_t i;
5746 register Py_ssize_t j;
5747 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005748 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005749 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005750
5751 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005752 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005753 if (maxcount-- <= 0)
5754 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005755 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005756 j = i = i - 1;
5757 } else
5758 i--;
5759 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005760 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005761 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005762 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005763 if (PyList_Reverse(list) < 0)
5764 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765 return list;
5766
5767 onError:
5768 Py_DECREF(list);
5769 return NULL;
5770}
5771
5772static
5773PyObject *rsplit_substring(PyUnicodeObject *self,
5774 PyObject *list,
5775 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005776 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005778 register Py_ssize_t i;
5779 register Py_ssize_t j;
5780 Py_ssize_t len = self->length;
5781 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782 PyObject *str;
5783
5784 for (i = len - sublen, j = len; i >= 0; ) {
5785 if (Py_UNICODE_MATCH(self, i, substring)) {
5786 if (maxcount-- <= 0)
5787 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005788 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005789 j = i;
5790 i -= sublen;
5791 } else
5792 i--;
5793 }
5794 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005795 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005796 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005797 if (PyList_Reverse(list) < 0)
5798 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799 return list;
5800
5801 onError:
5802 Py_DECREF(list);
5803 return NULL;
5804}
5805
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806#undef SPLIT_APPEND
5807
5808static
5809PyObject *split(PyUnicodeObject *self,
5810 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005811 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812{
5813 PyObject *list;
5814
5815 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005816 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
5818 list = PyList_New(0);
5819 if (!list)
5820 return NULL;
5821
5822 if (substring == NULL)
5823 return split_whitespace(self,list,maxcount);
5824
5825 else if (substring->length == 1)
5826 return split_char(self,list,substring->str[0],maxcount);
5827
5828 else if (substring->length == 0) {
5829 Py_DECREF(list);
5830 PyErr_SetString(PyExc_ValueError, "empty separator");
5831 return NULL;
5832 }
5833 else
5834 return split_substring(self,list,substring,maxcount);
5835}
5836
Tim Petersced69f82003-09-16 20:30:58 +00005837static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838PyObject *rsplit(PyUnicodeObject *self,
5839 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005840 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005841{
5842 PyObject *list;
5843
5844 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005845 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005846
5847 list = PyList_New(0);
5848 if (!list)
5849 return NULL;
5850
5851 if (substring == NULL)
5852 return rsplit_whitespace(self,list,maxcount);
5853
5854 else if (substring->length == 1)
5855 return rsplit_char(self,list,substring->str[0],maxcount);
5856
5857 else if (substring->length == 0) {
5858 Py_DECREF(list);
5859 PyErr_SetString(PyExc_ValueError, "empty separator");
5860 return NULL;
5861 }
5862 else
5863 return rsplit_substring(self,list,substring,maxcount);
5864}
5865
5866static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867PyObject *replace(PyUnicodeObject *self,
5868 PyUnicodeObject *str1,
5869 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871{
5872 PyUnicodeObject *u;
5873
5874 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005875 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876
Fredrik Lundh347ee272006-05-24 16:35:18 +00005877 if (str1->length == str2->length) {
5878 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005879 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005880 if (str1->length == 1) {
5881 /* replace characters */
5882 Py_UNICODE u1, u2;
5883 if (!findchar(self->str, self->length, str1->str[0]))
5884 goto nothing;
5885 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5886 if (!u)
5887 return NULL;
5888 Py_UNICODE_COPY(u->str, self->str, self->length);
5889 u1 = str1->str[0];
5890 u2 = str2->str[0];
5891 for (i = 0; i < u->length; i++)
5892 if (u->str[i] == u1) {
5893 if (--maxcount < 0)
5894 break;
5895 u->str[i] = u2;
5896 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005898 i = fastsearch(
5899 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005901 if (i < 0)
5902 goto nothing;
5903 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5904 if (!u)
5905 return NULL;
5906 Py_UNICODE_COPY(u->str, self->str, self->length);
5907 while (i <= self->length - str1->length)
5908 if (Py_UNICODE_MATCH(self, i, str1)) {
5909 if (--maxcount < 0)
5910 break;
5911 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5912 i += str1->length;
5913 } else
5914 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005917
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005918 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005919 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 Py_UNICODE *p;
5921
5922 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005923 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 if (n > maxcount)
5925 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005926 if (n == 0)
5927 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005928 /* new_size = self->length + n * (str2->length - str1->length)); */
5929 delta = (str2->length - str1->length);
5930 if (delta == 0) {
5931 new_size = self->length;
5932 } else {
5933 product = n * (str2->length - str1->length);
5934 if ((product / (str2->length - str1->length)) != n) {
5935 PyErr_SetString(PyExc_OverflowError,
5936 "replace string is too long");
5937 return NULL;
5938 }
5939 new_size = self->length + product;
5940 if (new_size < 0) {
5941 PyErr_SetString(PyExc_OverflowError,
5942 "replace string is too long");
5943 return NULL;
5944 }
5945 }
5946 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005947 if (!u)
5948 return NULL;
5949 i = 0;
5950 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005951 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005952 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005953 while (n-- > 0) {
5954 /* look for next match */
5955 j = i;
5956 while (j <= e) {
5957 if (Py_UNICODE_MATCH(self, j, str1))
5958 break;
5959 j++;
5960 }
5961 if (j > i) {
5962 if (j > e)
5963 break;
5964 /* copy unchanged part [i:j] */
5965 Py_UNICODE_COPY(p, self->str+i, j-i);
5966 p += j - i;
5967 }
5968 /* copy substitution string */
5969 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005970 Py_UNICODE_COPY(p, str2->str, str2->length);
5971 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005972 }
5973 i = j + str1->length;
5974 }
5975 if (i < self->length)
5976 /* copy tail [i:] */
5977 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005978 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005979 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005980 while (n > 0) {
5981 Py_UNICODE_COPY(p, str2->str, str2->length);
5982 p += str2->length;
5983 if (--n <= 0)
5984 break;
5985 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005987 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
5989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991
5992nothing:
5993 /* nothing to replace; return original string (when possible) */
5994 if (PyUnicode_CheckExact(self)) {
5995 Py_INCREF(self);
5996 return (PyObject *) self;
5997 }
5998 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999}
6000
6001/* --- Unicode Object Methods --------------------------------------------- */
6002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006003PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004"S.title() -> unicode\n\
6005\n\
6006Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006007characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
6009static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006010unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 return fixup(self, fixtitle);
6013}
6014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006015PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016"S.capitalize() -> unicode\n\
6017\n\
6018Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006022unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 return fixup(self, fixcapitalize);
6025}
6026
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, capitalize
   every word in place in the list, then join the words again.
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *result = NULL;
    PyObject *list;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (list == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, list);

done:
    Py_DECREF(list);
    return (PyObject *)result;
}
#endif
6064
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006065/* Argument converter. Coerces to a single unicode character */
6066
6067static int
6068convert_uc(PyObject *obj, void *addr)
6069{
6070 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6071 PyObject *uniobj;
6072 Py_UNICODE *unistr;
6073
6074 uniobj = PyUnicode_FromObject(obj);
6075 if (uniobj == NULL) {
6076 PyErr_SetString(PyExc_TypeError,
6077 "The fill character cannot be converted to Unicode");
6078 return 0;
6079 }
6080 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6081 PyErr_SetString(PyExc_TypeError,
6082 "The fill character must be exactly one character long");
6083 Py_DECREF(uniobj);
6084 return 0;
6085 }
6086 unistr = PyUnicode_AS_UNICODE(uniobj);
6087 *fillcharloc = unistr[0];
6088 Py_DECREF(uniobj);
6089 return 1;
6090}
6091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006092PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006093"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006095Return S centered in a Unicode string of length width. Padding is\n\
6096done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097
6098static PyObject *
6099unicode_center(PyUnicodeObject *self, PyObject *args)
6100{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 Py_ssize_t marg, left;
6102 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006103 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104
Thomas Woutersde017742006-02-16 19:34:37 +00006105 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 return NULL;
6107
Tim Peters7a29bd52001-09-12 03:03:31 +00006108 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 Py_INCREF(self);
6110 return (PyObject*) self;
6111 }
6112
6113 marg = width - self->length;
6114 left = marg / 2 + (marg & width & 1);
6115
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006116 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117}
6118
Marc-André Lemburge5034372000-08-08 08:04:29 +00006119#if 0
6120
6121/* This code should go into some future Unicode collation support
6122 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006123 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006124
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006125/* speedy UTF-16 code point order comparison */
6126/* gleaned from: */
6127/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6128
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006129static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006130{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006131 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006132 0, 0, 0, 0, 0, 0, 0, 0,
6133 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006134 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006135};
6136
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137static int
6138unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6139{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006140 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006141
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 Py_UNICODE *s1 = str1->str;
6143 Py_UNICODE *s2 = str2->str;
6144
6145 len1 = str1->length;
6146 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006147
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006149 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006150
6151 c1 = *s1++;
6152 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006153
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006154 if (c1 > (1<<11) * 26)
6155 c1 += utf16Fixup[c1>>11];
6156 if (c2 > (1<<11) * 26)
6157 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006158 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006159
6160 if (c1 != c2)
6161 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006162
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006163 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 }
6165
6166 return (len1 < len2) ? -1 : (len1 != len2);
6167}
6168
Marc-André Lemburge5034372000-08-08 08:04:29 +00006169#else
6170
6171static int
6172unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6173{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006174 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006175
6176 Py_UNICODE *s1 = str1->str;
6177 Py_UNICODE *s2 = str2->str;
6178
6179 len1 = str1->length;
6180 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006181
Marc-André Lemburge5034372000-08-08 08:04:29 +00006182 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006183 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006184
Fredrik Lundh45714e92001-06-26 16:39:36 +00006185 c1 = *s1++;
6186 c2 = *s2++;
6187
6188 if (c1 != c2)
6189 return (c1 < c2) ? -1 : 1;
6190
Marc-André Lemburge5034372000-08-08 08:04:29 +00006191 len1--; len2--;
6192 }
6193
6194 return (len1 < len2) ? -1 : (len1 != len2);
6195}
6196
6197#endif
6198
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199int PyUnicode_Compare(PyObject *left,
6200 PyObject *right)
6201{
6202 PyUnicodeObject *u = NULL, *v = NULL;
6203 int result;
6204
6205 /* Coerce the two arguments */
6206 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6207 if (u == NULL)
6208 goto onError;
6209 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6210 if (v == NULL)
6211 goto onError;
6212
Thomas Wouters7e474022000-07-16 12:04:32 +00006213 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 if (v == u) {
6215 Py_DECREF(u);
6216 Py_DECREF(v);
6217 return 0;
6218 }
6219
6220 result = unicode_compare(u, v);
6221
6222 Py_DECREF(u);
6223 Py_DECREF(v);
6224 return result;
6225
6226onError:
6227 Py_XDECREF(u);
6228 Py_XDECREF(v);
6229 return -1;
6230}
6231
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006232PyObject *PyUnicode_RichCompare(PyObject *left,
6233 PyObject *right,
6234 int op)
6235{
6236 int result;
6237
6238 result = PyUnicode_Compare(left, right);
6239 if (result == -1 && PyErr_Occurred())
6240 goto onError;
6241
6242 /* Convert the return value to a Boolean */
6243 switch (op) {
6244 case Py_EQ:
6245 result = (result == 0);
6246 break;
6247 case Py_NE:
6248 result = (result != 0);
6249 break;
6250 case Py_LE:
6251 result = (result <= 0);
6252 break;
6253 case Py_GE:
6254 result = (result >= 0);
6255 break;
6256 case Py_LT:
6257 result = (result == -1);
6258 break;
6259 case Py_GT:
6260 result = (result == 1);
6261 break;
6262 }
6263 return PyBool_FromLong(result);
6264
6265 onError:
6266
6267 /* Standard case
6268
6269 Type errors mean that PyUnicode_FromObject() could not convert
6270 one of the arguments (usually the right hand side) to Unicode,
6271 ie. we can't handle the comparison request. However, it is
6272 possible that the other object knows a comparison method, which
6273 is why we return Py_NotImplemented to give the other object a
6274 chance.
6275
6276 */
6277 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6278 PyErr_Clear();
6279 Py_INCREF(Py_NotImplemented);
6280 return Py_NotImplemented;
6281 }
6282 if (op != Py_EQ && op != Py_NE)
6283 return NULL;
6284
6285 /* Equality comparison.
6286
6287 This is a special case: we silence any PyExc_UnicodeDecodeError
6288 and instead turn it into a PyErr_UnicodeWarning.
6289
6290 */
6291 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6292 return NULL;
6293 PyErr_Clear();
6294 if (PyErr_Warn(PyExc_UnicodeWarning,
6295 (op == Py_EQ) ?
6296 "Unicode equal comparison "
6297 "failed to convert both arguments to Unicode - "
6298 "interpreting them as being unequal" :
6299 "Unicode unequal comparison "
6300 "failed to convert both arguments to Unicode - "
6301 "interpreting them as being unequal"
6302 ) < 0)
6303 return NULL;
6304 result = (op == Py_NE);
6305 return PyBool_FromLong(result);
6306}
6307
Guido van Rossum403d68b2000-03-13 15:55:09 +00006308int PyUnicode_Contains(PyObject *container,
6309 PyObject *element)
6310{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006311 PyObject *str, *sub;
6312 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006313
6314 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006315 sub = PyUnicode_FromObject(element);
6316 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006317 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006318 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006319 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006320 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006321
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006322 str = PyUnicode_FromObject(container);
6323 if (!str) {
6324 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006325 return -1;
6326 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006327
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006328 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006329
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006330 Py_DECREF(str);
6331 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006332
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006333 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006334}
6335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336/* Concat to string or Unicode object giving a new Unicode object. */
6337
6338PyObject *PyUnicode_Concat(PyObject *left,
6339 PyObject *right)
6340{
6341 PyUnicodeObject *u = NULL, *v = NULL, *w;
6342
6343 /* Coerce the two arguments */
6344 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6345 if (u == NULL)
6346 goto onError;
6347 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6348 if (v == NULL)
6349 goto onError;
6350
6351 /* Shortcuts */
6352 if (v == unicode_empty) {
6353 Py_DECREF(v);
6354 return (PyObject *)u;
6355 }
6356 if (u == unicode_empty) {
6357 Py_DECREF(u);
6358 return (PyObject *)v;
6359 }
6360
6361 /* Concat the two Unicode strings */
6362 w = _PyUnicode_New(u->length + v->length);
6363 if (w == NULL)
6364 goto onError;
6365 Py_UNICODE_COPY(w->str, u->str, u->length);
6366 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6367
6368 Py_DECREF(u);
6369 Py_DECREF(v);
6370 return (PyObject *)w;
6371
6372onError:
6373 Py_XDECREF(u);
6374 Py_XDECREF(v);
6375 return NULL;
6376}
6377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006378PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379"S.count(sub[, start[, end]]) -> int\n\
6380\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006381Return the number of non-overlapping occurrences of substring sub in\n\
6382Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006383interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
6385static PyObject *
6386unicode_count(PyUnicodeObject *self, PyObject *args)
6387{
6388 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006389 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006390 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 PyObject *result;
6392
Guido van Rossumb8872e62000-05-09 14:14:27 +00006393 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6394 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 return NULL;
6396
6397 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006398 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 if (substring == NULL)
6400 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006401
Fredrik Lundhc8162812006-05-26 19:33:03 +00006402 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006404 result = PyInt_FromSsize_t(
6405 stringlib_count(self->str + start, end - start,
6406 substring->str, substring->length)
6407 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
6409 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006410
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 return result;
6412}
6413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006414PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006415"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006417Encodes S using the codec registered for encoding. encoding defaults\n\
6418to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006419handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6421'xmlcharrefreplace' as well as any other name registered with\n\
6422codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
6424static PyObject *
6425unicode_encode(PyUnicodeObject *self, PyObject *args)
6426{
6427 char *encoding = NULL;
6428 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006429 PyObject *v;
6430
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6432 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006433 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006434 if (v == NULL)
6435 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006436 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6437 PyErr_Format(PyExc_TypeError,
6438 "encoder did not return a string/unicode object "
6439 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006440 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006441 Py_DECREF(v);
6442 return NULL;
6443 }
6444 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006445
6446 onError:
6447 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006448}
6449
6450PyDoc_STRVAR(decode__doc__,
6451"S.decode([encoding[,errors]]) -> string or unicode\n\
6452\n\
6453Decodes S using the codec registered for encoding. encoding defaults\n\
6454to the default encoding. errors may be given to set a different error\n\
6455handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6456a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6457as well as any other name registerd with codecs.register_error that is\n\
6458able to handle UnicodeDecodeErrors.");
6459
6460static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006461unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006462{
6463 char *encoding = NULL;
6464 char *errors = NULL;
6465 PyObject *v;
6466
6467 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6468 return NULL;
6469 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006470 if (v == NULL)
6471 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006472 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6473 PyErr_Format(PyExc_TypeError,
6474 "decoder did not return a string/unicode object "
6475 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006476 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006477 Py_DECREF(v);
6478 return NULL;
6479 }
6480 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006481
6482 onError:
6483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484}
6485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006486PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487"S.expandtabs([tabsize]) -> unicode\n\
6488\n\
6489Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006490If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
6492static PyObject*
6493unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6494{
6495 Py_UNICODE *e;
6496 Py_UNICODE *p;
6497 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006498 Py_UNICODE *qe;
6499 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 PyUnicodeObject *u;
6501 int tabsize = 8;
6502
6503 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6504 return NULL;
6505
Thomas Wouters7e474022000-07-16 12:04:32 +00006506 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006507 i = 0; /* chars up to and including most recent \n or \r */
6508 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6509 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 for (p = self->str; p < e; p++)
6511 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006512 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006513 incr = tabsize - (j % tabsize); /* cannot overflow */
6514 if (j > PY_SSIZE_T_MAX - incr)
6515 goto overflow1;
6516 j += incr;
6517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 }
6519 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006520 if (j > PY_SSIZE_T_MAX - 1)
6521 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 j++;
6523 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006524 if (i > PY_SSIZE_T_MAX - j)
6525 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006527 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 }
6529 }
6530
Guido van Rossum5bdff602008-03-11 21:18:06 +00006531 if (i > PY_SSIZE_T_MAX - j)
6532 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006533
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 /* Second pass: create output string and fill it */
6535 u = _PyUnicode_New(i + j);
6536 if (!u)
6537 return NULL;
6538
Guido van Rossum5bdff602008-03-11 21:18:06 +00006539 j = 0; /* same as in first pass */
6540 q = u->str; /* next output char */
6541 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542
6543 for (p = self->str; p < e; p++)
6544 if (*p == '\t') {
6545 if (tabsize > 0) {
6546 i = tabsize - (j % tabsize);
6547 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006548 while (i--) {
6549 if (q >= qe)
6550 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 }
6554 }
6555 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006556 if (q >= qe)
6557 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006559 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 if (*p == '\n' || *p == '\r')
6561 j = 0;
6562 }
6563
6564 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006565
6566 overflow2:
6567 Py_DECREF(u);
6568 overflow1:
6569 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6570 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571}
6572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006573PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574"S.find(sub [,start [,end]]) -> int\n\
6575\n\
6576Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006577such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578arguments start and end are interpreted as in slice notation.\n\
6579\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
6582static PyObject *
6583unicode_find(PyUnicodeObject *self, PyObject *args)
6584{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006585 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006586 Py_ssize_t start;
6587 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006588 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
Facundo Batista57d56692007-11-16 18:04:14 +00006590 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006593 result = stringlib_find_slice(
6594 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6595 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6596 start, end
6597 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598
6599 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006600
6601 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602}
6603
6604static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006605unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606{
6607 if (index < 0 || index >= self->length) {
6608 PyErr_SetString(PyExc_IndexError, "string index out of range");
6609 return NULL;
6610 }
6611
6612 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6613}
6614
6615static long
6616unicode_hash(PyUnicodeObject *self)
6617{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006618 /* Since Unicode objects compare equal to their ASCII string
6619 counterparts, they should use the individual character values
6620 as basis for their hash value. This is needed to assure that
6621 strings and Unicode objects behave in the same way as
6622 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
Martin v. Löwis18e16552006-02-15 17:27:45 +00006624 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006625 register Py_UNICODE *p;
6626 register long x;
6627
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 if (self->hash != -1)
6629 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006630 len = PyUnicode_GET_SIZE(self);
6631 p = PyUnicode_AS_UNICODE(self);
6632 x = *p << 7;
6633 while (--len >= 0)
6634 x = (1000003*x) ^ *p++;
6635 x ^= PyUnicode_GET_SIZE(self);
6636 if (x == -1)
6637 x = -2;
6638 self->hash = x;
6639 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640}
6641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006642PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643"S.index(sub [,start [,end]]) -> int\n\
6644\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006645Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
6647static PyObject *
6648unicode_index(PyUnicodeObject *self, PyObject *args)
6649{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006650 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006651 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006652 Py_ssize_t start;
6653 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654
Facundo Batista57d56692007-11-16 18:04:14 +00006655 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006658 result = stringlib_find_slice(
6659 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6660 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6661 start, end
6662 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663
6664 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006665
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 if (result < 0) {
6667 PyErr_SetString(PyExc_ValueError, "substring not found");
6668 return NULL;
6669 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006670
Martin v. Löwis18e16552006-02-15 17:27:45 +00006671 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672}
6673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006674PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006675"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006677Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006678at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679
6680static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006681unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682{
6683 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6684 register const Py_UNICODE *e;
6685 int cased;
6686
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 /* Shortcut for single character strings */
6688 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006689 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006691 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006692 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006693 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 e = p + PyUnicode_GET_SIZE(self);
6696 cased = 0;
6697 for (; p < e; p++) {
6698 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006699
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006701 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 else if (!cased && Py_UNICODE_ISLOWER(ch))
6703 cased = 1;
6704 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006705 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706}
6707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006708PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006709"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006711Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006715unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716{
6717 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6718 register const Py_UNICODE *e;
6719 int cased;
6720
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 /* Shortcut for single character strings */
6722 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006723 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006725 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006726 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006727 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006728
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 e = p + PyUnicode_GET_SIZE(self);
6730 cased = 0;
6731 for (; p < e; p++) {
6732 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006735 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 else if (!cased && Py_UNICODE_ISUPPER(ch))
6737 cased = 1;
6738 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006739 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740}
6741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006742PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006745Return True if S is a titlecased string and there is at least one\n\
6746character in S, i.e. upper- and titlecase characters may only\n\
6747follow uncased characters and lowercase characters only cased ones.\n\
6748Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749
6750static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006751unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752{
6753 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6754 register const Py_UNICODE *e;
6755 int cased, previous_is_cased;
6756
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 /* Shortcut for single character strings */
6758 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006759 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6760 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006762 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006763 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006764 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006765
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 e = p + PyUnicode_GET_SIZE(self);
6767 cased = 0;
6768 previous_is_cased = 0;
6769 for (; p < e; p++) {
6770 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006771
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6773 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006774 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 previous_is_cased = 1;
6776 cased = 1;
6777 }
6778 else if (Py_UNICODE_ISLOWER(ch)) {
6779 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006780 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 previous_is_cased = 1;
6782 cased = 1;
6783 }
6784 else
6785 previous_is_cased = 0;
6786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006787 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788}
6789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006790PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006793Return True if all characters in S are whitespace\n\
6794and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795
6796static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006797unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798{
6799 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6800 register const Py_UNICODE *e;
6801
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 /* Shortcut for single character strings */
6803 if (PyUnicode_GET_SIZE(self) == 1 &&
6804 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006805 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006807 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006808 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006809 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006810
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 e = p + PyUnicode_GET_SIZE(self);
6812 for (; p < e; p++) {
6813 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006816 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817}
6818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006819PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006820"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006821\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006822Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006823and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006824
6825static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006826unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006827{
6828 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6829 register const Py_UNICODE *e;
6830
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006831 /* Shortcut for single character strings */
6832 if (PyUnicode_GET_SIZE(self) == 1 &&
6833 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006835
6836 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006837 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006838 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006839
6840 e = p + PyUnicode_GET_SIZE(self);
6841 for (; p < e; p++) {
6842 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006843 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006844 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006845 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006846}
6847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006848PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006849"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006850\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006851Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006852and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006853
6854static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006855unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856{
6857 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6858 register const Py_UNICODE *e;
6859
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006860 /* Shortcut for single character strings */
6861 if (PyUnicode_GET_SIZE(self) == 1 &&
6862 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864
6865 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006866 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006867 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006868
6869 e = p + PyUnicode_GET_SIZE(self);
6870 for (; p < e; p++) {
6871 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006874 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006875}
6876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006877PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882
6883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006884unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885{
6886 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6887 register const Py_UNICODE *e;
6888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 /* Shortcut for single character strings */
6890 if (PyUnicode_GET_SIZE(self) == 1 &&
6891 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006894 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006895 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006896 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006897
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 e = p + PyUnicode_GET_SIZE(self);
6899 for (; p < e; p++) {
6900 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006901 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006903 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904}
6905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006906PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006909Return True if all characters in S are digits\n\
6910and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
6912static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006913unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914{
6915 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6916 register const Py_UNICODE *e;
6917
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 /* Shortcut for single character strings */
6919 if (PyUnicode_GET_SIZE(self) == 1 &&
6920 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006923 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006924 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006926
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 e = p + PyUnicode_GET_SIZE(self);
6928 for (; p < e; p++) {
6929 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006932 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933}
6934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006935PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006939False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940
6941static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006942unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943{
6944 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6945 register const Py_UNICODE *e;
6946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 /* Shortcut for single character strings */
6948 if (PyUnicode_GET_SIZE(self) == 1 &&
6949 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006950 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006952 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006953 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006955
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 e = p + PyUnicode_GET_SIZE(self);
6957 for (; p < e; p++) {
6958 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006959 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006961 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962}
6963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006964PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965"S.join(sequence) -> unicode\n\
6966\n\
6967Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006968sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969
6970static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006971unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006973 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974}
6975
Martin v. Löwis18e16552006-02-15 17:27:45 +00006976static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977unicode_length(PyUnicodeObject *self)
6978{
6979 return self->length;
6980}
6981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006982PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006983"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984\n\
6985Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006986done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987
6988static PyObject *
6989unicode_ljust(PyUnicodeObject *self, PyObject *args)
6990{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006991 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006992 Py_UNICODE fillchar = ' ';
6993
Martin v. Löwis412fb672006-04-13 06:34:32 +00006994 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 return NULL;
6996
Tim Peters7a29bd52001-09-12 03:03:31 +00006997 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 Py_INCREF(self);
6999 return (PyObject*) self;
7000 }
7001
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007002 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003}
7004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007005PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006"S.lower() -> unicode\n\
7007\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007008Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009
7010static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007011unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 return fixup(self, fixlower);
7014}
7015
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format for each strip mode, indexed by the mode
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its three
   leading characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7024
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007025/* externally visible for str.strip(unicode) */
7026PyObject *
7027_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7028{
7029 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007030 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007031 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007032 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7033 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007034
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007035 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7036
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007037 i = 0;
7038 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007039 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7040 i++;
7041 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007042 }
7043
7044 j = len;
7045 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007046 do {
7047 j--;
7048 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7049 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007050 }
7051
7052 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007053 Py_INCREF(self);
7054 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007055 }
7056 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007057 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058}
7059
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060
7061static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007062do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007064 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007065 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007066
7067 i = 0;
7068 if (striptype != RIGHTSTRIP) {
7069 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7070 i++;
7071 }
7072 }
7073
7074 j = len;
7075 if (striptype != LEFTSTRIP) {
7076 do {
7077 j--;
7078 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7079 j++;
7080 }
7081
7082 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7083 Py_INCREF(self);
7084 return (PyObject*)self;
7085 }
7086 else
7087 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088}
7089
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090
7091static PyObject *
7092do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7093{
7094 PyObject *sep = NULL;
7095
7096 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7097 return NULL;
7098
7099 if (sep != NULL && sep != Py_None) {
7100 if (PyUnicode_Check(sep))
7101 return _PyUnicode_XStrip(self, striptype, sep);
7102 else if (PyString_Check(sep)) {
7103 PyObject *res;
7104 sep = PyUnicode_FromObject(sep);
7105 if (sep==NULL)
7106 return NULL;
7107 res = _PyUnicode_XStrip(self, striptype, sep);
7108 Py_DECREF(sep);
7109 return res;
7110 }
7111 else {
7112 PyErr_Format(PyExc_TypeError,
7113 "%s arg must be None, unicode or str",
7114 STRIPNAME(striptype));
7115 return NULL;
7116 }
7117 }
7118
7119 return do_strip(self, striptype);
7120}
7121
7122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007123PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007124"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125\n\
7126Return a copy of the string S with leading and trailing\n\
7127whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007128If chars is given and not None, remove characters in chars instead.\n\
7129If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007130
7131static PyObject *
7132unicode_strip(PyUnicodeObject *self, PyObject *args)
7133{
7134 if (PyTuple_GET_SIZE(args) == 0)
7135 return do_strip(self, BOTHSTRIP); /* Common case */
7136 else
7137 return do_argstrip(self, BOTHSTRIP, args);
7138}
7139
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007142"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143\n\
7144Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007145If chars is given and not None, remove characters in chars instead.\n\
7146If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007147
7148static PyObject *
7149unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7150{
7151 if (PyTuple_GET_SIZE(args) == 0)
7152 return do_strip(self, LEFTSTRIP); /* Common case */
7153 else
7154 return do_argstrip(self, LEFTSTRIP, args);
7155}
7156
7157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007158PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007159"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007160\n\
7161Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007162If chars is given and not None, remove characters in chars instead.\n\
7163If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007164
7165static PyObject *
7166unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7167{
7168 if (PyTuple_GET_SIZE(args) == 0)
7169 return do_strip(self, RIGHTSTRIP); /* Common case */
7170 else
7171 return do_argstrip(self, RIGHTSTRIP, args);
7172}
7173
7174
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007176unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177{
7178 PyUnicodeObject *u;
7179 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007180 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007181 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182
7183 if (len < 0)
7184 len = 0;
7185
Tim Peters7a29bd52001-09-12 03:03:31 +00007186 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 /* no repeat, return original string */
7188 Py_INCREF(str);
7189 return (PyObject*) str;
7190 }
Tim Peters8f422462000-09-09 06:13:41 +00007191
7192 /* ensure # of chars needed doesn't overflow int and # of bytes
7193 * needed doesn't overflow size_t
7194 */
7195 nchars = len * str->length;
7196 if (len && nchars / len != str->length) {
7197 PyErr_SetString(PyExc_OverflowError,
7198 "repeated string is too long");
7199 return NULL;
7200 }
7201 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7202 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7203 PyErr_SetString(PyExc_OverflowError,
7204 "repeated string is too long");
7205 return NULL;
7206 }
7207 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 if (!u)
7209 return NULL;
7210
7211 p = u->str;
7212
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007213 if (str->length == 1 && len > 0) {
7214 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007215 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007216 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007217 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007218 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007219 done = str->length;
7220 }
7221 while (done < nchars) {
7222 int n = (done <= nchars-done) ? done : nchars-done;
7223 Py_UNICODE_COPY(p+done, p, n);
7224 done += n;
7225 }
7226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
7228 return (PyObject*) u;
7229}
7230
7231PyObject *PyUnicode_Replace(PyObject *obj,
7232 PyObject *subobj,
7233 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007234 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235{
7236 PyObject *self;
7237 PyObject *str1;
7238 PyObject *str2;
7239 PyObject *result;
7240
7241 self = PyUnicode_FromObject(obj);
7242 if (self == NULL)
7243 return NULL;
7244 str1 = PyUnicode_FromObject(subobj);
7245 if (str1 == NULL) {
7246 Py_DECREF(self);
7247 return NULL;
7248 }
7249 str2 = PyUnicode_FromObject(replobj);
7250 if (str2 == NULL) {
7251 Py_DECREF(self);
7252 Py_DECREF(str1);
7253 return NULL;
7254 }
Tim Petersced69f82003-09-16 20:30:58 +00007255 result = replace((PyUnicodeObject *)self,
7256 (PyUnicodeObject *)str1,
7257 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 maxcount);
7259 Py_DECREF(self);
7260 Py_DECREF(str1);
7261 Py_DECREF(str2);
7262 return result;
7263}
7264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007265PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266"S.replace (old, new[, maxsplit]) -> unicode\n\
7267\n\
7268Return a copy of S with all occurrences of substring\n\
7269old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007270given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
7272static PyObject*
7273unicode_replace(PyUnicodeObject *self, PyObject *args)
7274{
7275 PyUnicodeObject *str1;
7276 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007277 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 PyObject *result;
7279
Martin v. Löwis18e16552006-02-15 17:27:45 +00007280 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 return NULL;
7282 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7283 if (str1 == NULL)
7284 return NULL;
7285 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007286 if (str2 == NULL) {
7287 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290
7291 result = replace(self, str1, str2, maxcount);
7292
7293 Py_DECREF(str1);
7294 Py_DECREF(str2);
7295 return result;
7296}
7297
7298static
7299PyObject *unicode_repr(PyObject *unicode)
7300{
7301 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7302 PyUnicode_GET_SIZE(unicode),
7303 1);
7304}
7305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007306PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307"S.rfind(sub [,start [,end]]) -> int\n\
7308\n\
7309Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007310such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311arguments start and end are interpreted as in slice notation.\n\
7312\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007313Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
7315static PyObject *
7316unicode_rfind(PyUnicodeObject *self, PyObject *args)
7317{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007318 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007319 Py_ssize_t start;
7320 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007321 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
Facundo Batista57d56692007-11-16 18:04:14 +00007323 if (!_ParseTupleFinds(args, &substring, &start, &end))
7324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007326 result = stringlib_rfind_slice(
7327 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7328 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7329 start, end
7330 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331
7332 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007333
7334 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335}
7336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007337PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338"S.rindex(sub [,start [,end]]) -> int\n\
7339\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007340Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
7342static PyObject *
7343unicode_rindex(PyUnicodeObject *self, PyObject *args)
7344{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007345 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007346 Py_ssize_t start;
7347 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007348 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
Facundo Batista57d56692007-11-16 18:04:14 +00007350 if (!_ParseTupleFinds(args, &substring, &start, &end))
7351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007353 result = stringlib_rfind_slice(
7354 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7355 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7356 start, end
7357 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358
7359 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007360
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 if (result < 0) {
7362 PyErr_SetString(PyExc_ValueError, "substring not found");
7363 return NULL;
7364 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007365 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366}
7367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007368PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007369"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370\n\
7371Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007372done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
7374static PyObject *
7375unicode_rjust(PyUnicodeObject *self, PyObject *args)
7376{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007377 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007378 Py_UNICODE fillchar = ' ';
7379
Martin v. Löwis412fb672006-04-13 06:34:32 +00007380 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 return NULL;
7382
Tim Peters7a29bd52001-09-12 03:03:31 +00007383 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 Py_INCREF(self);
7385 return (PyObject*) self;
7386 }
7387
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007388 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389}
7390
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007392unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393{
7394 /* standard clamping */
7395 if (start < 0)
7396 start = 0;
7397 if (end < 0)
7398 end = 0;
7399 if (end > self->length)
7400 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007401 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 /* full slice, return original string */
7403 Py_INCREF(self);
7404 return (PyObject*) self;
7405 }
7406 if (start > end)
7407 start = end;
7408 /* copy slice */
7409 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7410 end - start);
7411}
7412
7413PyObject *PyUnicode_Split(PyObject *s,
7414 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007415 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416{
7417 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007418
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 s = PyUnicode_FromObject(s);
7420 if (s == NULL)
7421 return NULL;
7422 if (sep != NULL) {
7423 sep = PyUnicode_FromObject(sep);
7424 if (sep == NULL) {
7425 Py_DECREF(s);
7426 return NULL;
7427 }
7428 }
7429
7430 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7431
7432 Py_DECREF(s);
7433 Py_XDECREF(sep);
7434 return result;
7435}
7436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007437PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438"S.split([sep [,maxsplit]]) -> list of strings\n\
7439\n\
7440Return a list of the words in S, using sep as the\n\
7441delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007442splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007443any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444
7445static PyObject*
7446unicode_split(PyUnicodeObject *self, PyObject *args)
7447{
7448 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007449 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450
Martin v. Löwis18e16552006-02-15 17:27:45 +00007451 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452 return NULL;
7453
7454 if (substring == Py_None)
7455 return split(self, NULL, maxcount);
7456 else if (PyUnicode_Check(substring))
7457 return split(self, (PyUnicodeObject *)substring, maxcount);
7458 else
7459 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7460}
7461
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007462PyObject *
7463PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7464{
7465 PyObject* str_obj;
7466 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007467 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007468
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007469 str_obj = PyUnicode_FromObject(str_in);
7470 if (!str_obj)
7471 return NULL;
7472 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007473 if (!sep_obj) {
7474 Py_DECREF(str_obj);
7475 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007476 }
7477
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007478 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007479 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7480 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7481 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007482
Fredrik Lundhb9479482006-05-26 17:22:38 +00007483 Py_DECREF(sep_obj);
7484 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007485
7486 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007487}
7488
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007489
7490PyObject *
7491PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7492{
7493 PyObject* str_obj;
7494 PyObject* sep_obj;
7495 PyObject* out;
7496
7497 str_obj = PyUnicode_FromObject(str_in);
7498 if (!str_obj)
7499 return NULL;
7500 sep_obj = PyUnicode_FromObject(sep_in);
7501 if (!sep_obj) {
7502 Py_DECREF(str_obj);
7503 return NULL;
7504 }
7505
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007506 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007507 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7508 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7509 );
7510
7511 Py_DECREF(sep_obj);
7512 Py_DECREF(str_obj);
7513
7514 return out;
7515}
7516
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007517PyDoc_STRVAR(partition__doc__,
7518"S.partition(sep) -> (head, sep, tail)\n\
7519\n\
7520Searches for the separator sep in S, and returns the part before it,\n\
7521the separator itself, and the part after it. If the separator is not\n\
7522found, returns S and two empty strings.");
7523
7524static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007525unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007526{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007527 return PyUnicode_Partition((PyObject *)self, separator);
7528}
7529
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007530PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007531"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007532\n\
7533Searches for the separator sep in S, starting at the end of S, and returns\n\
7534the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007535separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007536
7537static PyObject*
7538unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7539{
7540 return PyUnicode_RPartition((PyObject *)self, separator);
7541}
7542
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007543PyObject *PyUnicode_RSplit(PyObject *s,
7544 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007545 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007546{
7547 PyObject *result;
7548
7549 s = PyUnicode_FromObject(s);
7550 if (s == NULL)
7551 return NULL;
7552 if (sep != NULL) {
7553 sep = PyUnicode_FromObject(sep);
7554 if (sep == NULL) {
7555 Py_DECREF(s);
7556 return NULL;
7557 }
7558 }
7559
7560 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7561
7562 Py_DECREF(s);
7563 Py_XDECREF(sep);
7564 return result;
7565}
7566
7567PyDoc_STRVAR(rsplit__doc__,
7568"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7569\n\
7570Return a list of the words in S, using sep as the\n\
7571delimiter string, starting at the end of the string and\n\
7572working to the front. If maxsplit is given, at most maxsplit\n\
7573splits are done. If sep is not specified, any whitespace string\n\
7574is a separator.");
7575
7576static PyObject*
7577unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7578{
7579 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007581
Martin v. Löwis18e16552006-02-15 17:27:45 +00007582 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007583 return NULL;
7584
7585 if (substring == Py_None)
7586 return rsplit(self, NULL, maxcount);
7587 else if (PyUnicode_Check(substring))
7588 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7589 else
7590 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7591}
7592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007593PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007594"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595\n\
7596Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007597Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007598is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
7600static PyObject*
7601unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7602{
Guido van Rossum86662912000-04-11 15:38:46 +00007603 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
Guido van Rossum86662912000-04-11 15:38:46 +00007605 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 return NULL;
7607
Guido van Rossum86662912000-04-11 15:38:46 +00007608 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609}
7610
7611static
7612PyObject *unicode_str(PyUnicodeObject *self)
7613{
Fred Drakee4315f52000-05-09 19:53:39 +00007614 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615}
7616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007617PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618"S.swapcase() -> unicode\n\
7619\n\
7620Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007621and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
7623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007624unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 return fixup(self, fixswapcase);
7627}
7628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007629PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630"S.translate(table) -> unicode\n\
7631\n\
7632Return a copy of the string S, where all characters have been mapped\n\
7633through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007634Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7635Unmapped characters are left untouched. Characters mapped to None\n\
7636are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637
7638static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007639unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640{
Tim Petersced69f82003-09-16 20:30:58 +00007641 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007643 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 "ignore");
7645}
7646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007647PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648"S.upper() -> unicode\n\
7649\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651
7652static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007653unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 return fixup(self, fixupper);
7656}
7657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007658PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659"S.zfill(width) -> unicode\n\
7660\n\
7661Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007662of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663
7664static PyObject *
7665unicode_zfill(PyUnicodeObject *self, PyObject *args)
7666{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007667 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668 PyUnicodeObject *u;
7669
Martin v. Löwis18e16552006-02-15 17:27:45 +00007670 Py_ssize_t width;
7671 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672 return NULL;
7673
7674 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007675 if (PyUnicode_CheckExact(self)) {
7676 Py_INCREF(self);
7677 return (PyObject*) self;
7678 }
7679 else
7680 return PyUnicode_FromUnicode(
7681 PyUnicode_AS_UNICODE(self),
7682 PyUnicode_GET_SIZE(self)
7683 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 }
7685
7686 fill = width - self->length;
7687
7688 u = pad(self, fill, 0, '0');
7689
Walter Dörwald068325e2002-04-15 13:36:47 +00007690 if (u == NULL)
7691 return NULL;
7692
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 if (u->str[fill] == '+' || u->str[fill] == '-') {
7694 /* move sign to beginning of string */
7695 u->str[0] = u->str[fill];
7696 u->str[fill] = '0';
7697 }
7698
7699 return (PyObject*) u;
7700}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701
#if 0
/* Compiled out.  Returns the value of `numfree` as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007710PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007711"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007713Return True if S starts with the specified prefix, False otherwise.\n\
7714With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007715With optional end, stop comparing S at that position.\n\
7716prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717
7718static PyObject *
7719unicode_startswith(PyUnicodeObject *self,
7720 PyObject *args)
7721{
Georg Brandl24250812006-06-09 18:45:48 +00007722 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007724 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007725 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007726 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
Georg Brandl24250812006-06-09 18:45:48 +00007728 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007729 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007731 if (PyTuple_Check(subobj)) {
7732 Py_ssize_t i;
7733 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7734 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7735 PyTuple_GET_ITEM(subobj, i));
7736 if (substring == NULL)
7737 return NULL;
7738 result = tailmatch(self, substring, start, end, -1);
7739 Py_DECREF(substring);
7740 if (result) {
7741 Py_RETURN_TRUE;
7742 }
7743 }
7744 /* nothing matched */
7745 Py_RETURN_FALSE;
7746 }
7747 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007749 return NULL;
7750 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007752 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753}
7754
7755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007756PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007757"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007759Return True if S ends with the specified suffix, False otherwise.\n\
7760With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007761With optional end, stop comparing S at that position.\n\
7762suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763
7764static PyObject *
7765unicode_endswith(PyUnicodeObject *self,
7766 PyObject *args)
7767{
Georg Brandl24250812006-06-09 18:45:48 +00007768 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007770 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007771 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007772 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773
Georg Brandl24250812006-06-09 18:45:48 +00007774 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7775 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007777 if (PyTuple_Check(subobj)) {
7778 Py_ssize_t i;
7779 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7780 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7781 PyTuple_GET_ITEM(subobj, i));
7782 if (substring == NULL)
7783 return NULL;
7784 result = tailmatch(self, substring, start, end, +1);
7785 Py_DECREF(substring);
7786 if (result) {
7787 Py_RETURN_TRUE;
7788 }
7789 }
7790 Py_RETURN_FALSE;
7791 }
7792 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
Georg Brandl24250812006-06-09 18:45:48 +00007796 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007798 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799}
7800
7801
Eric Smitha9f7d622008-02-17 19:46:49 +00007802/* Implements do_string_format, which is unicode because of stringlib */
7803#include "stringlib/string_format.h"
7804
7805PyDoc_STRVAR(format__doc__,
7806"S.format(*args, **kwargs) -> unicode\n\
7807\n\
7808");
7809
7810PyDoc_STRVAR(p_format__doc__,
7811"S.__format__(format_spec) -> unicode\n\
7812\n\
7813");
7814
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007815
7816static PyObject *
7817unicode_getnewargs(PyUnicodeObject *v)
7818{
7819 return Py_BuildValue("(u#)", v->str, v->length);
7820}
7821
7822
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823static PyMethodDef unicode_methods[] = {
7824
7825 /* Order is according to common usage: often used methods should
7826 appear first, since lookup is done sequentially. */
7827
Georg Brandlecdc0a92006-03-30 12:19:07 +00007828 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007829 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7830 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007831 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007832 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7833 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7834 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7835 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7836 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7837 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7838 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007839 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007840 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7841 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7842 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007843 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007844 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007845/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7846 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7847 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7848 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007849 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007850 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007851 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007852 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007853 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7854 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7855 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7856 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7857 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7858 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7859 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7860 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7861 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7862 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7863 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7864 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7865 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7866 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007867 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007868 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7869 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7870 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7871 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00007872#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007873 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874#endif
7875
7876#if 0
7877 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007878 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879#endif
7880
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007881 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 {NULL, NULL}
7883};
7884
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007885static PyObject *
7886unicode_mod(PyObject *v, PyObject *w)
7887{
7888 if (!PyUnicode_Check(v)) {
7889 Py_INCREF(Py_NotImplemented);
7890 return Py_NotImplemented;
7891 }
7892 return PyUnicode_Format(v, w);
7893}
7894
7895static PyNumberMethods unicode_as_number = {
7896 0, /*nb_add*/
7897 0, /*nb_subtract*/
7898 0, /*nb_multiply*/
7899 0, /*nb_divide*/
7900 unicode_mod, /*nb_remainder*/
7901};
7902
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007904 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007905 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007906 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7907 (ssizeargfunc) unicode_getitem, /* sq_item */
7908 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 0, /* sq_ass_item */
7910 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007911 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912};
7913
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007914static PyObject*
7915unicode_subscript(PyUnicodeObject* self, PyObject* item)
7916{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007917 if (PyIndex_Check(item)) {
7918 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007919 if (i == -1 && PyErr_Occurred())
7920 return NULL;
7921 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007922 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007923 return unicode_getitem(self, i);
7924 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007925 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007926 Py_UNICODE* source_buf;
7927 Py_UNICODE* result_buf;
7928 PyObject* result;
7929
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007930 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007931 &start, &stop, &step, &slicelength) < 0) {
7932 return NULL;
7933 }
7934
7935 if (slicelength <= 0) {
7936 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007937 } else if (start == 0 && step == 1 && slicelength == self->length &&
7938 PyUnicode_CheckExact(self)) {
7939 Py_INCREF(self);
7940 return (PyObject *)self;
7941 } else if (step == 1) {
7942 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007943 } else {
7944 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007945 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7946 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007947
7948 if (result_buf == NULL)
7949 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007950
7951 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7952 result_buf[i] = source_buf[cur];
7953 }
Tim Petersced69f82003-09-16 20:30:58 +00007954
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007955 result = PyUnicode_FromUnicode(result_buf, slicelength);
7956 PyMem_FREE(result_buf);
7957 return result;
7958 }
7959 } else {
7960 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7961 return NULL;
7962 }
7963}
7964
7965static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007966 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007967 (binaryfunc)unicode_subscript, /* mp_subscript */
7968 (objobjargproc)0, /* mp_ass_subscript */
7969};
7970
Martin v. Löwis18e16552006-02-15 17:27:45 +00007971static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007973 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 const void **ptr)
7975{
7976 if (index != 0) {
7977 PyErr_SetString(PyExc_SystemError,
7978 "accessing non-existent unicode segment");
7979 return -1;
7980 }
7981 *ptr = (void *) self->str;
7982 return PyUnicode_GET_DATA_SIZE(self);
7983}
7984
Martin v. Löwis18e16552006-02-15 17:27:45 +00007985static Py_ssize_t
7986unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 const void **ptr)
7988{
7989 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007990 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 return -1;
7992}
7993
7994static int
7995unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007996 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997{
7998 if (lenp)
7999 *lenp = PyUnicode_GET_DATA_SIZE(self);
8000 return 1;
8001}
8002
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008003static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008005 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 const void **ptr)
8007{
8008 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008009
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 if (index != 0) {
8011 PyErr_SetString(PyExc_SystemError,
8012 "accessing non-existent unicode segment");
8013 return -1;
8014 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008015 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 if (str == NULL)
8017 return -1;
8018 *ptr = (void *) PyString_AS_STRING(str);
8019 return PyString_GET_SIZE(str);
8020}
8021
/* Helpers for PyUnicode_Format() */
8023
8024static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008025getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008027 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 if (argidx < arglen) {
8029 (*p_argidx)++;
8030 if (arglen < 0)
8031 return args;
8032 else
8033 return PyTuple_GetItem(args, argidx);
8034 }
8035 PyErr_SetString(PyExc_TypeError,
8036 "not enough arguments for format string");
8037 return NULL;
8038}
8039
/* Conversion-flag bits accumulated by PyUnicode_Format() while it scans
   the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' (also set for a negative '*' width) */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
8045
Martin v. Löwis18e16552006-02-15 17:27:45 +00008046static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008047strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008049 register Py_ssize_t i;
8050 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 for (i = len - 1; i >= 0; i--)
8052 buffer[i] = (Py_UNICODE) charbuffer[i];
8053
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 return len;
8055}
8056
Neal Norwitzfc76d632006-01-10 06:03:13 +00008057static int
8058doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8059{
Tim Peters15231542006-02-16 01:08:01 +00008060 Py_ssize_t result;
8061
Neal Norwitzfc76d632006-01-10 06:03:13 +00008062 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008063 result = strtounicode(buffer, (char *)buffer);
8064 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008065}
8066
8067static int
8068longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8069{
Tim Peters15231542006-02-16 01:08:01 +00008070 Py_ssize_t result;
8071
Neal Norwitzfc76d632006-01-10 06:03:13 +00008072 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008073 result = strtounicode(buffer, (char *)buffer);
8074 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008075}
8076
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
8080
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081static int
8082formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008083 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 int flags,
8085 int prec,
8086 int type,
8087 PyObject *v)
8088{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008089 /* fmt = '%#.' + `prec` + `type`
8090 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 char fmt[20];
8092 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008093
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 x = PyFloat_AsDouble(v);
8095 if (x == -1.0 && PyErr_Occurred())
8096 return -1;
8097 if (prec < 0)
8098 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8100 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008101 /* Worst case length calc to ensure no buffer overrun:
8102
8103 'g' formats:
8104 fmt = %#.<prec>g
8105 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8106 for any double rep.)
8107 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8108
8109 'f' formats:
8110 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8111 len = 1 + 50 + 1 + prec = 52 + prec
8112
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008113 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008114 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008115
8116 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008117 if (((type == 'g' || type == 'G') &&
8118 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008119 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008120 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008121 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008122 return -1;
8123 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008124 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8125 (flags&F_ALT) ? "#" : "",
8126 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008127 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128}
8129
Tim Peters38fd5b62000-09-21 05:43:11 +00008130static PyObject*
8131formatlong(PyObject *val, int flags, int prec, int type)
8132{
8133 char *buf;
8134 int i, len;
8135 PyObject *str; /* temporary string object. */
8136 PyUnicodeObject *result;
8137
8138 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8139 if (!str)
8140 return NULL;
8141 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008142 if (!result) {
8143 Py_DECREF(str);
8144 return NULL;
8145 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008146 for (i = 0; i < len; i++)
8147 result->str[i] = buf[i];
8148 result->str[len] = 0;
8149 Py_DECREF(str);
8150 return (PyObject*)result;
8151}
8152
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153static int
8154formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008155 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 int flags,
8157 int prec,
8158 int type,
8159 PyObject *v)
8160{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008161 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008162 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8163 * + 1 + 1
8164 * = 24
8165 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008166 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008167 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 long x;
8169
8170 x = PyInt_AsLong(v);
8171 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008172 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008173 if (x < 0 && type == 'u') {
8174 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008175 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008176 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8177 sign = "-";
8178 else
8179 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008181 prec = 1;
8182
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008183 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8184 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008185 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008186 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008187 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008188 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008189 return -1;
8190 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008191
8192 if ((flags & F_ALT) &&
8193 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008194 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008195 * of issues that cause pain:
8196 * - when 0 is being converted, the C standard leaves off
8197 * the '0x' or '0X', which is inconsistent with other
8198 * %#x/%#X conversions and inconsistent with Python's
8199 * hex() function
8200 * - there are platforms that violate the standard and
8201 * convert 0 with the '0x' or '0X'
8202 * (Metrowerks, Compaq Tru64)
8203 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008204 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008205 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008206 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008207 * We can achieve the desired consistency by inserting our
8208 * own '0x' or '0X' prefix, and substituting %x/%X in place
8209 * of %#x/%#X.
8210 *
8211 * Note that this is the same approach as used in
8212 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008213 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008214 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8215 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008216 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008217 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008218 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8219 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008220 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008221 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008222 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008223 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008224 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008225 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226}
8227
8228static int
8229formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008230 size_t buflen,
8231 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008233 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008234 if (PyUnicode_Check(v)) {
8235 if (PyUnicode_GET_SIZE(v) != 1)
8236 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008240 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008241 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008242 goto onError;
8243 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245
8246 else {
8247 /* Integer input truncated to a character */
8248 long x;
8249 x = PyInt_AsLong(v);
8250 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008251 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008252#ifdef Py_UNICODE_WIDE
8253 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008254 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008255 "%c arg not in range(0x110000) "
8256 "(wide Python build)");
8257 return -1;
8258 }
8259#else
8260 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008261 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008262 "%c arg not in range(0x10000) "
8263 "(narrow Python build)");
8264 return -1;
8265 }
8266#endif
8267 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 }
8269 buf[1] = '\0';
8270 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008271
8272 onError:
8273 PyErr_SetString(PyExc_TypeError,
8274 "%c requires int or char");
8275 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276}
8277
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE characters) of the buffer in
   which the floats, ints, & chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8287
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288PyObject *PyUnicode_Format(PyObject *format,
8289 PyObject *args)
8290{
8291 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008292 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 int args_owned = 0;
8294 PyUnicodeObject *result = NULL;
8295 PyObject *dict = NULL;
8296 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008297
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 if (format == NULL || args == NULL) {
8299 PyErr_BadInternalCall();
8300 return NULL;
8301 }
8302 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008303 if (uformat == NULL)
8304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 fmt = PyUnicode_AS_UNICODE(uformat);
8306 fmtcnt = PyUnicode_GET_SIZE(uformat);
8307
8308 reslen = rescnt = fmtcnt + 100;
8309 result = _PyUnicode_New(reslen);
8310 if (result == NULL)
8311 goto onError;
8312 res = PyUnicode_AS_UNICODE(result);
8313
8314 if (PyTuple_Check(args)) {
8315 arglen = PyTuple_Size(args);
8316 argidx = 0;
8317 }
8318 else {
8319 arglen = -1;
8320 argidx = -2;
8321 }
Christian Heimese93237d2007-12-19 02:37:44 +00008322 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008323 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 dict = args;
8325
8326 while (--fmtcnt >= 0) {
8327 if (*fmt != '%') {
8328 if (--rescnt < 0) {
8329 rescnt = fmtcnt + 100;
8330 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008331 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008332 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8334 --rescnt;
8335 }
8336 *res++ = *fmt++;
8337 }
8338 else {
8339 /* Got a format specifier */
8340 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008341 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 Py_UNICODE c = '\0';
8344 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008345 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 PyObject *v = NULL;
8347 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008348 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008350 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008351 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352
8353 fmt++;
8354 if (*fmt == '(') {
8355 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008356 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357 PyObject *key;
8358 int pcount = 1;
8359
8360 if (dict == NULL) {
8361 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008362 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 goto onError;
8364 }
8365 ++fmt;
8366 --fmtcnt;
8367 keystart = fmt;
8368 /* Skip over balanced parentheses */
8369 while (pcount > 0 && --fmtcnt >= 0) {
8370 if (*fmt == ')')
8371 --pcount;
8372 else if (*fmt == '(')
8373 ++pcount;
8374 fmt++;
8375 }
8376 keylen = fmt - keystart - 1;
8377 if (fmtcnt < 0 || pcount > 0) {
8378 PyErr_SetString(PyExc_ValueError,
8379 "incomplete format key");
8380 goto onError;
8381 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008382#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008383 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384 then looked up since Python uses strings to hold
8385 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008386 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 key = PyUnicode_EncodeUTF8(keystart,
8388 keylen,
8389 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008390#else
8391 key = PyUnicode_FromUnicode(keystart, keylen);
8392#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 if (key == NULL)
8394 goto onError;
8395 if (args_owned) {
8396 Py_DECREF(args);
8397 args_owned = 0;
8398 }
8399 args = PyObject_GetItem(dict, key);
8400 Py_DECREF(key);
8401 if (args == NULL) {
8402 goto onError;
8403 }
8404 args_owned = 1;
8405 arglen = -1;
8406 argidx = -2;
8407 }
8408 while (--fmtcnt >= 0) {
8409 switch (c = *fmt++) {
8410 case '-': flags |= F_LJUST; continue;
8411 case '+': flags |= F_SIGN; continue;
8412 case ' ': flags |= F_BLANK; continue;
8413 case '#': flags |= F_ALT; continue;
8414 case '0': flags |= F_ZERO; continue;
8415 }
8416 break;
8417 }
8418 if (c == '*') {
8419 v = getnextarg(args, arglen, &argidx);
8420 if (v == NULL)
8421 goto onError;
8422 if (!PyInt_Check(v)) {
8423 PyErr_SetString(PyExc_TypeError,
8424 "* wants int");
8425 goto onError;
8426 }
8427 width = PyInt_AsLong(v);
8428 if (width < 0) {
8429 flags |= F_LJUST;
8430 width = -width;
8431 }
8432 if (--fmtcnt >= 0)
8433 c = *fmt++;
8434 }
8435 else if (c >= '0' && c <= '9') {
8436 width = c - '0';
8437 while (--fmtcnt >= 0) {
8438 c = *fmt++;
8439 if (c < '0' || c > '9')
8440 break;
8441 if ((width*10) / 10 != width) {
8442 PyErr_SetString(PyExc_ValueError,
8443 "width too big");
8444 goto onError;
8445 }
8446 width = width*10 + (c - '0');
8447 }
8448 }
8449 if (c == '.') {
8450 prec = 0;
8451 if (--fmtcnt >= 0)
8452 c = *fmt++;
8453 if (c == '*') {
8454 v = getnextarg(args, arglen, &argidx);
8455 if (v == NULL)
8456 goto onError;
8457 if (!PyInt_Check(v)) {
8458 PyErr_SetString(PyExc_TypeError,
8459 "* wants int");
8460 goto onError;
8461 }
8462 prec = PyInt_AsLong(v);
8463 if (prec < 0)
8464 prec = 0;
8465 if (--fmtcnt >= 0)
8466 c = *fmt++;
8467 }
8468 else if (c >= '0' && c <= '9') {
8469 prec = c - '0';
8470 while (--fmtcnt >= 0) {
8471 c = Py_CHARMASK(*fmt++);
8472 if (c < '0' || c > '9')
8473 break;
8474 if ((prec*10) / 10 != prec) {
8475 PyErr_SetString(PyExc_ValueError,
8476 "prec too big");
8477 goto onError;
8478 }
8479 prec = prec*10 + (c - '0');
8480 }
8481 }
8482 } /* prec */
8483 if (fmtcnt >= 0) {
8484 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 if (--fmtcnt >= 0)
8486 c = *fmt++;
8487 }
8488 }
8489 if (fmtcnt < 0) {
8490 PyErr_SetString(PyExc_ValueError,
8491 "incomplete format");
8492 goto onError;
8493 }
8494 if (c != '%') {
8495 v = getnextarg(args, arglen, &argidx);
8496 if (v == NULL)
8497 goto onError;
8498 }
8499 sign = 0;
8500 fill = ' ';
8501 switch (c) {
8502
8503 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008504 pbuf = formatbuf;
8505 /* presume that buffer length is at least 1 */
8506 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 len = 1;
8508 break;
8509
8510 case 's':
8511 case 'r':
8512 if (PyUnicode_Check(v) && c == 's') {
8513 temp = v;
8514 Py_INCREF(temp);
8515 }
8516 else {
8517 PyObject *unicode;
8518 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008519 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 else
8521 temp = PyObject_Repr(v);
8522 if (temp == NULL)
8523 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008524 if (PyUnicode_Check(temp))
8525 /* nothing to do */;
8526 else if (PyString_Check(temp)) {
8527 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008528 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008530 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008532 Py_DECREF(temp);
8533 temp = unicode;
8534 if (temp == NULL)
8535 goto onError;
8536 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008537 else {
8538 Py_DECREF(temp);
8539 PyErr_SetString(PyExc_TypeError,
8540 "%s argument has non-string str()");
8541 goto onError;
8542 }
8543 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008544 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545 len = PyUnicode_GET_SIZE(temp);
8546 if (prec >= 0 && len > prec)
8547 len = prec;
8548 break;
8549
8550 case 'i':
8551 case 'd':
8552 case 'u':
8553 case 'o':
8554 case 'x':
8555 case 'X':
8556 if (c == 'i')
8557 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008558 isnumok = 0;
8559 if (PyNumber_Check(v)) {
8560 PyObject *iobj=NULL;
8561
8562 if (PyInt_Check(v) || (PyLong_Check(v))) {
8563 iobj = v;
8564 Py_INCREF(iobj);
8565 }
8566 else {
8567 iobj = PyNumber_Int(v);
8568 if (iobj==NULL) iobj = PyNumber_Long(v);
8569 }
8570 if (iobj!=NULL) {
8571 if (PyInt_Check(iobj)) {
8572 isnumok = 1;
8573 pbuf = formatbuf;
8574 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8575 flags, prec, c, iobj);
8576 Py_DECREF(iobj);
8577 if (len < 0)
8578 goto onError;
8579 sign = 1;
8580 }
8581 else if (PyLong_Check(iobj)) {
8582 isnumok = 1;
8583 temp = formatlong(iobj, flags, prec, c);
8584 Py_DECREF(iobj);
8585 if (!temp)
8586 goto onError;
8587 pbuf = PyUnicode_AS_UNICODE(temp);
8588 len = PyUnicode_GET_SIZE(temp);
8589 sign = 1;
8590 }
8591 else {
8592 Py_DECREF(iobj);
8593 }
8594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008596 if (!isnumok) {
8597 PyErr_Format(PyExc_TypeError,
8598 "%%%c format: a number is required, "
8599 "not %.200s", c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008600 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008601 }
8602 if (flags & F_ZERO)
8603 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 break;
8605
8606 case 'e':
8607 case 'E':
8608 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008609 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 case 'g':
8611 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008612 if (c == 'F')
8613 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008614 pbuf = formatbuf;
8615 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8616 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 if (len < 0)
8618 goto onError;
8619 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008620 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 fill = '0';
8622 break;
8623
8624 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008625 pbuf = formatbuf;
8626 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 if (len < 0)
8628 goto onError;
8629 break;
8630
8631 default:
8632 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008633 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008634 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008635 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008636 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008637 (Py_ssize_t)(fmt - 1 -
8638 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 goto onError;
8640 }
8641 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008642 if (*pbuf == '-' || *pbuf == '+') {
8643 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 len--;
8645 }
8646 else if (flags & F_SIGN)
8647 sign = '+';
8648 else if (flags & F_BLANK)
8649 sign = ' ';
8650 else
8651 sign = 0;
8652 }
8653 if (width < len)
8654 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008655 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 reslen -= rescnt;
8657 rescnt = width + fmtcnt + 100;
8658 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008659 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008660 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008661 PyErr_NoMemory();
8662 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008663 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008664 if (_PyUnicode_Resize(&result, reslen) < 0) {
8665 Py_XDECREF(temp);
8666 goto onError;
8667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 res = PyUnicode_AS_UNICODE(result)
8669 + reslen - rescnt;
8670 }
8671 if (sign) {
8672 if (fill != ' ')
8673 *res++ = sign;
8674 rescnt--;
8675 if (width > len)
8676 width--;
8677 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008678 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8679 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008680 assert(pbuf[1] == c);
8681 if (fill != ' ') {
8682 *res++ = *pbuf++;
8683 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008684 }
Tim Petersfff53252001-04-12 18:38:48 +00008685 rescnt -= 2;
8686 width -= 2;
8687 if (width < 0)
8688 width = 0;
8689 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 if (width > len && !(flags & F_LJUST)) {
8692 do {
8693 --rescnt;
8694 *res++ = fill;
8695 } while (--width > len);
8696 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008697 if (fill == ' ') {
8698 if (sign)
8699 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008700 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008701 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008702 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008703 *res++ = *pbuf++;
8704 *res++ = *pbuf++;
8705 }
8706 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008707 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 res += len;
8709 rescnt -= len;
8710 while (--width >= len) {
8711 --rescnt;
8712 *res++ = ' ';
8713 }
8714 if (dict && (argidx < arglen) && c != '%') {
8715 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008716 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008717 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718 goto onError;
8719 }
8720 Py_XDECREF(temp);
8721 } /* '%' */
8722 } /* until end */
8723 if (argidx < arglen && !dict) {
8724 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008725 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 goto onError;
8727 }
8728
Thomas Woutersa96affe2006-03-12 00:29:36 +00008729 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8730 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 if (args_owned) {
8732 Py_DECREF(args);
8733 }
8734 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 return (PyObject *)result;
8736
8737 onError:
8738 Py_XDECREF(result);
8739 Py_DECREF(uformat);
8740 if (args_owned) {
8741 Py_DECREF(args);
8742 }
8743 return NULL;
8744}
8745
8746static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008747 (readbufferproc) unicode_buffer_getreadbuf,
8748 (writebufferproc) unicode_buffer_getwritebuf,
8749 (segcountproc) unicode_buffer_getsegcount,
8750 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751};
8752
Jeremy Hylton938ace62002-07-17 16:30:39 +00008753static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008754unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8755
Tim Peters6d6c1a32001-08-02 04:15:00 +00008756static PyObject *
8757unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8758{
8759 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008760 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008761 char *encoding = NULL;
8762 char *errors = NULL;
8763
Guido van Rossume023fe02001-08-30 03:12:59 +00008764 if (type != &PyUnicode_Type)
8765 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008766 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8767 kwlist, &x, &encoding, &errors))
8768 return NULL;
8769 if (x == NULL)
8770 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008771 if (encoding == NULL && errors == NULL)
8772 return PyObject_Unicode(x);
8773 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008774 return PyUnicode_FromEncodedObject(x, encoding, errors);
8775}
8776
Guido van Rossume023fe02001-08-30 03:12:59 +00008777static PyObject *
8778unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8779{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008780 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008781 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008782
8783 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8784 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8785 if (tmp == NULL)
8786 return NULL;
8787 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008788 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008789 if (pnew == NULL) {
8790 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008791 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008792 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008793 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8794 if (pnew->str == NULL) {
8795 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008796 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008797 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008798 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008799 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008800 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8801 pnew->length = n;
8802 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008803 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008804 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008805}
8806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008807PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008808"unicode(string [, encoding[, errors]]) -> object\n\
8809\n\
8810Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008811encoding defaults to the current default string encoding.\n\
8812errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008813
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008815 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 "unicode", /* tp_name */
8817 sizeof(PyUnicodeObject), /* tp_size */
8818 0, /* tp_itemsize */
8819 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008820 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008822 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008824 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008825 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008826 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008828 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 (hashfunc) unicode_hash, /* tp_hash*/
8830 0, /* tp_call*/
8831 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008832 PyObject_GenericGetAttr, /* tp_getattro */
8833 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008835 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008836 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008837 unicode_doc, /* tp_doc */
8838 0, /* tp_traverse */
8839 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008840 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008841 0, /* tp_weaklistoffset */
8842 0, /* tp_iter */
8843 0, /* tp_iternext */
8844 unicode_methods, /* tp_methods */
8845 0, /* tp_members */
8846 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008847 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008848 0, /* tp_dict */
8849 0, /* tp_descr_get */
8850 0, /* tp_descr_set */
8851 0, /* tp_dictoffset */
8852 0, /* tp_init */
8853 0, /* tp_alloc */
8854 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008855 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856};
8857
8858/* Initialize the Unicode implementation */
8859
Thomas Wouters78890102000-07-22 19:25:51 +00008860void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008862 int i;
8863
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008864 /* XXX - move this array to unicodectype.c ? */
8865 Py_UNICODE linebreak[] = {
8866 0x000A, /* LINE FEED */
8867 0x000D, /* CARRIAGE RETURN */
8868 0x001C, /* FILE SEPARATOR */
8869 0x001D, /* GROUP SEPARATOR */
8870 0x001E, /* RECORD SEPARATOR */
8871 0x0085, /* NEXT LINE */
8872 0x2028, /* LINE SEPARATOR */
8873 0x2029, /* PARAGRAPH SEPARATOR */
8874 };
8875
Fred Drakee4315f52000-05-09 19:53:39 +00008876 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008877 free_list = NULL;
8878 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008880 if (!unicode_empty)
8881 return;
8882
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008883 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008884 for (i = 0; i < 256; i++)
8885 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008886 if (PyType_Ready(&PyUnicode_Type) < 0)
8887 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008888
8889 /* initialize the linebreak bloom filter */
8890 bloom_linebreak = make_bloom_mask(
8891 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8892 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008893
8894 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895}
8896
8897/* Finalize the Unicode implementation */
8898
Christian Heimes3b718a72008-02-14 12:47:33 +00008899int
8900PyUnicode_ClearFreeList(void)
8901{
8902 int freelist_size = numfree;
8903 PyUnicodeObject *u;
8904
8905 for (u = free_list; u != NULL;) {
8906 PyUnicodeObject *v = u;
8907 u = *(PyUnicodeObject **)u;
8908 if (v->str)
8909 PyMem_DEL(v->str);
8910 Py_XDECREF(v->defenc);
8911 PyObject_Del(v);
8912 numfree--;
8913 }
8914 free_list = NULL;
8915 assert(numfree == 0);
8916 return freelist_size;
8917}
8918
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919void
Thomas Wouters78890102000-07-22 19:25:51 +00008920_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008922 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008924 Py_XDECREF(unicode_empty);
8925 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008926
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008927 for (i = 0; i < 256; i++) {
8928 if (unicode_latin1[i]) {
8929 Py_DECREF(unicode_latin1[i]);
8930 unicode_latin1[i] = NULL;
8931 }
8932 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008933 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008935
Anthony Baxterac6bd462006-04-13 02:06:09 +00008936#ifdef __cplusplus
8937}
8938#endif
8939
8940
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008941/*
8942Local variables:
8943c-basic-offset: 4
8944indent-tabs-mode: nil
8945End:
8946*/