blob: 0dca976ab4fafca03bd7ac0ed17e1c1f873985b2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Eric Smitha9f7d622008-02-17 19:46:49 +000045#include "formatter_unicode.h"
46
Guido van Rossumd57fd912000-03-10 22:53:23 +000047#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
Christian Heimes5b970ad2008-02-06 13:33:44 +000056#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Christian Heimes5b970ad2008-02-06 13:33:44 +000064 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Anthony Baxterac6bd462006-04-13 02:06:09 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Guido van Rossumd57fd912000-03-10 22:53:23 +000097/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000098static PyUnicodeObject *free_list;
99static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000101/* The empty Unicode object is shared to improve performance. */
102static PyUnicodeObject *unicode_empty;
103
104/* Single character Unicode strings in the Latin-1 range are being
105 shared as well. */
106static PyUnicodeObject *unicode_latin1[256];
107
Fred Drakee4315f52000-05-09 19:53:39 +0000108/* Default encoding to use and assume when NULL is passed as encoding
109 parameter; it is initialized by _PyUnicode_Init().
110
111 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000112 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000113
114*/
Fred Drakee4315f52000-05-09 19:53:39 +0000115static char unicode_default_encoding[100];
116
Christian Heimes4d4f2702008-01-30 11:32:37 +0000117/* Fast detection of the most frequent whitespace characters */
/* 128-entry lookup table indexed by code point: entry is 1 when the ASCII
   character is treated as whitespace, 0 otherwise.  The annotations use
   C89 block comments so the table compiles with pre-C99 compilers. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x0009: HORIZONTAL TABULATION */
    /* 0x000A: LINE FEED */
    /* 0x000B: VERTICAL TABULATION */
    /* 0x000C: FORM FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    /* 0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
148/* Same for linebreaks */
/* 128-entry lookup table indexed by code point: entry is 1 when the ASCII
   character is a line break, 0 otherwise.  The annotations use C89 block
   comments so the table compiles with pre-C99 compilers. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x000A: LINE FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000179 return 0x10FFFF;
180#else
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
189/* stuff to implement simple "bloom filters" for Unicode characters.
190 to keep things simple, we use a single bitmask, using the least 5
191 bits from each unicode characters as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
/* Integer type that holds the 32-bit bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero when the bit selected by the low five bits of `ch` is set in
   `mask`.  `mask` is parenthesized so that any expression may be passed,
   and the shift is done on an unsigned long so that selecting bit 31 is
   well defined. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: direct table lookup for ASCII, otherwise the bloom
   mask is used as a cheap pre-filter before Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000205Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000206{
207 /* calculate simple bloom-style bitmask for a given unicode string */
208
209 long mask;
210 Py_ssize_t i;
211
212 mask = 0;
213 for (i = 0; i < len; i++)
214 mask |= (1 << (ptr[i] & 0x1F));
215
216 return mask;
217}
218
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000219Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000220{
221 Py_ssize_t i;
222
223 for (i = 0; i < setlen; i++)
224 if (set[i] == chr)
225 return 1;
226
Fredrik Lundh77633512006-05-23 19:47:35 +0000227 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000228}
229
/* True when `chr` is in `set`: the bloom mask is a cheap pre-filter and
   unicode_member() confirms the hit.  The expansion is fully parenthesized
   so the macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233/* --- Unicode Object ----------------------------------------------------- */
234
235static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000236int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000240
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245 /* Resizing shared object (unicode_empty or single character
246 objects) in-place is not allowed. Use PyUnicode_Resize()
247 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000248
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000249 if (unicode == unicode_empty ||
250 (unicode->length == 1 &&
251 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 return -1;
256 }
257
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000258 /* We allocate one more byte to make sure the string is Ux0000 terminated.
259 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000260 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000261 it contains). */
262
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 oldstr = unicode->str;
264 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
265 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000266 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 PyErr_NoMemory();
268 return -1;
269 }
270 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000271 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000273 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000275 if (unicode->defenc) {
276 Py_DECREF(unicode->defenc);
277 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000280
Guido van Rossumd57fd912000-03-10 22:53:23 +0000281 return 0;
282}
283
/* We allocate one extra Py_UNICODE element (not just one byte) to make
   sure the string is U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
291
292static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000293PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
295 register PyUnicodeObject *unicode;
296
Andrew Dalkee0df7622006-05-27 11:04:36 +0000297 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 if (length == 0 && unicode_empty != NULL) {
299 Py_INCREF(unicode_empty);
300 return unicode_empty;
301 }
302
303 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000304 if (free_list) {
305 unicode = free_list;
306 free_list = *(PyUnicodeObject **)unicode;
307 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000309 /* Keep-Alive optimization: we only upsize the buffer,
310 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000311 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000312 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000314 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 }
316 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000317 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000319 }
320 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 }
322 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000323 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode == NULL)
325 return NULL;
326 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
327 }
328
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000329 if (!unicode->str) {
330 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000331 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000332 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000333 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000334 * the caller fails before initializing str -- unicode_resize()
335 * reads str[0], and the Keep-Alive optimization can keep memory
336 * allocated for str alive across a call to unicode_dealloc(unicode).
337 * We don't want unicode_resize to read uninitialized memory in
338 * that case.
339 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000342 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000344 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000346
347 onError:
348 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000349 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351}
352
353static
Guido van Rossum9475a232001-10-05 20:51:39 +0000354void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000356 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000357 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000358 /* Keep-Alive optimization */
359 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000360 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->str = NULL;
362 unicode->length = 0;
363 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 if (unicode->defenc) {
365 Py_DECREF(unicode->defenc);
366 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 }
368 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000369 *(PyUnicodeObject **)unicode = free_list;
370 free_list = unicode;
371 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 }
373 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000374 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000376 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 }
378}
379
Martin v. Löwis18e16552006-02-15 17:27:45 +0000380int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000381{
382 register PyUnicodeObject *v;
383
384 /* Argument checks */
385 if (unicode == NULL) {
386 PyErr_BadInternalCall();
387 return -1;
388 }
389 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000390 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 PyErr_BadInternalCall();
392 return -1;
393 }
394
395 /* Resizing unicode_empty and single character objects is not
396 possible since these are being shared. We simply return a fresh
397 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000398 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000399 (v == unicode_empty || v->length == 1)) {
400 PyUnicodeObject *w = _PyUnicode_New(length);
401 if (w == NULL)
402 return -1;
403 Py_UNICODE_COPY(w->str, v->str,
404 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000405 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000406 *unicode = (PyObject *)w;
407 return 0;
408 }
409
410 /* Note that we don't have to modify *unicode for unshared Unicode
411 objects, since we can modify them in-place. */
412 return unicode_resize(v, length);
413}
414
415/* Internal API for use in unicodeobject.c only ! */
416#define _PyUnicode_Resize(unicodevar, length) \
417 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
418
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000420 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421{
422 PyUnicodeObject *unicode;
423
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 /* If the Unicode data is known at construction time, we can apply
425 some optimizations which share commonly used objects. */
426 if (u != NULL) {
427
428 /* Optimization for empty strings */
429 if (size == 0 && unicode_empty != NULL) {
430 Py_INCREF(unicode_empty);
431 return (PyObject *)unicode_empty;
432 }
433
434 /* Single character Unicode objects in the Latin-1 range are
435 shared when using this constructor */
436 if (size == 1 && *u < 256) {
437 unicode = unicode_latin1[*u];
438 if (!unicode) {
439 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 if (!unicode)
441 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000442 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 unicode_latin1[*u] = unicode;
444 }
445 Py_INCREF(unicode);
446 return (PyObject *)unicode;
447 }
448 }
Tim Petersced69f82003-09-16 20:30:58 +0000449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450 unicode = _PyUnicode_New(size);
451 if (!unicode)
452 return NULL;
453
454 /* Copy the Unicode data into the new object */
455 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000456 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 return (PyObject *)unicode;
459}
460
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000461PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
462{
463 PyUnicodeObject *unicode;
464 /* If the Unicode data is known at construction time, we can apply
465 some optimizations which share commonly used objects.
466 Also, this means the input must be UTF-8, so fall back to the
467 UTF-8 decoder at the end. */
468 if (u != NULL) {
469
470 /* Optimization for empty strings */
471 if (size == 0 && unicode_empty != NULL) {
472 Py_INCREF(unicode_empty);
473 return (PyObject *)unicode_empty;
474 }
475
476 /* Single characters are shared when using this constructor.
477 Restrict to ASCII, since the input must be UTF-8. */
478 if (size == 1 && Py_CHARMASK(*u) < 128) {
479 unicode = unicode_latin1[Py_CHARMASK(*u)];
480 if (!unicode) {
481 unicode = _PyUnicode_New(1);
482 if (!unicode)
483 return NULL;
484 unicode->str[0] = Py_CHARMASK(*u);
485 unicode_latin1[Py_CHARMASK(*u)] = unicode;
486 }
487 Py_INCREF(unicode);
488 return (PyObject *)unicode;
489 }
490
491 return PyUnicode_DecodeUTF8(u, size, NULL);
492 }
493
494 unicode = _PyUnicode_New(size);
495 if (!unicode)
496 return NULL;
497
498 return (PyObject *)unicode;
499}
500
501PyObject *PyUnicode_FromString(const char *u)
502{
503 size_t size = strlen(u);
504 if (size > PY_SSIZE_T_MAX) {
505 PyErr_SetString(PyExc_OverflowError, "input too long");
506 return NULL;
507 }
508
509 return PyUnicode_FromStringAndSize(u, size);
510}
511
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512#ifdef HAVE_WCHAR_H
513
514PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000515 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516{
517 PyUnicodeObject *unicode;
518
519 if (w == NULL) {
520 PyErr_BadInternalCall();
521 return NULL;
522 }
523
524 unicode = _PyUnicode_New(size);
525 if (!unicode)
526 return NULL;
527
528 /* Copy the wchar_t data into the new object */
529#ifdef HAVE_USABLE_WCHAR_T
530 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000531#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532 {
533 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000534 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000536 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537 *u++ = *w++;
538 }
539#endif
540
541 return (PyObject *)unicode;
542}
543
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000544static void
545makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
546{
547 *fmt++ = '%';
548 if (width) {
549 if (zeropad)
550 *fmt++ = '0';
551 fmt += sprintf(fmt, "%d", width);
552 }
553 if (precision)
554 fmt += sprintf(fmt, ".%d", precision);
555 if (longflag)
556 *fmt++ = 'l';
557 else if (size_tflag) {
558 char *f = PY_FORMAT_SIZE_T;
559 while (*f)
560 *fmt++ = *f++;
561 }
562 *fmt++ = c;
563 *fmt = '\0';
564}
565
566#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
567
568PyObject *
569PyUnicode_FromFormatV(const char *format, va_list vargs)
570{
571 va_list count;
572 Py_ssize_t callcount = 0;
573 PyObject **callresults = NULL;
574 PyObject **callresult = NULL;
575 Py_ssize_t n = 0;
576 int width = 0;
577 int precision = 0;
578 int zeropad;
579 const char* f;
580 Py_UNICODE *s;
581 PyObject *string;
582 /* used by sprintf */
583 char buffer[21];
584 /* use abuffer instead of buffer, if we need more space
585 * (which can happen if there's a format specifier with width). */
586 char *abuffer = NULL;
587 char *realbuffer;
588 Py_ssize_t abuffersize = 0;
589 char fmt[60]; /* should be enough for %0width.precisionld */
590 const char *copy;
591
592#ifdef VA_LIST_IS_ARRAY
593 Py_MEMCPY(count, vargs, sizeof(va_list));
594#else
595#ifdef __va_copy
596 __va_copy(count, vargs);
597#else
598 count = vargs;
599#endif
600#endif
601 /* step 1: count the number of %S/%R format specifications
602 * (we call PyObject_Str()/PyObject_Repr() for these objects
603 * once during step 3 and put the result in an array) */
604 for (f = format; *f; f++) {
605 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
606 ++callcount;
607 }
608 /* step 2: allocate memory for the results of
609 * PyObject_Str()/PyObject_Repr() calls */
610 if (callcount) {
611 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
612 if (!callresults) {
613 PyErr_NoMemory();
614 return NULL;
615 }
616 callresult = callresults;
617 }
618 /* step 3: figure out how large a buffer we need */
619 for (f = format; *f; f++) {
620 if (*f == '%') {
621 const char* p = f;
622 width = 0;
623 while (isdigit(*f))
624 width = (width*10) + *f++ - '0';
625 while (*++f && *f != '%' && !isalpha(*f))
626 ;
627
628 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
629 * they don't affect the amount of space we reserve.
630 */
631 if ((*f == 'l' || *f == 'z') &&
632 (f[1] == 'd' || f[1] == 'u'))
633 ++f;
634
635 switch (*f) {
636 case 'c':
637 (void)va_arg(count, int);
638 /* fall through... */
639 case '%':
640 n++;
641 break;
642 case 'd': case 'u': case 'i': case 'x':
643 (void) va_arg(count, int);
644 /* 20 bytes is enough to hold a 64-bit
645 integer. Decimal takes the most space.
646 This isn't enough for octal.
647 If a width is specified we need more
648 (which we allocate later). */
649 if (width < 20)
650 width = 20;
651 n += width;
652 if (abuffersize < width)
653 abuffersize = width;
654 break;
655 case 's':
656 {
657 /* UTF-8 */
658 unsigned char*s;
659 s = va_arg(count, unsigned char*);
660 while (*s) {
661 if (*s < 128) {
662 n++; s++;
663 } else if (*s < 0xc0) {
664 /* invalid UTF-8 */
665 n++; s++;
666 } else if (*s < 0xc0) {
667 n++;
668 s++; if(!*s)break;
669 s++;
670 } else if (*s < 0xe0) {
671 n++;
672 s++; if(!*s)break;
673 s++; if(!*s)break;
674 s++;
675 } else {
676 #ifdef Py_UNICODE_WIDE
677 n++;
678 #else
679 n+=2;
680 #endif
681 s++; if(!*s)break;
682 s++; if(!*s)break;
683 s++; if(!*s)break;
684 s++;
685 }
686 }
687 break;
688 }
689 case 'U':
690 {
691 PyObject *obj = va_arg(count, PyObject *);
692 assert(obj && PyUnicode_Check(obj));
693 n += PyUnicode_GET_SIZE(obj);
694 break;
695 }
696 case 'V':
697 {
698 PyObject *obj = va_arg(count, PyObject *);
699 const char *str = va_arg(count, const char *);
700 assert(obj || str);
701 assert(!obj || PyUnicode_Check(obj));
702 if (obj)
703 n += PyUnicode_GET_SIZE(obj);
704 else
705 n += strlen(str);
706 break;
707 }
708 case 'S':
709 {
710 PyObject *obj = va_arg(count, PyObject *);
711 PyObject *str;
712 assert(obj);
713 str = PyObject_Str(obj);
714 if (!str)
715 goto fail;
716 n += PyUnicode_GET_SIZE(str);
717 /* Remember the str and switch to the next slot */
718 *callresult++ = str;
719 break;
720 }
721 case 'R':
722 {
723 PyObject *obj = va_arg(count, PyObject *);
724 PyObject *repr;
725 assert(obj);
726 repr = PyObject_Repr(obj);
727 if (!repr)
728 goto fail;
729 n += PyUnicode_GET_SIZE(repr);
730 /* Remember the repr and switch to the next slot */
731 *callresult++ = repr;
732 break;
733 }
734 case 'p':
735 (void) va_arg(count, int);
736 /* maximum 64-bit pointer representation:
737 * 0xffffffffffffffff
738 * so 19 characters is enough.
739 * XXX I count 18 -- what's the extra for?
740 */
741 n += 19;
742 break;
743 default:
744 /* if we stumble upon an unknown
745 formatting code, copy the rest of
746 the format string to the output
747 string. (we cannot just skip the
748 code, since there's no way to know
749 what's in the argument list) */
750 n += strlen(p);
751 goto expand;
752 }
753 } else
754 n++;
755 }
756 expand:
757 if (abuffersize > 20) {
758 abuffer = PyMem_Malloc(abuffersize);
759 if (!abuffer) {
760 PyErr_NoMemory();
761 goto fail;
762 }
763 realbuffer = abuffer;
764 }
765 else
766 realbuffer = buffer;
767 /* step 4: fill the buffer */
768 /* Since we've analyzed how much space we need for the worst case,
769 we don't have to resize the string.
770 There can be no errors beyond this point. */
771 string = PyUnicode_FromUnicode(NULL, n);
772 if (!string)
773 goto fail;
774
775 s = PyUnicode_AS_UNICODE(string);
776 callresult = callresults;
777
778 for (f = format; *f; f++) {
779 if (*f == '%') {
780 const char* p = f++;
781 int longflag = 0;
782 int size_tflag = 0;
783 zeropad = (*f == '0');
784 /* parse the width.precision part */
785 width = 0;
786 while (isdigit(*f))
787 width = (width*10) + *f++ - '0';
788 precision = 0;
789 if (*f == '.') {
790 f++;
791 while (isdigit(*f))
792 precision = (precision*10) + *f++ - '0';
793 }
794 /* handle the long flag, but only for %ld and %lu.
795 others can be added when necessary. */
796 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
797 longflag = 1;
798 ++f;
799 }
800 /* handle the size_t flag. */
801 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
802 size_tflag = 1;
803 ++f;
804 }
805
806 switch (*f) {
807 case 'c':
808 *s++ = va_arg(vargs, int);
809 break;
810 case 'd':
811 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
812 if (longflag)
813 sprintf(realbuffer, fmt, va_arg(vargs, long));
814 else if (size_tflag)
815 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
816 else
817 sprintf(realbuffer, fmt, va_arg(vargs, int));
818 appendstring(realbuffer);
819 break;
820 case 'u':
821 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
822 if (longflag)
823 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
824 else if (size_tflag)
825 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
826 else
827 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
828 appendstring(realbuffer);
829 break;
830 case 'i':
831 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
832 sprintf(realbuffer, fmt, va_arg(vargs, int));
833 appendstring(realbuffer);
834 break;
835 case 'x':
836 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
837 sprintf(realbuffer, fmt, va_arg(vargs, int));
838 appendstring(realbuffer);
839 break;
840 case 's':
841 {
842 /* Parameter must be UTF-8 encoded.
843 In case of encoding errors, use
844 the replacement character. */
845 PyObject *u;
846 p = va_arg(vargs, char*);
847 u = PyUnicode_DecodeUTF8(p, strlen(p),
848 "replace");
849 if (!u)
850 goto fail;
851 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
852 PyUnicode_GET_SIZE(u));
853 s += PyUnicode_GET_SIZE(u);
854 Py_DECREF(u);
855 break;
856 }
857 case 'U':
858 {
859 PyObject *obj = va_arg(vargs, PyObject *);
860 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
861 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
862 s += size;
863 break;
864 }
865 case 'V':
866 {
867 PyObject *obj = va_arg(vargs, PyObject *);
868 const char *str = va_arg(vargs, const char *);
869 if (obj) {
870 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
871 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
872 s += size;
873 } else {
874 appendstring(str);
875 }
876 break;
877 }
878 case 'S':
879 case 'R':
880 {
881 Py_UNICODE *ucopy;
882 Py_ssize_t usize;
883 Py_ssize_t upos;
884 /* unused, since we already have the result */
885 (void) va_arg(vargs, PyObject *);
886 ucopy = PyUnicode_AS_UNICODE(*callresult);
887 usize = PyUnicode_GET_SIZE(*callresult);
888 for (upos = 0; upos<usize;)
889 *s++ = ucopy[upos++];
890 /* We're done with the unicode()/repr() => forget it */
891 Py_DECREF(*callresult);
892 /* switch to next unicode()/repr() result */
893 ++callresult;
894 break;
895 }
896 case 'p':
897 sprintf(buffer, "%p", va_arg(vargs, void*));
898 /* %p is ill-defined: ensure leading 0x. */
899 if (buffer[1] == 'X')
900 buffer[1] = 'x';
901 else if (buffer[1] != 'x') {
902 memmove(buffer+2, buffer, strlen(buffer)+1);
903 buffer[0] = '0';
904 buffer[1] = 'x';
905 }
906 appendstring(buffer);
907 break;
908 case '%':
909 *s++ = '%';
910 break;
911 default:
912 appendstring(p);
913 goto end;
914 }
915 } else
916 *s++ = *f;
917 }
918
919 end:
920 if (callresults)
921 PyMem_Free(callresults);
922 if (abuffer)
923 PyMem_Free(abuffer);
924 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
925 return string;
926 fail:
927 if (callresults) {
928 PyObject **callresult2 = callresults;
929 while (callresult2 < callresult) {
930 Py_DECREF(*callresult2);
931 ++callresult2;
932 }
933 PyMem_Free(callresults);
934 }
935 if (abuffer)
936 PyMem_Free(abuffer);
937 return NULL;
938}
939
940#undef appendstring
941
942PyObject *
943PyUnicode_FromFormat(const char *format, ...)
944{
945 PyObject* ret;
946 va_list vargs;
947
948#ifdef HAVE_STDARG_PROTOTYPES
949 va_start(vargs, format);
950#else
951 va_start(vargs);
952#endif
953 ret = PyUnicode_FromFormatV(format, vargs);
954 va_end(vargs);
955 return ret;
956}
957
Martin v. Löwis18e16552006-02-15 17:27:45 +0000958Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
959 wchar_t *w,
960 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961{
962 if (unicode == NULL) {
963 PyErr_BadInternalCall();
964 return -1;
965 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000966
967 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000968 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000969 size = PyUnicode_GET_SIZE(unicode) + 1;
970
Guido van Rossumd57fd912000-03-10 22:53:23 +0000971#ifdef HAVE_USABLE_WCHAR_T
972 memcpy(w, unicode->str, size * sizeof(wchar_t));
973#else
974 {
975 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000976 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000977 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000978 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979 *w++ = *u++;
980 }
981#endif
982
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000983 if (size > PyUnicode_GET_SIZE(unicode))
984 return PyUnicode_GET_SIZE(unicode);
985 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986 return size;
987}
988
989#endif
990
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000991PyObject *PyUnicode_FromOrdinal(int ordinal)
992{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000993 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000994
995#ifdef Py_UNICODE_WIDE
996 if (ordinal < 0 || ordinal > 0x10ffff) {
997 PyErr_SetString(PyExc_ValueError,
998 "unichr() arg not in range(0x110000) "
999 "(wide Python build)");
1000 return NULL;
1001 }
1002#else
1003 if (ordinal < 0 || ordinal > 0xffff) {
1004 PyErr_SetString(PyExc_ValueError,
1005 "unichr() arg not in range(0x10000) "
1006 "(narrow Python build)");
1007 return NULL;
1008 }
1009#endif
1010
Hye-Shik Chang40574832004-04-06 07:24:51 +00001011 s[0] = (Py_UNICODE)ordinal;
1012 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001013}
1014
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015PyObject *PyUnicode_FromObject(register PyObject *obj)
1016{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017 /* XXX Perhaps we should make this API an alias of
1018 PyObject_Unicode() instead ?! */
1019 if (PyUnicode_CheckExact(obj)) {
1020 Py_INCREF(obj);
1021 return obj;
1022 }
1023 if (PyUnicode_Check(obj)) {
1024 /* For a Unicode subtype that's not a Unicode object,
1025 return a true Unicode object with the same data. */
1026 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1027 PyUnicode_GET_SIZE(obj));
1028 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001029 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1030}
1031
1032PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1033 const char *encoding,
1034 const char *errors)
1035{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001036 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001037 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001038 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001039
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 if (obj == NULL) {
1041 PyErr_BadInternalCall();
1042 return NULL;
1043 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001044
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001045#if 0
1046 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001047 that no encodings is given and then redirect to
1048 PyObject_Unicode() which then applies the additional logic for
1049 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001050
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001051 NOTE: This API should really only be used for object which
1052 represent *encoded* Unicode !
1053
1054 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001055 if (PyUnicode_Check(obj)) {
1056 if (encoding) {
1057 PyErr_SetString(PyExc_TypeError,
1058 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001059 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001060 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001061 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001062 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063#else
1064 if (PyUnicode_Check(obj)) {
1065 PyErr_SetString(PyExc_TypeError,
1066 "decoding Unicode is not supported");
1067 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001068 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001069#endif
1070
1071 /* Coerce object */
1072 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001073 s = PyString_AS_STRING(obj);
1074 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001075 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001076 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1077 /* Overwrite the error message with something more useful in
1078 case of a TypeError. */
1079 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001080 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001081 "coercing to Unicode: need string or buffer, "
1082 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001083 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001084 goto onError;
1085 }
Tim Petersced69f82003-09-16 20:30:58 +00001086
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001087 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (len == 0) {
1089 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001090 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001094
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095 return v;
1096
1097 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099}
1100
1101PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001102 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 const char *encoding,
1104 const char *errors)
1105{
1106 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001107
1108 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001109 encoding = PyUnicode_GetDefaultEncoding();
1110
1111 /* Shortcuts for common default encodings */
1112 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001114 else if (strcmp(encoding, "latin-1") == 0)
1115 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001116#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1117 else if (strcmp(encoding, "mbcs") == 0)
1118 return PyUnicode_DecodeMBCS(s, size, errors);
1119#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001120 else if (strcmp(encoding, "ascii") == 0)
1121 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122
1123 /* Decode via the codec registry */
1124 buffer = PyBuffer_FromMemory((void *)s, size);
1125 if (buffer == NULL)
1126 goto onError;
1127 unicode = PyCodec_Decode(buffer, encoding, errors);
1128 if (unicode == NULL)
1129 goto onError;
1130 if (!PyUnicode_Check(unicode)) {
1131 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001132 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001133 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 Py_DECREF(unicode);
1135 goto onError;
1136 }
1137 Py_DECREF(buffer);
1138 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001139
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 onError:
1141 Py_XDECREF(buffer);
1142 return NULL;
1143}
1144
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001145PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1146 const char *encoding,
1147 const char *errors)
1148{
1149 PyObject *v;
1150
1151 if (!PyUnicode_Check(unicode)) {
1152 PyErr_BadArgument();
1153 goto onError;
1154 }
1155
1156 if (encoding == NULL)
1157 encoding = PyUnicode_GetDefaultEncoding();
1158
1159 /* Decode via the codec registry */
1160 v = PyCodec_Decode(unicode, encoding, errors);
1161 if (v == NULL)
1162 goto onError;
1163 return v;
1164
1165 onError:
1166 return NULL;
1167}
1168
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001170 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 const char *encoding,
1172 const char *errors)
1173{
1174 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001175
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 unicode = PyUnicode_FromUnicode(s, size);
1177 if (unicode == NULL)
1178 return NULL;
1179 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1180 Py_DECREF(unicode);
1181 return v;
1182}
1183
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001184PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1185 const char *encoding,
1186 const char *errors)
1187{
1188 PyObject *v;
1189
1190 if (!PyUnicode_Check(unicode)) {
1191 PyErr_BadArgument();
1192 goto onError;
1193 }
1194
1195 if (encoding == NULL)
1196 encoding = PyUnicode_GetDefaultEncoding();
1197
1198 /* Encode via the codec registry */
1199 v = PyCodec_Encode(unicode, encoding, errors);
1200 if (v == NULL)
1201 goto onError;
1202 return v;
1203
1204 onError:
1205 return NULL;
1206}
1207
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1209 const char *encoding,
1210 const char *errors)
1211{
1212 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_BadArgument();
1216 goto onError;
1217 }
Fred Drakee4315f52000-05-09 19:53:39 +00001218
Tim Petersced69f82003-09-16 20:30:58 +00001219 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001220 encoding = PyUnicode_GetDefaultEncoding();
1221
1222 /* Shortcuts for common default encodings */
1223 if (errors == NULL) {
1224 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001225 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001226 else if (strcmp(encoding, "latin-1") == 0)
1227 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001228#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1229 else if (strcmp(encoding, "mbcs") == 0)
1230 return PyUnicode_AsMBCSString(unicode);
1231#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001232 else if (strcmp(encoding, "ascii") == 0)
1233 return PyUnicode_AsASCIIString(unicode);
1234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235
1236 /* Encode via the codec registry */
1237 v = PyCodec_Encode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 if (!PyString_Check(v)) {
1241 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001242 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001243 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 Py_DECREF(v);
1245 goto onError;
1246 }
1247 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001248
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 onError:
1250 return NULL;
1251}
1252
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001253PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1254 const char *errors)
1255{
1256 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1257
1258 if (v)
1259 return v;
1260 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1261 if (v && errors == NULL)
1262 ((PyUnicodeObject *)unicode)->defenc = v;
1263 return v;
1264}
1265
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1267{
1268 if (!PyUnicode_Check(unicode)) {
1269 PyErr_BadArgument();
1270 goto onError;
1271 }
1272 return PyUnicode_AS_UNICODE(unicode);
1273
1274 onError:
1275 return NULL;
1276}
1277
Martin v. Löwis18e16552006-02-15 17:27:45 +00001278Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279{
1280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_BadArgument();
1282 goto onError;
1283 }
1284 return PyUnicode_GET_SIZE(unicode);
1285
1286 onError:
1287 return -1;
1288}
1289
/* Return the name of the current default encoding.  The returned pointer
   is the file-level unicode_default_encoding storage itself, not a copy. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
1294
1295int PyUnicode_SetDefaultEncoding(const char *encoding)
1296{
1297 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001298
Fred Drakee4315f52000-05-09 19:53:39 +00001299 /* Make sure the encoding is valid. As side effect, this also
1300 loads the encoding into the codec registry cache. */
1301 v = _PyCodec_Lookup(encoding);
1302 if (v == NULL)
1303 goto onError;
1304 Py_DECREF(v);
1305 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001306 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001307 sizeof(unicode_default_encoding));
1308 return 0;
1309
1310 onError:
1311 return -1;
1312}
1313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001314/* error handling callback helper:
1315 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001316 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 and adjust various state variables.
1318 return 0 on success, -1 on error
1319*/
1320
1321static
1322int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1323 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001324 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1325 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001326 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001328 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329
1330 PyObject *restuple = NULL;
1331 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001332 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1333 Py_ssize_t requiredsize;
1334 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001336 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 int res = -1;
1338
1339 if (*errorHandler == NULL) {
1340 *errorHandler = PyCodec_LookupError(errors);
1341 if (*errorHandler == NULL)
1342 goto onError;
1343 }
1344
1345 if (*exceptionObject == NULL) {
1346 *exceptionObject = PyUnicodeDecodeError_Create(
1347 encoding, input, insize, *startinpos, *endinpos, reason);
1348 if (*exceptionObject == NULL)
1349 goto onError;
1350 }
1351 else {
1352 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1353 goto onError;
1354 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1355 goto onError;
1356 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1357 goto onError;
1358 }
1359
1360 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1361 if (restuple == NULL)
1362 goto onError;
1363 if (!PyTuple_Check(restuple)) {
1364 PyErr_Format(PyExc_TypeError, &argparse[4]);
1365 goto onError;
1366 }
1367 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1368 goto onError;
1369 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001370 newpos = insize+newpos;
1371 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001372 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001373 goto onError;
1374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375
1376 /* need more space? (at least enough for what we
1377 have+the replacement+the rest of the string (starting
1378 at the new input position), so we won't have to check space
1379 when there are no errors in the rest of the string) */
1380 repptr = PyUnicode_AS_UNICODE(repunicode);
1381 repsize = PyUnicode_GET_SIZE(repunicode);
1382 requiredsize = *outpos + repsize + insize-newpos;
1383 if (requiredsize > outsize) {
1384 if (requiredsize<2*outsize)
1385 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001386 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 goto onError;
1388 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1389 }
1390 *endinpos = newpos;
1391 *inptr = input + newpos;
1392 Py_UNICODE_COPY(*outptr, repptr, repsize);
1393 *outptr += repsize;
1394 *outpos += repsize;
1395 /* we made it! */
1396 res = 0;
1397
1398 onError:
1399 Py_XDECREF(restuple);
1400 return res;
1401}
1402
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001403/* --- UTF-7 Codec -------------------------------------------------------- */
1404
1405/* see RFC2152 for details */
1406
/* Classification table for UTF-7, indexed by 7-bit character code
   (128 entries, laid out as 8 rows of 16). */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1425
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c has to be base64-encoded: anything outside 1..127,
   anything marked 1 in utf7_special, and - when the respective flag is
   non-zero - whitespace (2) and RFC2152 Set O (3) characters. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a base64 digit.  isalnum() is only defined for values
   representable as unsigned char (or EOF), and may report non-ASCII
   characters as alphanumeric depending on the locale, so restrict the
   call to 7-bit values.  The cast to unsigned long rejects both large
   and negative c without a "comparison is always true" warning. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')
/* Inverse of B64(): map a base64 digit to its 6-bit value.  The result is
   only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for as long as at least 6 of the `bits`
   pending low-order bits of ch remain; out and bits are updated. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Extract 16-bit units from the `bits` pending low-order bits of ch and
   store them through out.  Expects `errmsg` and the label `utf7Error` to
   exist at the point of expansion. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001468
/* Decode `size` bytes of UTF-7 data at `s` in one go.  This is
   PyUnicode_DecodeUTF7Stateful() called with a NULL `consumed` pointer. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
1475
/* Decode `size` bytes of UTF-7 data at `s` into a new unicode object.

   errors    error handling scheme name, passed on to
             unicode_decode_call_errorhandler() whenever malformed input
             is found
   consumed  may be NULL.  If NULL, input that ends inside a shift
             sequence is reported as an "unterminated shift sequence"
             error.  If non-NULL, that condition is not an error;
             *consumed receives the offset of the '+' that opened the
             unfinished sequence, or the offset reached in the input
             otherwise.

   Returns the decoded object, or NULL on failure. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;         /* start of input, for offset computations */
    Py_ssize_t startinpos;          /* offset where the current error region begins */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                  /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* write cursor into the result */
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a "+..." sequence */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* accumulator for decoded base64 bits */
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The result is allocated with one character per input byte up front
       and trimmed to the actual length at the end. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handler (below the loop) asks to resume inside the input. */
        restart:
        ch = *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* End of the shift sequence: explicit '-' or any character
                   that is not a base64 digit. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* NOTE(review): a '-' directly following the terminating
                       '-' is emitted here without advancing s, and shift
                       mode is re-entered, so that '-' is seen again on the
                       next iteration - confirm this is intended. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    /* The terminating character is itself ordinary output. */
                    *p++ = ch;
                }
            } else {
                /* Base64 digit: append its 6 bits and flush complete
                   16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts; used for error reporting
               and for *consumed. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" stands for a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* The handler updates s, p, outpos and possibly replaces `unicode`
           (it may resize the result). */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }

    /* Input ended inside a shift sequence and the caller did not ask for
       stateful decoding: report it. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                starts, size, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* The handler may have moved s back into the input. */
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the result to the number of characters actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1621
1622
1623PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001624 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625 int encodeSetO,
1626 int encodeWhiteSpace,
1627 const char *errors)
1628{
1629 PyObject *v;
1630 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001631 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001632 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001633 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001634 unsigned int bitsleft = 0;
1635 unsigned long charsleft = 0;
1636 char * out;
1637 char * start;
1638
1639 if (size == 0)
1640 return PyString_FromStringAndSize(NULL, 0);
1641
1642 v = PyString_FromStringAndSize(NULL, cbAllocated);
1643 if (v == NULL)
1644 return NULL;
1645
1646 start = out = PyString_AS_STRING(v);
1647 for (;i < size; ++i) {
1648 Py_UNICODE ch = s[i];
1649
1650 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001651 if (ch == '+') {
1652 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 *out++ = '-';
1654 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1655 charsleft = ch;
1656 bitsleft = 16;
1657 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001658 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001660 } else {
1661 *out++ = (char) ch;
1662 }
1663 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1665 *out++ = B64(charsleft << (6-bitsleft));
1666 charsleft = 0;
1667 bitsleft = 0;
1668 /* Characters not in the BASE64 set implicitly unshift the sequence
1669 so no '-' is required, except if the character is itself a '-' */
1670 if (B64CHAR(ch) || ch == '-') {
1671 *out++ = '-';
1672 }
1673 inShift = 0;
1674 *out++ = (char) ch;
1675 } else {
1676 bitsleft += 16;
1677 charsleft = (charsleft << 16) | ch;
1678 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1679
1680 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001681 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 or '-' then the shift sequence will be terminated implicitly and we
1683 don't have to insert a '-'. */
1684
1685 if (bitsleft == 0) {
1686 if (i + 1 < size) {
1687 Py_UNICODE ch2 = s[i+1];
1688
1689 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001690
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 } else if (B64CHAR(ch2) || ch2 == '-') {
1692 *out++ = '-';
1693 inShift = 0;
1694 } else {
1695 inShift = 0;
1696 }
1697
1698 }
1699 else {
1700 *out++ = '-';
1701 inShift = 0;
1702 }
1703 }
Tim Petersced69f82003-09-16 20:30:58 +00001704 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001705 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001707 if (bitsleft) {
1708 *out++= B64(charsleft << (6-bitsleft) );
1709 *out++ = '-';
1710 }
1711
Tim Peters5de98422002-04-27 18:44:32 +00001712 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001713 return v;
1714}
1715
1716#undef SPECIAL
1717#undef B64
1718#undef B64CHAR
1719#undef UB64
1720#undef ENCODE
1721#undef DECODE
1722
/* --- UTF-8 Codec -------------------------------------------------------- */
1724
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.

   Index the table with the lead byte taken as an unsigned value
   (0..255). */
static
char utf8_code_length[256] = {
    /* 0x00..0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: not a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six,
       0xFE..0xFF: not a valid prefix */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1746
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001748 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 const char *errors)
1750{
Walter Dörwald69652032004-09-07 20:24:22 +00001751 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1752}
1753
1754PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001755 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001756 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001757 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001761 Py_ssize_t startinpos;
1762 Py_ssize_t endinpos;
1763 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 const char *e;
1765 PyUnicodeObject *unicode;
1766 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 PyObject *errorHandler = NULL;
1769 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
1771 /* Note: size will always be longer than the resulting Unicode
1772 character count */
1773 unicode = _PyUnicode_New(size);
1774 if (!unicode)
1775 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001776 if (size == 0) {
1777 if (consumed)
1778 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781
1782 /* Unpack UTF-8 encoded data */
1783 p = unicode->str;
1784 e = s + size;
1785
1786 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001787 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788
1789 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001790 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791 s++;
1792 continue;
1793 }
1794
1795 n = utf8_code_length[ch];
1796
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001797 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001798 if (consumed)
1799 break;
1800 else {
1801 errmsg = "unexpected end of data";
1802 startinpos = s-starts;
1803 endinpos = size;
1804 goto utf8Error;
1805 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807
1808 switch (n) {
1809
1810 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001811 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 startinpos = s-starts;
1813 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001814 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815
1816 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
1819 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821
1822 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001823 if ((s[1] & 0xc0) != 0x80) {
1824 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 startinpos = s-starts;
1826 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001827 goto utf8Error;
1828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001830 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 startinpos = s-starts;
1832 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001833 errmsg = "illegal encoding";
1834 goto utf8Error;
1835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 break;
1839
1840 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001841 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001842 (s[2] & 0xc0) != 0x80) {
1843 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 startinpos = s-starts;
1845 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 goto utf8Error;
1847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001849 if (ch < 0x0800) {
1850 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001851 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001852
1853 XXX For wide builds (UCS-4) we should probably try
1854 to recombine the surrogates into a single code
1855 unit.
1856 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 startinpos = s-starts;
1859 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 goto utf8Error;
1861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001863 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001864 break;
1865
1866 case 4:
1867 if ((s[1] & 0xc0) != 0x80 ||
1868 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001869 (s[3] & 0xc0) != 0x80) {
1870 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 startinpos = s-starts;
1872 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001873 goto utf8Error;
1874 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001875 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1876 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1877 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001878 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001879 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001880 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001881 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001882 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001883 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 startinpos = s-starts;
1885 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001886 goto utf8Error;
1887 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001888#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001889 *p++ = (Py_UNICODE)ch;
1890#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001891 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001892
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001893 /* translate from 10000..10FFFF to 0..FFFF */
1894 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001895
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001896 /* high surrogate = top 10 bits added to D800 */
1897 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001898
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001899 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001900 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 break;
1903
1904 default:
1905 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 startinpos = s-starts;
1908 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001909 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910 }
1911 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001912 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001913
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001914 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 outpos = p-PyUnicode_AS_UNICODE(unicode);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "utf8", errmsg,
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&unicode, &outpos, &p))
1921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 }
Walter Dörwald69652032004-09-07 20:24:22 +00001923 if (consumed)
1924 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925
1926 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001927 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928 goto onError;
1929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001930 Py_XDECREF(errorHandler);
1931 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 return (PyObject *)unicode;
1933
1934onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 Py_XDECREF(errorHandler);
1936 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 Py_DECREF(unicode);
1938 return NULL;
1939}
1940
Tim Peters602f7402002-04-27 18:03:26 +00001941/* Allocation strategy: if the string is short, convert into a stack buffer
1942 and allocate exactly as much space needed at the end. Else allocate the
1943 maximum possible needed (4 result bytes per Unicode character), and return
1944 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001945*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001946PyObject *
1947PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001948 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001949 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950{
Tim Peters602f7402002-04-27 18:03:26 +00001951#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001952
Martin v. Löwis18e16552006-02-15 17:27:45 +00001953 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001954 PyObject *v; /* result string object */
1955 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001956 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001957 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001958 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001959
Tim Peters602f7402002-04-27 18:03:26 +00001960 assert(s != NULL);
1961 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962
Tim Peters602f7402002-04-27 18:03:26 +00001963 if (size <= MAX_SHORT_UNICHARS) {
1964 /* Write into the stack buffer; nallocated can't overflow.
1965 * At the end, we'll allocate exactly as much heap space as it
1966 * turns out we need.
1967 */
1968 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1969 v = NULL; /* will allocate after we're done */
1970 p = stackbuf;
1971 }
1972 else {
1973 /* Overallocate on the heap, and give the excess back at the end. */
1974 nallocated = size * 4;
1975 if (nallocated / 4 != size) /* overflow! */
1976 return PyErr_NoMemory();
1977 v = PyString_FromStringAndSize(NULL, nallocated);
1978 if (v == NULL)
1979 return NULL;
1980 p = PyString_AS_STRING(v);
1981 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001982
Tim Peters602f7402002-04-27 18:03:26 +00001983 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001984 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001985
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001986 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001987 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001989
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001991 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001992 *p++ = (char)(0xc0 | (ch >> 6));
1993 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001994 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001995 else {
Tim Peters602f7402002-04-27 18:03:26 +00001996 /* Encode UCS2 Unicode ordinals */
1997 if (ch < 0x10000) {
1998 /* Special case: check for high surrogate */
1999 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2000 Py_UCS4 ch2 = s[i];
2001 /* Check for low surrogate and combine the two to
2002 form a UCS4 value */
2003 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002004 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002005 i++;
2006 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002007 }
Tim Peters602f7402002-04-27 18:03:26 +00002008 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002010 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002011 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2012 *p++ = (char)(0x80 | (ch & 0x3f));
2013 continue;
2014 }
2015encodeUCS4:
2016 /* Encode UCS4 Unicode ordinals */
2017 *p++ = (char)(0xf0 | (ch >> 18));
2018 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2019 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2020 *p++ = (char)(0x80 | (ch & 0x3f));
2021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002023
Tim Peters602f7402002-04-27 18:03:26 +00002024 if (v == NULL) {
2025 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002026 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002027 assert(nneeded <= nallocated);
2028 v = PyString_FromStringAndSize(stackbuf, nneeded);
2029 }
2030 else {
2031 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002032 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002033 assert(nneeded <= nallocated);
2034 _PyString_Resize(&v, nneeded);
2035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002037
Tim Peters602f7402002-04-27 18:03:26 +00002038#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039}
2040
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2042{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 if (!PyUnicode_Check(unicode)) {
2044 PyErr_BadArgument();
2045 return NULL;
2046 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002047 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2048 PyUnicode_GET_SIZE(unicode),
2049 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050}
2051
/* --- UTF-32 Codec ------------------------------------------------------- */
2053
2054PyObject *
2055PyUnicode_DecodeUTF32(const char *s,
2056 Py_ssize_t size,
2057 const char *errors,
2058 int *byteorder)
2059{
2060 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2061}
2062
2063PyObject *
2064PyUnicode_DecodeUTF32Stateful(const char *s,
2065 Py_ssize_t size,
2066 const char *errors,
2067 int *byteorder,
2068 Py_ssize_t *consumed)
2069{
2070 const char *starts = s;
2071 Py_ssize_t startinpos;
2072 Py_ssize_t endinpos;
2073 Py_ssize_t outpos;
2074 PyUnicodeObject *unicode;
2075 Py_UNICODE *p;
2076#ifndef Py_UNICODE_WIDE
2077 int i, pairs;
2078#else
2079 const int pairs = 0;
2080#endif
2081 const unsigned char *q, *e;
2082 int bo = 0; /* assume native ordering by default */
2083 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002084 /* Offsets from q for retrieving bytes in the right order. */
2085#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2086 int iorder[] = {0, 1, 2, 3};
2087#else
2088 int iorder[] = {3, 2, 1, 0};
2089#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002090 PyObject *errorHandler = NULL;
2091 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002092 /* On narrow builds we split characters outside the BMP into two
2093 codepoints => count how much extra space we need. */
2094#ifndef Py_UNICODE_WIDE
2095 for (i = pairs = 0; i < size/4; i++)
2096 if (((Py_UCS4 *)s)[i] >= 0x10000)
2097 pairs++;
2098#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002099
2100 /* This might be one to much, because of a BOM */
2101 unicode = _PyUnicode_New((size+3)/4+pairs);
2102 if (!unicode)
2103 return NULL;
2104 if (size == 0)
2105 return (PyObject *)unicode;
2106
2107 /* Unpack UTF-32 encoded data */
2108 p = unicode->str;
2109 q = (unsigned char *)s;
2110 e = q + size;
2111
2112 if (byteorder)
2113 bo = *byteorder;
2114
2115 /* Check for BOM marks (U+FEFF) in the input and adjust current
2116 byte order setting accordingly. In native mode, the leading BOM
2117 mark is skipped, in all other modes, it is copied to the output
2118 stream as-is (giving a ZWNBSP character). */
2119 if (bo == 0) {
2120 if (size >= 4) {
2121 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2122 (q[iorder[1]] << 8) | q[iorder[0]];
2123#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2124 if (bom == 0x0000FEFF) {
2125 q += 4;
2126 bo = -1;
2127 }
2128 else if (bom == 0xFFFE0000) {
2129 q += 4;
2130 bo = 1;
2131 }
2132#else
2133 if (bom == 0x0000FEFF) {
2134 q += 4;
2135 bo = 1;
2136 }
2137 else if (bom == 0xFFFE0000) {
2138 q += 4;
2139 bo = -1;
2140 }
2141#endif
2142 }
2143 }
2144
2145 if (bo == -1) {
2146 /* force LE */
2147 iorder[0] = 0;
2148 iorder[1] = 1;
2149 iorder[2] = 2;
2150 iorder[3] = 3;
2151 }
2152 else if (bo == 1) {
2153 /* force BE */
2154 iorder[0] = 3;
2155 iorder[1] = 2;
2156 iorder[2] = 1;
2157 iorder[3] = 0;
2158 }
2159
2160 while (q < e) {
2161 Py_UCS4 ch;
2162 /* remaining bytes at the end? (size should be divisible by 4) */
2163 if (e-q<4) {
2164 if (consumed)
2165 break;
2166 errmsg = "truncated data";
2167 startinpos = ((const char *)q)-starts;
2168 endinpos = ((const char *)e)-starts;
2169 goto utf32Error;
2170 /* The remaining input chars are ignored if the callback
2171 chooses to skip the input */
2172 }
2173 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2174 (q[iorder[1]] << 8) | q[iorder[0]];
2175
2176 if (ch >= 0x110000)
2177 {
2178 errmsg = "codepoint not in range(0x110000)";
2179 startinpos = ((const char *)q)-starts;
2180 endinpos = startinpos+4;
2181 goto utf32Error;
2182 }
2183#ifndef Py_UNICODE_WIDE
2184 if (ch >= 0x10000)
2185 {
2186 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2187 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2188 }
2189 else
2190#endif
2191 *p++ = ch;
2192 q += 4;
2193 continue;
2194 utf32Error:
2195 outpos = p-PyUnicode_AS_UNICODE(unicode);
2196 if (unicode_decode_call_errorhandler(
2197 errors, &errorHandler,
2198 "utf32", errmsg,
2199 starts, size, &startinpos, &endinpos, &exc, &s,
2200 (PyObject **)&unicode, &outpos, &p))
2201 goto onError;
2202 }
2203
2204 if (byteorder)
2205 *byteorder = bo;
2206
2207 if (consumed)
2208 *consumed = (const char *)q-starts;
2209
2210 /* Adjust length */
2211 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2212 goto onError;
2213
2214 Py_XDECREF(errorHandler);
2215 Py_XDECREF(exc);
2216 return (PyObject *)unicode;
2217
2218onError:
2219 Py_DECREF(unicode);
2220 Py_XDECREF(errorHandler);
2221 Py_XDECREF(exc);
2222 return NULL;
2223}
2224
2225PyObject *
2226PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2227 Py_ssize_t size,
2228 const char *errors,
2229 int byteorder)
2230{
2231 PyObject *v;
2232 unsigned char *p;
2233#ifndef Py_UNICODE_WIDE
2234 int i, pairs;
2235#else
2236 const int pairs = 0;
2237#endif
2238 /* Offsets from p for storing byte pairs in the right order. */
2239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2240 int iorder[] = {0, 1, 2, 3};
2241#else
2242 int iorder[] = {3, 2, 1, 0};
2243#endif
2244
2245#define STORECHAR(CH) \
2246 do { \
2247 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2248 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2249 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2250 p[iorder[0]] = (CH) & 0xff; \
2251 p += 4; \
2252 } while(0)
2253
2254 /* In narrow builds we can output surrogate pairs as one codepoint,
2255 so we need less space. */
2256#ifndef Py_UNICODE_WIDE
2257 for (i = pairs = 0; i < size-1; i++)
2258 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2259 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2260 pairs++;
2261#endif
2262 v = PyString_FromStringAndSize(NULL,
2263 4 * (size - pairs + (byteorder == 0)));
2264 if (v == NULL)
2265 return NULL;
2266
2267 p = (unsigned char *)PyString_AS_STRING(v);
2268 if (byteorder == 0)
2269 STORECHAR(0xFEFF);
2270 if (size == 0)
2271 return v;
2272
2273 if (byteorder == -1) {
2274 /* force LE */
2275 iorder[0] = 0;
2276 iorder[1] = 1;
2277 iorder[2] = 2;
2278 iorder[3] = 3;
2279 }
2280 else if (byteorder == 1) {
2281 /* force BE */
2282 iorder[0] = 3;
2283 iorder[1] = 2;
2284 iorder[2] = 1;
2285 iorder[3] = 0;
2286 }
2287
2288 while (size-- > 0) {
2289 Py_UCS4 ch = *s++;
2290#ifndef Py_UNICODE_WIDE
2291 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2292 Py_UCS4 ch2 = *s;
2293 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2294 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2295 s++;
2296 size--;
2297 }
2298 }
2299#endif
2300 STORECHAR(ch);
2301 }
2302 return v;
2303#undef STORECHAR
2304}
2305
2306PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2307{
2308 if (!PyUnicode_Check(unicode)) {
2309 PyErr_BadArgument();
2310 return NULL;
2311 }
2312 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2313 PyUnicode_GET_SIZE(unicode),
2314 NULL,
2315 0);
2316}
2317
/* --- UTF-16 Codec ------------------------------------------------------- */
2319
Tim Peters772747b2001-08-09 22:21:55 +00002320PyObject *
2321PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002322 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002323 const char *errors,
2324 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325{
Walter Dörwald69652032004-09-07 20:24:22 +00002326 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2327}
2328
2329PyObject *
2330PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002331 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002332 const char *errors,
2333 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002334 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002335{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002336 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002337 Py_ssize_t startinpos;
2338 Py_ssize_t endinpos;
2339 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340 PyUnicodeObject *unicode;
2341 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002342 const unsigned char *q, *e;
2343 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002344 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002345 /* Offsets from q for retrieving byte pairs in the right order. */
2346#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2347 int ihi = 1, ilo = 0;
2348#else
2349 int ihi = 0, ilo = 1;
2350#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 PyObject *errorHandler = NULL;
2352 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353
2354 /* Note: size will always be longer than the resulting Unicode
2355 character count */
2356 unicode = _PyUnicode_New(size);
2357 if (!unicode)
2358 return NULL;
2359 if (size == 0)
2360 return (PyObject *)unicode;
2361
2362 /* Unpack UTF-16 encoded data */
2363 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002364 q = (unsigned char *)s;
2365 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366
2367 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002368 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002370 /* Check for BOM marks (U+FEFF) in the input and adjust current
2371 byte order setting accordingly. In native mode, the leading BOM
2372 mark is skipped, in all other modes, it is copied to the output
2373 stream as-is (giving a ZWNBSP character). */
2374 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002375 if (size >= 2) {
2376 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002377#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002378 if (bom == 0xFEFF) {
2379 q += 2;
2380 bo = -1;
2381 }
2382 else if (bom == 0xFFFE) {
2383 q += 2;
2384 bo = 1;
2385 }
Tim Petersced69f82003-09-16 20:30:58 +00002386#else
Walter Dörwald69652032004-09-07 20:24:22 +00002387 if (bom == 0xFEFF) {
2388 q += 2;
2389 bo = 1;
2390 }
2391 else if (bom == 0xFFFE) {
2392 q += 2;
2393 bo = -1;
2394 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002395#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002396 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398
Tim Peters772747b2001-08-09 22:21:55 +00002399 if (bo == -1) {
2400 /* force LE */
2401 ihi = 1;
2402 ilo = 0;
2403 }
2404 else if (bo == 1) {
2405 /* force BE */
2406 ihi = 0;
2407 ilo = 1;
2408 }
2409
2410 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002411 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002412 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002413 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002414 if (consumed)
2415 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002416 errmsg = "truncated data";
2417 startinpos = ((const char *)q)-starts;
2418 endinpos = ((const char *)e)-starts;
2419 goto utf16Error;
2420 /* The remaining input chars are ignored if the callback
2421 chooses to skip the input */
2422 }
2423 ch = (q[ihi] << 8) | q[ilo];
2424
Tim Peters772747b2001-08-09 22:21:55 +00002425 q += 2;
2426
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427 if (ch < 0xD800 || ch > 0xDFFF) {
2428 *p++ = ch;
2429 continue;
2430 }
2431
2432 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002433 if (q >= e) {
2434 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002435 startinpos = (((const char *)q)-2)-starts;
2436 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002437 goto utf16Error;
2438 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002439 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002440 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2441 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002442 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002443#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002444 *p++ = ch;
2445 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002446#else
2447 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002448#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002449 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002450 }
2451 else {
2452 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002453 startinpos = (((const char *)q)-4)-starts;
2454 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002455 goto utf16Error;
2456 }
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002459 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002460 startinpos = (((const char *)q)-2)-starts;
2461 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002462 /* Fall through to report the error */
2463
2464 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002465 outpos = p-PyUnicode_AS_UNICODE(unicode);
2466 if (unicode_decode_call_errorhandler(
2467 errors, &errorHandler,
2468 "utf16", errmsg,
2469 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2470 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002471 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 }
2473
2474 if (byteorder)
2475 *byteorder = bo;
2476
Walter Dörwald69652032004-09-07 20:24:22 +00002477 if (consumed)
2478 *consumed = (const char *)q-starts;
2479
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002481 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 goto onError;
2483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 Py_XDECREF(errorHandler);
2485 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 return (PyObject *)unicode;
2487
2488onError:
2489 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 Py_XDECREF(errorHandler);
2491 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 return NULL;
2493}
2494
Tim Peters772747b2001-08-09 22:21:55 +00002495PyObject *
2496PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002497 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002498 const char *errors,
2499 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500{
2501 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002502 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002503#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002504 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002505#else
2506 const int pairs = 0;
2507#endif
Tim Peters772747b2001-08-09 22:21:55 +00002508 /* Offsets from p for storing byte pairs in the right order. */
2509#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2510 int ihi = 1, ilo = 0;
2511#else
2512 int ihi = 0, ilo = 1;
2513#endif
2514
2515#define STORECHAR(CH) \
2516 do { \
2517 p[ihi] = ((CH) >> 8) & 0xff; \
2518 p[ilo] = (CH) & 0xff; \
2519 p += 2; \
2520 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002522#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002523 for (i = pairs = 0; i < size; i++)
2524 if (s[i] >= 0x10000)
2525 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002526#endif
Tim Petersced69f82003-09-16 20:30:58 +00002527 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002528 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 if (v == NULL)
2530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531
Tim Peters772747b2001-08-09 22:21:55 +00002532 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002534 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002535 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002536 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002537
2538 if (byteorder == -1) {
2539 /* force LE */
2540 ihi = 1;
2541 ilo = 0;
2542 }
2543 else if (byteorder == 1) {
2544 /* force BE */
2545 ihi = 0;
2546 ilo = 1;
2547 }
2548
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002549 while (size-- > 0) {
2550 Py_UNICODE ch = *s++;
2551 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002552#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002553 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002554 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2555 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002557#endif
Tim Peters772747b2001-08-09 22:21:55 +00002558 STORECHAR(ch);
2559 if (ch2)
2560 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002563#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564}
2565
2566PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2567{
2568 if (!PyUnicode_Check(unicode)) {
2569 PyErr_BadArgument();
2570 return NULL;
2571 }
2572 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2573 PyUnicode_GET_SIZE(unicode),
2574 NULL,
2575 0);
2576}
2577
2578/* --- Unicode Escape Codec ----------------------------------------------- */
2579
Fredrik Lundh06d12682001-01-24 07:59:11 +00002580static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002581
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002583 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 const char *errors)
2585{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002586 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002587 Py_ssize_t startinpos;
2588 Py_ssize_t endinpos;
2589 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002594 char* message;
2595 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 PyObject *errorHandler = NULL;
2597 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002598
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 /* Escaped strings will always be longer than the resulting
2600 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002601 length after conversion to the true value.
2602 (but if the error callback returns a long replacement string
2603 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 v = _PyUnicode_New(size);
2605 if (v == NULL)
2606 goto onError;
2607 if (size == 0)
2608 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002612
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 while (s < end) {
2614 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002615 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617
2618 /* Non-escape characters are interpreted as Unicode ordinals */
2619 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002620 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 continue;
2622 }
2623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 /* \ - Escapes */
2626 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002627 c = *s++;
2628 if (s > end)
2629 c = '\0'; /* Invalid after \ */
2630 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631
2632 /* \x escapes */
2633 case '\n': break;
2634 case '\\': *p++ = '\\'; break;
2635 case '\'': *p++ = '\''; break;
2636 case '\"': *p++ = '\"'; break;
2637 case 'b': *p++ = '\b'; break;
2638 case 'f': *p++ = '\014'; break; /* FF */
2639 case 't': *p++ = '\t'; break;
2640 case 'n': *p++ = '\n'; break;
2641 case 'r': *p++ = '\r'; break;
2642 case 'v': *p++ = '\013'; break; /* VT */
2643 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2644
2645 /* \OOO (octal) escapes */
2646 case '0': case '1': case '2': case '3':
2647 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002648 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002649 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002650 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002651 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002652 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002654 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655 break;
2656
Fredrik Lundhccc74732001-02-18 22:13:49 +00002657 /* hex escapes */
2658 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002660 digits = 2;
2661 message = "truncated \\xXX escape";
2662 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663
Fredrik Lundhccc74732001-02-18 22:13:49 +00002664 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002666 digits = 4;
2667 message = "truncated \\uXXXX escape";
2668 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669
Fredrik Lundhccc74732001-02-18 22:13:49 +00002670 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002671 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002672 digits = 8;
2673 message = "truncated \\UXXXXXXXX escape";
2674 hexescape:
2675 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002676 outpos = p-PyUnicode_AS_UNICODE(v);
2677 if (s+digits>end) {
2678 endinpos = size;
2679 if (unicode_decode_call_errorhandler(
2680 errors, &errorHandler,
2681 "unicodeescape", "end of string in escape sequence",
2682 starts, size, &startinpos, &endinpos, &exc, &s,
2683 (PyObject **)&v, &outpos, &p))
2684 goto onError;
2685 goto nextByte;
2686 }
2687 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002688 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002689 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 endinpos = (s+i+1)-starts;
2691 if (unicode_decode_call_errorhandler(
2692 errors, &errorHandler,
2693 "unicodeescape", message,
2694 starts, size, &startinpos, &endinpos, &exc, &s,
2695 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002696 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002697 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002698 }
2699 chr = (chr<<4) & ~0xF;
2700 if (c >= '0' && c <= '9')
2701 chr += c - '0';
2702 else if (c >= 'a' && c <= 'f')
2703 chr += 10 + c - 'a';
2704 else
2705 chr += 10 + c - 'A';
2706 }
2707 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002708 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 /* _decoding_error will have already written into the
2710 target buffer. */
2711 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002712 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002713 /* when we get here, chr is a 32-bit unicode character */
2714 if (chr <= 0xffff)
2715 /* UCS-2 character */
2716 *p++ = (Py_UNICODE) chr;
2717 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002718 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002719 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002720#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002721 *p++ = chr;
2722#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002723 chr -= 0x10000L;
2724 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002725 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002726#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002727 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 endinpos = s-starts;
2729 outpos = p-PyUnicode_AS_UNICODE(v);
2730 if (unicode_decode_call_errorhandler(
2731 errors, &errorHandler,
2732 "unicodeescape", "illegal Unicode character",
2733 starts, size, &startinpos, &endinpos, &exc, &s,
2734 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002735 goto onError;
2736 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002737 break;
2738
2739 /* \N{name} */
2740 case 'N':
2741 message = "malformed \\N character escape";
2742 if (ucnhash_CAPI == NULL) {
2743 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002744 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002745 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746 if (m == NULL)
2747 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002748 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002749 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002750 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002751 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002752 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002753 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754 if (ucnhash_CAPI == NULL)
2755 goto ucnhashError;
2756 }
2757 if (*s == '{') {
2758 const char *start = s+1;
2759 /* look for the closing brace */
2760 while (*s != '}' && s < end)
2761 s++;
2762 if (s > start && s < end && *s == '}') {
2763 /* found a name. look it up in the unicode database */
2764 message = "unknown Unicode character name";
2765 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002766 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002767 goto store;
2768 }
2769 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 endinpos = s-starts;
2771 outpos = p-PyUnicode_AS_UNICODE(v);
2772 if (unicode_decode_call_errorhandler(
2773 errors, &errorHandler,
2774 "unicodeescape", message,
2775 starts, size, &startinpos, &endinpos, &exc, &s,
2776 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002777 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002778 break;
2779
2780 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002781 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 message = "\\ at end of string";
2783 s--;
2784 endinpos = s-starts;
2785 outpos = p-PyUnicode_AS_UNICODE(v);
2786 if (unicode_decode_call_errorhandler(
2787 errors, &errorHandler,
2788 "unicodeescape", message,
2789 starts, size, &startinpos, &endinpos, &exc, &s,
2790 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002791 goto onError;
2792 }
2793 else {
2794 *p++ = '\\';
2795 *p++ = (unsigned char)s[-1];
2796 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002797 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 nextByte:
2800 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002802 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002804 Py_XDECREF(errorHandler);
2805 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002807
Fredrik Lundhccc74732001-02-18 22:13:49 +00002808ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002809 PyErr_SetString(
2810 PyExc_UnicodeError,
2811 "\\N escapes not supported (can't load unicodedata module)"
2812 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002813 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 Py_XDECREF(errorHandler);
2815 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002816 return NULL;
2817
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 Py_XDECREF(errorHandler);
2821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 return NULL;
2823}
2824
/* unicodeescape_string() (defined after the findchar() helper below)
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2831
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002832Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002833 Py_ssize_t size,
2834 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002835{
2836 /* like wcschr, but doesn't stop at NULL characters */
2837
2838 while (size-- > 0) {
2839 if (*s == ch)
2840 return s;
2841 s++;
2842 }
2843
2844 return NULL;
2845}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847static
2848PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 int quotes)
2851{
2852 PyObject *repr;
2853 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002855 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856
Neal Norwitz17753ec2006-08-21 22:21:19 +00002857 /* XXX(nnorwitz): rather than over-allocating, it would be
2858 better to choose a different scheme. Perhaps scan the
2859 first N-chars of the string and allocate based on that size.
2860 */
2861 /* Initial allocation is based on the longest-possible unichr
2862 escape.
2863
2864 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2865 unichr, so in this case it's the longest unichr escape. In
2866 narrow (UTF-16) builds this is five chars per source unichr
2867 since there are two unichrs in the surrogate pair, so in narrow
2868 (UTF-16) builds it's not the longest unichr escape.
2869
2870 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2871 so in the narrow (UTF-16) build case it's the longest unichr
2872 escape.
2873 */
2874
2875 repr = PyString_FromStringAndSize(NULL,
2876 2
2877#ifdef Py_UNICODE_WIDE
2878 + 10*size
2879#else
2880 + 6*size
2881#endif
2882 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 if (repr == NULL)
2884 return NULL;
2885
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002886 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887
2888 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002890 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 !findchar(s, size, '"')) ? '"' : '\'';
2892 }
2893 while (size-- > 0) {
2894 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002895
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002896 /* Escape quotes and backslashes */
2897 if ((quotes &&
2898 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899 *p++ = '\\';
2900 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002901 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002902 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002903
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002904#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002905 /* Map 21-bit characters to '\U00xxxxxx' */
2906 else if (ch >= 0x10000) {
2907 *p++ = '\\';
2908 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002909 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2910 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2911 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2912 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2913 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2914 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2915 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002916 *p++ = hexdigit[ch & 0x0000000F];
2917 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002918 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002919#else
2920 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002921 else if (ch >= 0xD800 && ch < 0xDC00) {
2922 Py_UNICODE ch2;
2923 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002924
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002925 ch2 = *s++;
2926 size--;
2927 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2928 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2929 *p++ = '\\';
2930 *p++ = 'U';
2931 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2932 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2933 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2934 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2935 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2936 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2937 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2938 *p++ = hexdigit[ucs & 0x0000000F];
2939 continue;
2940 }
2941 /* Fall through: isolated surrogates are copied as-is */
2942 s--;
2943 size++;
2944 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002945#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002946
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002948 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 *p++ = '\\';
2950 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002951 *p++ = hexdigit[(ch >> 12) & 0x000F];
2952 *p++ = hexdigit[(ch >> 8) & 0x000F];
2953 *p++ = hexdigit[(ch >> 4) & 0x000F];
2954 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002956
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002957 /* Map special whitespace to '\t', \n', '\r' */
2958 else if (ch == '\t') {
2959 *p++ = '\\';
2960 *p++ = 't';
2961 }
2962 else if (ch == '\n') {
2963 *p++ = '\\';
2964 *p++ = 'n';
2965 }
2966 else if (ch == '\r') {
2967 *p++ = '\\';
2968 *p++ = 'r';
2969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002970
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002971 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002972 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002974 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002975 *p++ = hexdigit[(ch >> 4) & 0x000F];
2976 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002977 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002978
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 /* Copy everything else as-is */
2980 else
2981 *p++ = (char) ch;
2982 }
2983 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002984 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985
2986 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002987 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 return repr;
2989}
2990
2991PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993{
2994 return unicodeescape_string(s, size, 0);
2995}
2996
2997PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2998{
2999 if (!PyUnicode_Check(unicode)) {
3000 PyErr_BadArgument();
3001 return NULL;
3002 }
3003 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3004 PyUnicode_GET_SIZE(unicode));
3005}
3006
3007/* --- Raw Unicode Escape Codec ------------------------------------------- */
3008
3009PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 const char *errors)
3012{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003014 Py_ssize_t startinpos;
3015 Py_ssize_t endinpos;
3016 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 const char *end;
3020 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 PyObject *errorHandler = NULL;
3022 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003023
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 /* Escaped strings will always be longer than the resulting
3025 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 length after conversion to the true value. (But decoding error
3027 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 v = _PyUnicode_New(size);
3029 if (v == NULL)
3030 goto onError;
3031 if (size == 0)
3032 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 end = s + size;
3035 while (s < end) {
3036 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003037 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003039 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040
3041 /* Non-escape characters are interpreted as Unicode ordinals */
3042 if (*s != '\\') {
3043 *p++ = (unsigned char)*s++;
3044 continue;
3045 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047
3048 /* \u-escapes are only interpreted iff the number of leading
3049 backslashes if odd */
3050 bs = s;
3051 for (;s < end;) {
3052 if (*s != '\\')
3053 break;
3054 *p++ = (unsigned char)*s++;
3055 }
3056 if (((s - bs) & 1) == 0 ||
3057 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003058 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 continue;
3060 }
3061 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 s++;
3064
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003065 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003067 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 endinpos = s-starts;
3071 if (unicode_decode_call_errorhandler(
3072 errors, &errorHandler,
3073 "rawunicodeescape", "truncated \\uXXXX",
3074 starts, size, &startinpos, &endinpos, &exc, &s,
3075 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 }
3079 x = (x<<4) & ~0xF;
3080 if (c >= '0' && c <= '9')
3081 x += c - '0';
3082 else if (c >= 'a' && c <= 'f')
3083 x += 10 + c - 'a';
3084 else
3085 x += 10 + c - 'A';
3086 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003087#ifndef Py_UNICODE_WIDE
3088 if (x > 0x10000) {
3089 if (unicode_decode_call_errorhandler(
3090 errors, &errorHandler,
3091 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3092 starts, size, &startinpos, &endinpos, &exc, &s,
3093 (PyObject **)&v, &outpos, &p))
3094 goto onError;
3095 }
3096#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 *p++ = x;
3098 nextByte:
3099 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003101 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 Py_XDECREF(errorHandler);
3104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003106
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 onError:
3108 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 Py_XDECREF(errorHandler);
3110 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 return NULL;
3112}
3113
3114PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003115 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116{
3117 PyObject *repr;
3118 char *p;
3119 char *q;
3120
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003121 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003123#ifdef Py_UNICODE_WIDE
3124 repr = PyString_FromStringAndSize(NULL, 10 * size);
3125#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003127#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 if (repr == NULL)
3129 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003130 if (size == 0)
3131 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132
3133 p = q = PyString_AS_STRING(repr);
3134 while (size-- > 0) {
3135 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003136#ifdef Py_UNICODE_WIDE
3137 /* Map 32-bit characters to '\Uxxxxxxxx' */
3138 if (ch >= 0x10000) {
3139 *p++ = '\\';
3140 *p++ = 'U';
3141 *p++ = hexdigit[(ch >> 28) & 0xf];
3142 *p++ = hexdigit[(ch >> 24) & 0xf];
3143 *p++ = hexdigit[(ch >> 20) & 0xf];
3144 *p++ = hexdigit[(ch >> 16) & 0xf];
3145 *p++ = hexdigit[(ch >> 12) & 0xf];
3146 *p++ = hexdigit[(ch >> 8) & 0xf];
3147 *p++ = hexdigit[(ch >> 4) & 0xf];
3148 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003149 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003150 else
3151#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 /* Map 16-bit characters to '\uxxxx' */
3153 if (ch >= 256) {
3154 *p++ = '\\';
3155 *p++ = 'u';
3156 *p++ = hexdigit[(ch >> 12) & 0xf];
3157 *p++ = hexdigit[(ch >> 8) & 0xf];
3158 *p++ = hexdigit[(ch >> 4) & 0xf];
3159 *p++ = hexdigit[ch & 15];
3160 }
3161 /* Copy everything else as-is */
3162 else
3163 *p++ = (char) ch;
3164 }
3165 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003166 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 return repr;
3168}
3169
3170PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3171{
3172 if (!PyUnicode_Check(unicode)) {
3173 PyErr_BadArgument();
3174 return NULL;
3175 }
3176 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3177 PyUnicode_GET_SIZE(unicode));
3178}
3179
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003180/* --- Unicode Internal Codec ------------------------------------------- */
3181
3182PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003183 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003184 const char *errors)
3185{
3186 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003187 Py_ssize_t startinpos;
3188 Py_ssize_t endinpos;
3189 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003190 PyUnicodeObject *v;
3191 Py_UNICODE *p;
3192 const char *end;
3193 const char *reason;
3194 PyObject *errorHandler = NULL;
3195 PyObject *exc = NULL;
3196
Neal Norwitzd43069c2006-01-08 01:12:10 +00003197#ifdef Py_UNICODE_WIDE
3198 Py_UNICODE unimax = PyUnicode_GetMax();
3199#endif
3200
Armin Rigo7ccbca92006-10-04 12:17:45 +00003201 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003202 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3203 if (v == NULL)
3204 goto onError;
3205 if (PyUnicode_GetSize((PyObject *)v) == 0)
3206 return (PyObject *)v;
3207 p = PyUnicode_AS_UNICODE(v);
3208 end = s + size;
3209
3210 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003211 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003212 /* We have to sanity check the raw data, otherwise doom looms for
3213 some malformed UCS-4 data. */
3214 if (
3215 #ifdef Py_UNICODE_WIDE
3216 *p > unimax || *p < 0 ||
3217 #endif
3218 end-s < Py_UNICODE_SIZE
3219 )
3220 {
3221 startinpos = s - starts;
3222 if (end-s < Py_UNICODE_SIZE) {
3223 endinpos = end-starts;
3224 reason = "truncated input";
3225 }
3226 else {
3227 endinpos = s - starts + Py_UNICODE_SIZE;
3228 reason = "illegal code point (> 0x10FFFF)";
3229 }
3230 outpos = p - PyUnicode_AS_UNICODE(v);
3231 if (unicode_decode_call_errorhandler(
3232 errors, &errorHandler,
3233 "unicode_internal", reason,
3234 starts, size, &startinpos, &endinpos, &exc, &s,
3235 (PyObject **)&v, &outpos, &p)) {
3236 goto onError;
3237 }
3238 }
3239 else {
3240 p++;
3241 s += Py_UNICODE_SIZE;
3242 }
3243 }
3244
Martin v. Löwis412fb672006-04-13 06:34:32 +00003245 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003246 goto onError;
3247 Py_XDECREF(errorHandler);
3248 Py_XDECREF(exc);
3249 return (PyObject *)v;
3250
3251 onError:
3252 Py_XDECREF(v);
3253 Py_XDECREF(errorHandler);
3254 Py_XDECREF(exc);
3255 return NULL;
3256}
3257
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258/* --- Latin-1 Codec ------------------------------------------------------ */
3259
3260PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003261 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 const char *errors)
3263{
3264 PyUnicodeObject *v;
3265 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003266
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003268 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003269 Py_UNICODE r = *(unsigned char*)s;
3270 return PyUnicode_FromUnicode(&r, 1);
3271 }
3272
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 v = _PyUnicode_New(size);
3274 if (v == NULL)
3275 goto onError;
3276 if (size == 0)
3277 return (PyObject *)v;
3278 p = PyUnicode_AS_UNICODE(v);
3279 while (size-- > 0)
3280 *p++ = (unsigned char)*s++;
3281 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003282
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 onError:
3284 Py_XDECREF(v);
3285 return NULL;
3286}
3287
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288/* create or adjust a UnicodeEncodeError */
3289static void make_encode_exception(PyObject **exceptionObject,
3290 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003291 const Py_UNICODE *unicode, Py_ssize_t size,
3292 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 if (*exceptionObject == NULL) {
3296 *exceptionObject = PyUnicodeEncodeError_Create(
3297 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 }
3299 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3301 goto onError;
3302 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3303 goto onError;
3304 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3305 goto onError;
3306 return;
3307 onError:
3308 Py_DECREF(*exceptionObject);
3309 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 }
3311}
3312
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313/* raises a UnicodeEncodeError */
3314static void raise_encode_exception(PyObject **exceptionObject,
3315 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003316 const Py_UNICODE *unicode, Py_ssize_t size,
3317 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 const char *reason)
3319{
3320 make_encode_exception(exceptionObject,
3321 encoding, unicode, size, startpos, endpos, reason);
3322 if (*exceptionObject != NULL)
3323 PyCodec_StrictErrors(*exceptionObject);
3324}
3325
3326/* error handling callback helper:
3327 build arguments, call the callback and check the arguments,
3328 put the result into newpos and return the replacement string, which
3329 has to be freed by the caller */
3330static PyObject *unicode_encode_call_errorhandler(const char *errors,
3331 PyObject **errorHandler,
3332 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3334 Py_ssize_t startpos, Py_ssize_t endpos,
3335 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003337 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338
3339 PyObject *restuple;
3340 PyObject *resunicode;
3341
3342 if (*errorHandler == NULL) {
3343 *errorHandler = PyCodec_LookupError(errors);
3344 if (*errorHandler == NULL)
3345 return NULL;
3346 }
3347
3348 make_encode_exception(exceptionObject,
3349 encoding, unicode, size, startpos, endpos, reason);
3350 if (*exceptionObject == NULL)
3351 return NULL;
3352
3353 restuple = PyObject_CallFunctionObjArgs(
3354 *errorHandler, *exceptionObject, NULL);
3355 if (restuple == NULL)
3356 return NULL;
3357 if (!PyTuple_Check(restuple)) {
3358 PyErr_Format(PyExc_TypeError, &argparse[4]);
3359 Py_DECREF(restuple);
3360 return NULL;
3361 }
3362 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3363 &resunicode, newpos)) {
3364 Py_DECREF(restuple);
3365 return NULL;
3366 }
3367 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003368 *newpos = size+*newpos;
3369 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003370 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003371 Py_DECREF(restuple);
3372 return NULL;
3373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 Py_INCREF(resunicode);
3375 Py_DECREF(restuple);
3376 return resunicode;
3377}
3378
3379static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003380 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 const char *errors,
3382 int limit)
3383{
3384 /* output object */
3385 PyObject *res;
3386 /* pointers to the beginning and end+1 of input */
3387 const Py_UNICODE *startp = p;
3388 const Py_UNICODE *endp = p + size;
3389 /* pointer to the beginning of the unencodable characters */
3390 /* const Py_UNICODE *badp = NULL; */
3391 /* pointer into the output */
3392 char *str;
3393 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003394 Py_ssize_t respos = 0;
3395 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003396 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3397 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 PyObject *errorHandler = NULL;
3399 PyObject *exc = NULL;
3400 /* the following variable is used for caching string comparisons
3401 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3402 int known_errorHandler = -1;
3403
3404 /* allocate enough for a simple encoding without
3405 replacements, if we need more, we'll resize */
3406 res = PyString_FromStringAndSize(NULL, size);
3407 if (res == NULL)
3408 goto onError;
3409 if (size == 0)
3410 return res;
3411 str = PyString_AS_STRING(res);
3412 ressize = size;
3413
3414 while (p<endp) {
3415 Py_UNICODE c = *p;
3416
3417 /* can we encode this? */
3418 if (c<limit) {
3419 /* no overflow check, because we know that the space is enough */
3420 *str++ = (char)c;
3421 ++p;
3422 }
3423 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003424 Py_ssize_t unicodepos = p-startp;
3425 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003427 Py_ssize_t repsize;
3428 Py_ssize_t newpos;
3429 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 Py_UNICODE *uni2;
3431 /* startpos for collecting unencodable chars */
3432 const Py_UNICODE *collstart = p;
3433 const Py_UNICODE *collend = p;
3434 /* find all unecodable characters */
3435 while ((collend < endp) && ((*collend)>=limit))
3436 ++collend;
3437 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3438 if (known_errorHandler==-1) {
3439 if ((errors==NULL) || (!strcmp(errors, "strict")))
3440 known_errorHandler = 1;
3441 else if (!strcmp(errors, "replace"))
3442 known_errorHandler = 2;
3443 else if (!strcmp(errors, "ignore"))
3444 known_errorHandler = 3;
3445 else if (!strcmp(errors, "xmlcharrefreplace"))
3446 known_errorHandler = 4;
3447 else
3448 known_errorHandler = 0;
3449 }
3450 switch (known_errorHandler) {
3451 case 1: /* strict */
3452 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3453 goto onError;
3454 case 2: /* replace */
3455 while (collstart++<collend)
3456 *str++ = '?'; /* fall through */
3457 case 3: /* ignore */
3458 p = collend;
3459 break;
3460 case 4: /* xmlcharrefreplace */
3461 respos = str-PyString_AS_STRING(res);
3462 /* determine replacement size (temporarily (mis)uses p) */
3463 for (p = collstart, repsize = 0; p < collend; ++p) {
3464 if (*p<10)
3465 repsize += 2+1+1;
3466 else if (*p<100)
3467 repsize += 2+2+1;
3468 else if (*p<1000)
3469 repsize += 2+3+1;
3470 else if (*p<10000)
3471 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003472#ifndef Py_UNICODE_WIDE
3473 else
3474 repsize += 2+5+1;
3475#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 else if (*p<100000)
3477 repsize += 2+5+1;
3478 else if (*p<1000000)
3479 repsize += 2+6+1;
3480 else
3481 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003482#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 }
3484 requiredsize = respos+repsize+(endp-collend);
3485 if (requiredsize > ressize) {
3486 if (requiredsize<2*ressize)
3487 requiredsize = 2*ressize;
3488 if (_PyString_Resize(&res, requiredsize))
3489 goto onError;
3490 str = PyString_AS_STRING(res) + respos;
3491 ressize = requiredsize;
3492 }
3493 /* generate replacement (temporarily (mis)uses p) */
3494 for (p = collstart; p < collend; ++p) {
3495 str += sprintf(str, "&#%d;", (int)*p);
3496 }
3497 p = collend;
3498 break;
3499 default:
3500 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3501 encoding, reason, startp, size, &exc,
3502 collstart-startp, collend-startp, &newpos);
3503 if (repunicode == NULL)
3504 goto onError;
3505 /* need more space? (at least enough for what we
3506 have+the replacement+the rest of the string, so
3507 we won't have to check space for encodable characters) */
3508 respos = str-PyString_AS_STRING(res);
3509 repsize = PyUnicode_GET_SIZE(repunicode);
3510 requiredsize = respos+repsize+(endp-collend);
3511 if (requiredsize > ressize) {
3512 if (requiredsize<2*ressize)
3513 requiredsize = 2*ressize;
3514 if (_PyString_Resize(&res, requiredsize)) {
3515 Py_DECREF(repunicode);
3516 goto onError;
3517 }
3518 str = PyString_AS_STRING(res) + respos;
3519 ressize = requiredsize;
3520 }
3521 /* check if there is anything unencodable in the replacement
3522 and copy it to the output */
3523 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3524 c = *uni2;
3525 if (c >= limit) {
3526 raise_encode_exception(&exc, encoding, startp, size,
3527 unicodepos, unicodepos+1, reason);
3528 Py_DECREF(repunicode);
3529 goto onError;
3530 }
3531 *str = (char)c;
3532 }
3533 p = startp + newpos;
3534 Py_DECREF(repunicode);
3535 }
3536 }
3537 }
3538 /* Resize if we allocated to much */
3539 respos = str-PyString_AS_STRING(res);
3540 if (respos<ressize)
3541 /* If this falls res will be NULL */
3542 _PyString_Resize(&res, respos);
3543 Py_XDECREF(errorHandler);
3544 Py_XDECREF(exc);
3545 return res;
3546
3547 onError:
3548 Py_XDECREF(res);
3549 Py_XDECREF(errorHandler);
3550 Py_XDECREF(exc);
3551 return NULL;
3552}
3553
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003555 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 const char *errors)
3557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559}
3560
3561PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3562{
3563 if (!PyUnicode_Check(unicode)) {
3564 PyErr_BadArgument();
3565 return NULL;
3566 }
3567 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3568 PyUnicode_GET_SIZE(unicode),
3569 NULL);
3570}
3571
3572/* --- 7-bit ASCII Codec -------------------------------------------------- */
3573
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 const char *errors)
3577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 PyUnicodeObject *v;
3580 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003581 Py_ssize_t startinpos;
3582 Py_ssize_t endinpos;
3583 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 const char *e;
3585 PyObject *errorHandler = NULL;
3586 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003587
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003589 if (size == 1 && *(unsigned char*)s < 128) {
3590 Py_UNICODE r = *(unsigned char*)s;
3591 return PyUnicode_FromUnicode(&r, 1);
3592 }
Tim Petersced69f82003-09-16 20:30:58 +00003593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 v = _PyUnicode_New(size);
3595 if (v == NULL)
3596 goto onError;
3597 if (size == 0)
3598 return (PyObject *)v;
3599 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 e = s + size;
3601 while (s < e) {
3602 register unsigned char c = (unsigned char)*s;
3603 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 ++s;
3606 }
3607 else {
3608 startinpos = s-starts;
3609 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003610 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 if (unicode_decode_call_errorhandler(
3612 errors, &errorHandler,
3613 "ascii", "ordinal not in range(128)",
3614 starts, size, &startinpos, &endinpos, &exc, &s,
3615 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003619 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003620 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003621 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_XDECREF(errorHandler);
3623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003625
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 onError:
3627 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 Py_XDECREF(errorHandler);
3629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 return NULL;
3631}
3632
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003634 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 const char *errors)
3636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638}
3639
3640PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3641{
3642 if (!PyUnicode_Check(unicode)) {
3643 PyErr_BadArgument();
3644 return NULL;
3645 }
3646 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3647 PyUnicode_GET_SIZE(unicode),
3648 NULL);
3649}
3650
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003651#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003652
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003653/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003654
Martin v. Löwisd8251432006-06-14 05:21:04 +00003655#if SIZEOF_INT < SIZEOF_SSIZE_T
3656#define NEED_RETRY
3657#endif
3658
3659/* XXX This code is limited to "true" double-byte encodings, as
3660 a) it assumes an incomplete character consists of a single byte, and
3661 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3662 encodings, see IsDBCSLeadByteEx documentation. */
3663
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte in
   context, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is exactly
   two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3674
3675/*
3676 * Decode MBCS string into unicode object. If 'final' is set, converts
3677 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3678 */
3679static int decode_mbcs(PyUnicodeObject **v,
3680 const char *s, /* MBCS string */
3681 int size, /* sizeof MBCS string */
3682 int final)
3683{
3684 Py_UNICODE *p;
3685 Py_ssize_t n = 0;
3686 int usize = 0;
3687
3688 assert(size >= 0);
3689
3690 /* Skip trailing lead-byte unless 'final' is set */
3691 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3692 --size;
3693
3694 /* First get the size of the result */
3695 if (size > 0) {
3696 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3697 if (usize == 0) {
3698 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3699 return -1;
3700 }
3701 }
3702
3703 if (*v == NULL) {
3704 /* Create unicode object */
3705 *v = _PyUnicode_New(usize);
3706 if (*v == NULL)
3707 return -1;
3708 }
3709 else {
3710 /* Extend unicode object */
3711 n = PyUnicode_GET_SIZE(*v);
3712 if (_PyUnicode_Resize(v, n + usize) < 0)
3713 return -1;
3714 }
3715
3716 /* Do the conversion */
3717 if (size > 0) {
3718 p = PyUnicode_AS_UNICODE(*v) + n;
3719 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3720 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3721 return -1;
3722 }
3723 }
3724
3725 return size;
3726}
3727
3728PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3729 Py_ssize_t size,
3730 const char *errors,
3731 Py_ssize_t *consumed)
3732{
3733 PyUnicodeObject *v = NULL;
3734 int done;
3735
3736 if (consumed)
3737 *consumed = 0;
3738
3739#ifdef NEED_RETRY
3740 retry:
3741 if (size > INT_MAX)
3742 done = decode_mbcs(&v, s, INT_MAX, 0);
3743 else
3744#endif
3745 done = decode_mbcs(&v, s, (int)size, !consumed);
3746
3747 if (done < 0) {
3748 Py_XDECREF(v);
3749 return NULL;
3750 }
3751
3752 if (consumed)
3753 *consumed += done;
3754
3755#ifdef NEED_RETRY
3756 if (size > INT_MAX) {
3757 s += done;
3758 size -= done;
3759 goto retry;
3760 }
3761#endif
3762
3763 return (PyObject *)v;
3764}
3765
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003766PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003767 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003768 const char *errors)
3769{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003770 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3771}
3772
3773/*
3774 * Convert unicode into string object (MBCS).
3775 * Returns 0 if succeed, -1 otherwise.
3776 */
3777static int encode_mbcs(PyObject **repr,
3778 const Py_UNICODE *p, /* unicode */
3779 int size) /* size of unicode */
3780{
3781 int mbcssize = 0;
3782 Py_ssize_t n = 0;
3783
3784 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003785
3786 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003787 if (size > 0) {
3788 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3789 if (mbcssize == 0) {
3790 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3791 return -1;
3792 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003793 }
3794
Martin v. Löwisd8251432006-06-14 05:21:04 +00003795 if (*repr == NULL) {
3796 /* Create string object */
3797 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3798 if (*repr == NULL)
3799 return -1;
3800 }
3801 else {
3802 /* Extend string object */
3803 n = PyString_Size(*repr);
3804 if (_PyString_Resize(repr, n + mbcssize) < 0)
3805 return -1;
3806 }
3807
3808 /* Do the conversion */
3809 if (size > 0) {
3810 char *s = PyString_AS_STRING(*repr) + n;
3811 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3812 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3813 return -1;
3814 }
3815 }
3816
3817 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003818}
3819
3820PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003821 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003822 const char *errors)
3823{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003824 PyObject *repr = NULL;
3825 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003826
Martin v. Löwisd8251432006-06-14 05:21:04 +00003827#ifdef NEED_RETRY
3828 retry:
3829 if (size > INT_MAX)
3830 ret = encode_mbcs(&repr, p, INT_MAX);
3831 else
3832#endif
3833 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003834
Martin v. Löwisd8251432006-06-14 05:21:04 +00003835 if (ret < 0) {
3836 Py_XDECREF(repr);
3837 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003839
3840#ifdef NEED_RETRY
3841 if (size > INT_MAX) {
3842 p += INT_MAX;
3843 size -= INT_MAX;
3844 goto retry;
3845 }
3846#endif
3847
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003848 return repr;
3849}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003850
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003851PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3852{
3853 if (!PyUnicode_Check(unicode)) {
3854 PyErr_BadArgument();
3855 return NULL;
3856 }
3857 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3858 PyUnicode_GET_SIZE(unicode),
3859 NULL);
3860}
3861
Martin v. Löwisd8251432006-06-14 05:21:04 +00003862#undef NEED_RETRY
3863
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003864#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866/* --- Character Mapping Codec -------------------------------------------- */
3867
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003869 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 PyObject *mapping,
3871 const char *errors)
3872{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003874 Py_ssize_t startinpos;
3875 Py_ssize_t endinpos;
3876 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878 PyUnicodeObject *v;
3879 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003880 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 PyObject *errorHandler = NULL;
3882 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003883 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003884 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 /* Default to Latin-1 */
3887 if (mapping == NULL)
3888 return PyUnicode_DecodeLatin1(s, size, errors);
3889
3890 v = _PyUnicode_New(size);
3891 if (v == NULL)
3892 goto onError;
3893 if (size == 0)
3894 return (PyObject *)v;
3895 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003897 if (PyUnicode_CheckExact(mapping)) {
3898 mapstring = PyUnicode_AS_UNICODE(mapping);
3899 maplen = PyUnicode_GET_SIZE(mapping);
3900 while (s < e) {
3901 unsigned char ch = *s;
3902 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003904 if (ch < maplen)
3905 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003907 if (x == 0xfffe) {
3908 /* undefined mapping */
3909 outpos = p-PyUnicode_AS_UNICODE(v);
3910 startinpos = s-starts;
3911 endinpos = startinpos+1;
3912 if (unicode_decode_call_errorhandler(
3913 errors, &errorHandler,
3914 "charmap", "character maps to <undefined>",
3915 starts, size, &startinpos, &endinpos, &exc, &s,
3916 (PyObject **)&v, &outpos, &p)) {
3917 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003918 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003919 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003920 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003921 *p++ = x;
3922 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003924 }
3925 else {
3926 while (s < e) {
3927 unsigned char ch = *s;
3928 PyObject *w, *x;
3929
3930 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3931 w = PyInt_FromLong((long)ch);
3932 if (w == NULL)
3933 goto onError;
3934 x = PyObject_GetItem(mapping, w);
3935 Py_DECREF(w);
3936 if (x == NULL) {
3937 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3938 /* No mapping found means: mapping is undefined. */
3939 PyErr_Clear();
3940 x = Py_None;
3941 Py_INCREF(x);
3942 } else
3943 goto onError;
3944 }
3945
3946 /* Apply mapping */
3947 if (PyInt_Check(x)) {
3948 long value = PyInt_AS_LONG(x);
3949 if (value < 0 || value > 65535) {
3950 PyErr_SetString(PyExc_TypeError,
3951 "character mapping must be in range(65536)");
3952 Py_DECREF(x);
3953 goto onError;
3954 }
3955 *p++ = (Py_UNICODE)value;
3956 }
3957 else if (x == Py_None) {
3958 /* undefined mapping */
3959 outpos = p-PyUnicode_AS_UNICODE(v);
3960 startinpos = s-starts;
3961 endinpos = startinpos+1;
3962 if (unicode_decode_call_errorhandler(
3963 errors, &errorHandler,
3964 "charmap", "character maps to <undefined>",
3965 starts, size, &startinpos, &endinpos, &exc, &s,
3966 (PyObject **)&v, &outpos, &p)) {
3967 Py_DECREF(x);
3968 goto onError;
3969 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003970 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003971 continue;
3972 }
3973 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003974 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003975
3976 if (targetsize == 1)
3977 /* 1-1 mapping */
3978 *p++ = *PyUnicode_AS_UNICODE(x);
3979
3980 else if (targetsize > 1) {
3981 /* 1-n mapping */
3982 if (targetsize > extrachars) {
3983 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3985 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003986 (targetsize << 2);
3987 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003988 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003989 if (_PyUnicode_Resize(&v,
3990 PyUnicode_GET_SIZE(v) + needed) < 0) {
3991 Py_DECREF(x);
3992 goto onError;
3993 }
3994 p = PyUnicode_AS_UNICODE(v) + oldpos;
3995 }
3996 Py_UNICODE_COPY(p,
3997 PyUnicode_AS_UNICODE(x),
3998 targetsize);
3999 p += targetsize;
4000 extrachars -= targetsize;
4001 }
4002 /* 1-0 mapping: skip the character */
4003 }
4004 else {
4005 /* wrong return value */
4006 PyErr_SetString(PyExc_TypeError,
4007 "character mapping must return integer, None or unicode");
4008 Py_DECREF(x);
4009 goto onError;
4010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004012 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 }
4015 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004016 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 Py_XDECREF(errorHandler);
4019 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004021
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 Py_XDECREF(errorHandler);
4024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 Py_XDECREF(v);
4026 return NULL;
4027}
4028
Martin v. Löwis3f767792006-06-04 19:36:28 +00004029/* Charmap encoding: the lookup table */
4030
4031struct encoding_map{
4032 PyObject_HEAD
4033 unsigned char level1[32];
4034 int count2, count3;
4035 unsigned char level23[1];
4036};
4037
4038static PyObject*
4039encoding_map_size(PyObject *obj, PyObject* args)
4040{
4041 struct encoding_map *map = (struct encoding_map*)obj;
4042 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4043 128*map->count3);
4044}
4045
4046static PyMethodDef encoding_map_methods[] = {
4047 {"size", encoding_map_size, METH_NOARGS,
4048 PyDoc_STR("Return the size (in bytes) of this object") },
4049 { 0 }
4050};
4051
4052static void
4053encoding_map_dealloc(PyObject* o)
4054{
4055 PyObject_FREE(o);
4056}
4057
4058static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004059 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004060 "EncodingMap", /*tp_name*/
4061 sizeof(struct encoding_map), /*tp_basicsize*/
4062 0, /*tp_itemsize*/
4063 /* methods */
4064 encoding_map_dealloc, /*tp_dealloc*/
4065 0, /*tp_print*/
4066 0, /*tp_getattr*/
4067 0, /*tp_setattr*/
4068 0, /*tp_compare*/
4069 0, /*tp_repr*/
4070 0, /*tp_as_number*/
4071 0, /*tp_as_sequence*/
4072 0, /*tp_as_mapping*/
4073 0, /*tp_hash*/
4074 0, /*tp_call*/
4075 0, /*tp_str*/
4076 0, /*tp_getattro*/
4077 0, /*tp_setattro*/
4078 0, /*tp_as_buffer*/
4079 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4080 0, /*tp_doc*/
4081 0, /*tp_traverse*/
4082 0, /*tp_clear*/
4083 0, /*tp_richcompare*/
4084 0, /*tp_weaklistoffset*/
4085 0, /*tp_iter*/
4086 0, /*tp_iternext*/
4087 encoding_map_methods, /*tp_methods*/
4088 0, /*tp_members*/
4089 0, /*tp_getset*/
4090 0, /*tp_base*/
4091 0, /*tp_dict*/
4092 0, /*tp_descr_get*/
4093 0, /*tp_descr_set*/
4094 0, /*tp_dictoffset*/
4095 0, /*tp_init*/
4096 0, /*tp_alloc*/
4097 0, /*tp_new*/
4098 0, /*tp_free*/
4099 0, /*tp_is_gc*/
4100};
4101
4102PyObject*
4103PyUnicode_BuildEncodingMap(PyObject* string)
4104{
4105 Py_UNICODE *decode;
4106 PyObject *result;
4107 struct encoding_map *mresult;
4108 int i;
4109 int need_dict = 0;
4110 unsigned char level1[32];
4111 unsigned char level2[512];
4112 unsigned char *mlevel1, *mlevel2, *mlevel3;
4113 int count2 = 0, count3 = 0;
4114
4115 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4116 PyErr_BadArgument();
4117 return NULL;
4118 }
4119 decode = PyUnicode_AS_UNICODE(string);
4120 memset(level1, 0xFF, sizeof level1);
4121 memset(level2, 0xFF, sizeof level2);
4122
4123 /* If there isn't a one-to-one mapping of NULL to \0,
4124 or if there are non-BMP characters, we need to use
4125 a mapping dictionary. */
4126 if (decode[0] != 0)
4127 need_dict = 1;
4128 for (i = 1; i < 256; i++) {
4129 int l1, l2;
4130 if (decode[i] == 0
4131 #ifdef Py_UNICODE_WIDE
4132 || decode[i] > 0xFFFF
4133 #endif
4134 ) {
4135 need_dict = 1;
4136 break;
4137 }
4138 if (decode[i] == 0xFFFE)
4139 /* unmapped character */
4140 continue;
4141 l1 = decode[i] >> 11;
4142 l2 = decode[i] >> 7;
4143 if (level1[l1] == 0xFF)
4144 level1[l1] = count2++;
4145 if (level2[l2] == 0xFF)
4146 level2[l2] = count3++;
4147 }
4148
4149 if (count2 >= 0xFF || count3 >= 0xFF)
4150 need_dict = 1;
4151
4152 if (need_dict) {
4153 PyObject *result = PyDict_New();
4154 PyObject *key, *value;
4155 if (!result)
4156 return NULL;
4157 for (i = 0; i < 256; i++) {
4158 key = value = NULL;
4159 key = PyInt_FromLong(decode[i]);
4160 value = PyInt_FromLong(i);
4161 if (!key || !value)
4162 goto failed1;
4163 if (PyDict_SetItem(result, key, value) == -1)
4164 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004165 Py_DECREF(key);
4166 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004167 }
4168 return result;
4169 failed1:
4170 Py_XDECREF(key);
4171 Py_XDECREF(value);
4172 Py_DECREF(result);
4173 return NULL;
4174 }
4175
4176 /* Create a three-level trie */
4177 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4178 16*count2 + 128*count3 - 1);
4179 if (!result)
4180 return PyErr_NoMemory();
4181 PyObject_Init(result, &EncodingMapType);
4182 mresult = (struct encoding_map*)result;
4183 mresult->count2 = count2;
4184 mresult->count3 = count3;
4185 mlevel1 = mresult->level1;
4186 mlevel2 = mresult->level23;
4187 mlevel3 = mresult->level23 + 16*count2;
4188 memcpy(mlevel1, level1, 32);
4189 memset(mlevel2, 0xFF, 16*count2);
4190 memset(mlevel3, 0, 128*count3);
4191 count3 = 0;
4192 for (i = 1; i < 256; i++) {
4193 int o1, o2, o3, i2, i3;
4194 if (decode[i] == 0xFFFE)
4195 /* unmapped character */
4196 continue;
4197 o1 = decode[i]>>11;
4198 o2 = (decode[i]>>7) & 0xF;
4199 i2 = 16*mlevel1[o1] + o2;
4200 if (mlevel2[i2] == 0xFF)
4201 mlevel2[i2] = count3++;
4202 o3 = decode[i] & 0x7F;
4203 i3 = 128*mlevel2[i2] + o3;
4204 mlevel3[i3] = i;
4205 }
4206 return result;
4207}
4208
4209static int
4210encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4211{
4212 struct encoding_map *map = (struct encoding_map*)mapping;
4213 int l1 = c>>11;
4214 int l2 = (c>>7) & 0xF;
4215 int l3 = c & 0x7F;
4216 int i;
4217
4218#ifdef Py_UNICODE_WIDE
4219 if (c > 0xFFFF) {
4220 return -1;
4221 }
4222#endif
4223 if (c == 0)
4224 return 0;
4225 /* level 1*/
4226 i = map->level1[l1];
4227 if (i == 0xFF) {
4228 return -1;
4229 }
4230 /* level 2*/
4231 i = map->level23[16*i+l2];
4232 if (i == 0xFF) {
4233 return -1;
4234 }
4235 /* level 3 */
4236 i = map->level23[16*map->count2 + 128*i + l3];
4237 if (i == 0) {
4238 return -1;
4239 }
4240 return i;
4241}
4242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243/* Lookup the character ch in the mapping. If the character
4244 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004245 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 PyObject *w = PyInt_FromLong((long)c);
4249 PyObject *x;
4250
4251 if (w == NULL)
4252 return NULL;
4253 x = PyObject_GetItem(mapping, w);
4254 Py_DECREF(w);
4255 if (x == NULL) {
4256 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4257 /* No mapping found means: mapping is undefined. */
4258 PyErr_Clear();
4259 x = Py_None;
4260 Py_INCREF(x);
4261 return x;
4262 } else
4263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004265 else if (x == Py_None)
4266 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 else if (PyInt_Check(x)) {
4268 long value = PyInt_AS_LONG(x);
4269 if (value < 0 || value > 255) {
4270 PyErr_SetString(PyExc_TypeError,
4271 "character mapping must be in range(256)");
4272 Py_DECREF(x);
4273 return NULL;
4274 }
4275 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 else if (PyString_Check(x))
4278 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 /* wrong return value */
4281 PyErr_SetString(PyExc_TypeError,
4282 "character mapping must return integer, None or str");
4283 Py_DECREF(x);
4284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 }
4286}
4287
Martin v. Löwis3f767792006-06-04 19:36:28 +00004288static int
4289charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4290{
4291 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4292 /* exponentially overallocate to minimize reallocations */
4293 if (requiredsize < 2*outsize)
4294 requiredsize = 2*outsize;
4295 if (_PyString_Resize(outobj, requiredsize)) {
4296 return 0;
4297 }
4298 return 1;
4299}
4300
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or reallocation error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304/* lookup the character, put the result in the output string and adjust
4305 various state variables. Reallocate the output string if not enough
4306 space is available. Return a new reference to the object that
4307 was put in the output buffer, or Py_None, if the mapping was undefined
4308 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004309 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004311charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004312 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004314 PyObject *rep;
4315 char *outstart;
4316 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317
Christian Heimese93237d2007-12-19 02:37:44 +00004318 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004319 int res = encoding_map_lookup(c, mapping);
4320 Py_ssize_t requiredsize = *outpos+1;
4321 if (res == -1)
4322 return enc_FAILED;
4323 if (outsize<requiredsize)
4324 if (!charmapencode_resize(outobj, outpos, requiredsize))
4325 return enc_EXCEPTION;
4326 outstart = PyString_AS_STRING(*outobj);
4327 outstart[(*outpos)++] = (char)res;
4328 return enc_SUCCESS;
4329 }
4330
4331 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004333 return enc_EXCEPTION;
4334 else if (rep==Py_None) {
4335 Py_DECREF(rep);
4336 return enc_FAILED;
4337 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004339 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004340 if (outsize<requiredsize)
4341 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004343 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004345 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4347 }
4348 else {
4349 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4351 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004352 if (outsize<requiredsize)
4353 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004355 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004357 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 memcpy(outstart + *outpos, repchars, repsize);
4359 *outpos += repsize;
4360 }
4361 }
Georg Brandl9f167602006-06-04 21:46:16 +00004362 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004363 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364}
4365
4366/* handle an error in PyUnicode_EncodeCharmap
4367 Return 0 on success, -1 on error */
4368static
4369int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004370 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004372 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004373 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374{
4375 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004376 Py_ssize_t repsize;
4377 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 Py_UNICODE *uni2;
4379 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004380 Py_ssize_t collstartpos = *inpos;
4381 Py_ssize_t collendpos = *inpos+1;
4382 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 char *encoding = "charmap";
4384 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004385 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 /* find all unencodable characters */
4388 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004389 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004390 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004391 int res = encoding_map_lookup(p[collendpos], mapping);
4392 if (res != -1)
4393 break;
4394 ++collendpos;
4395 continue;
4396 }
4397
4398 rep = charmapencode_lookup(p[collendpos], mapping);
4399 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004401 else if (rep!=Py_None) {
4402 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 break;
4404 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 ++collendpos;
4407 }
4408 /* cache callback name lookup
4409 * (if not done yet, i.e. it's the first error) */
4410 if (*known_errorHandler==-1) {
4411 if ((errors==NULL) || (!strcmp(errors, "strict")))
4412 *known_errorHandler = 1;
4413 else if (!strcmp(errors, "replace"))
4414 *known_errorHandler = 2;
4415 else if (!strcmp(errors, "ignore"))
4416 *known_errorHandler = 3;
4417 else if (!strcmp(errors, "xmlcharrefreplace"))
4418 *known_errorHandler = 4;
4419 else
4420 *known_errorHandler = 0;
4421 }
4422 switch (*known_errorHandler) {
4423 case 1: /* strict */
4424 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4425 return -1;
4426 case 2: /* replace */
4427 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4428 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004429 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 return -1;
4431 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004432 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4434 return -1;
4435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 }
4437 /* fall through */
4438 case 3: /* ignore */
4439 *inpos = collendpos;
4440 break;
4441 case 4: /* xmlcharrefreplace */
4442 /* generate replacement (temporarily (mis)uses p) */
4443 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4444 char buffer[2+29+1+1];
4445 char *cp;
4446 sprintf(buffer, "&#%d;", (int)p[collpos]);
4447 for (cp = buffer; *cp; ++cp) {
4448 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004449 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004451 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4453 return -1;
4454 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 }
4456 }
4457 *inpos = collendpos;
4458 break;
4459 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004460 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 encoding, reason, p, size, exceptionObject,
4462 collstartpos, collendpos, &newpos);
4463 if (repunicode == NULL)
4464 return -1;
4465 /* generate replacement */
4466 repsize = PyUnicode_GET_SIZE(repunicode);
4467 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4468 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004469 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 return -1;
4471 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004472 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4475 return -1;
4476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 }
4478 *inpos = newpos;
4479 Py_DECREF(repunicode);
4480 }
4481 return 0;
4482}
4483
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004485 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 PyObject *mapping,
4487 const char *errors)
4488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 /* output object */
4490 PyObject *res = NULL;
4491 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004492 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004494 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 PyObject *errorHandler = NULL;
4496 PyObject *exc = NULL;
4497 /* the following variable is used for caching string comparisons
4498 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4499 * 3=ignore, 4=xmlcharrefreplace */
4500 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501
4502 /* Default to Latin-1 */
4503 if (mapping == NULL)
4504 return PyUnicode_EncodeLatin1(p, size, errors);
4505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 /* allocate enough for a simple encoding without
4507 replacements, if we need more, we'll resize */
4508 res = PyString_FromStringAndSize(NULL, size);
4509 if (res == NULL)
4510 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004511 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 while (inpos<size) {
4515 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004516 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4517 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004519 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 if (charmap_encoding_error(p, size, &inpos, mapping,
4521 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004522 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004523 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004524 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 else
4528 /* done with this character => adjust input position */
4529 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 /* Resize if we allocated to much */
4533 if (respos<PyString_GET_SIZE(res)) {
4534 if (_PyString_Resize(&res, respos))
4535 goto onError;
4536 }
4537 Py_XDECREF(exc);
4538 Py_XDECREF(errorHandler);
4539 return res;
4540
4541 onError:
4542 Py_XDECREF(res);
4543 Py_XDECREF(exc);
4544 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 return NULL;
4546}
4547
4548PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4549 PyObject *mapping)
4550{
4551 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4552 PyErr_BadArgument();
4553 return NULL;
4554 }
4555 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4556 PyUnicode_GET_SIZE(unicode),
4557 mapping,
4558 NULL);
4559}
4560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561/* create or adjust a UnicodeTranslateError */
4562static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 const Py_UNICODE *unicode, Py_ssize_t size,
4564 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 if (*exceptionObject == NULL) {
4568 *exceptionObject = PyUnicodeTranslateError_Create(
4569 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
4571 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4573 goto onError;
4574 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4575 goto onError;
4576 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4577 goto onError;
4578 return;
4579 onError:
4580 Py_DECREF(*exceptionObject);
4581 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 }
4583}
4584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585/* raises a UnicodeTranslateError */
4586static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004587 const Py_UNICODE *unicode, Py_ssize_t size,
4588 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 const char *reason)
4590{
4591 make_translate_exception(exceptionObject,
4592 unicode, size, startpos, endpos, reason);
4593 if (*exceptionObject != NULL)
4594 PyCodec_StrictErrors(*exceptionObject);
4595}
4596
4597/* error handling callback helper:
4598 build arguments, call the callback and check the arguments,
4599 put the result into newpos and return the replacement string, which
4600 has to be freed by the caller */
4601static PyObject *unicode_translate_call_errorhandler(const char *errors,
4602 PyObject **errorHandler,
4603 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004604 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4605 Py_ssize_t startpos, Py_ssize_t endpos,
4606 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004608 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609
Martin v. Löwis412fb672006-04-13 06:34:32 +00004610 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 PyObject *restuple;
4612 PyObject *resunicode;
4613
4614 if (*errorHandler == NULL) {
4615 *errorHandler = PyCodec_LookupError(errors);
4616 if (*errorHandler == NULL)
4617 return NULL;
4618 }
4619
4620 make_translate_exception(exceptionObject,
4621 unicode, size, startpos, endpos, reason);
4622 if (*exceptionObject == NULL)
4623 return NULL;
4624
4625 restuple = PyObject_CallFunctionObjArgs(
4626 *errorHandler, *exceptionObject, NULL);
4627 if (restuple == NULL)
4628 return NULL;
4629 if (!PyTuple_Check(restuple)) {
4630 PyErr_Format(PyExc_TypeError, &argparse[4]);
4631 Py_DECREF(restuple);
4632 return NULL;
4633 }
4634 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004635 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 Py_DECREF(restuple);
4637 return NULL;
4638 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004639 if (i_newpos<0)
4640 *newpos = size+i_newpos;
4641 else
4642 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004643 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004644 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004645 Py_DECREF(restuple);
4646 return NULL;
4647 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 Py_INCREF(resunicode);
4649 Py_DECREF(restuple);
4650 return resunicode;
4651}
4652
4653/* Lookup the character ch in the mapping and put the result in result,
4654 which must be decrefed by the caller.
4655 Return 0 on success, -1 on error */
4656static
4657int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4658{
4659 PyObject *w = PyInt_FromLong((long)c);
4660 PyObject *x;
4661
4662 if (w == NULL)
4663 return -1;
4664 x = PyObject_GetItem(mapping, w);
4665 Py_DECREF(w);
4666 if (x == NULL) {
4667 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4668 /* No mapping found means: use 1:1 mapping. */
4669 PyErr_Clear();
4670 *result = NULL;
4671 return 0;
4672 } else
4673 return -1;
4674 }
4675 else if (x == Py_None) {
4676 *result = x;
4677 return 0;
4678 }
4679 else if (PyInt_Check(x)) {
4680 long value = PyInt_AS_LONG(x);
4681 long max = PyUnicode_GetMax();
4682 if (value < 0 || value > max) {
4683 PyErr_Format(PyExc_TypeError,
4684 "character mapping must be in range(0x%lx)", max+1);
4685 Py_DECREF(x);
4686 return -1;
4687 }
4688 *result = x;
4689 return 0;
4690 }
4691 else if (PyUnicode_Check(x)) {
4692 *result = x;
4693 return 0;
4694 }
4695 else {
4696 /* wrong return value */
4697 PyErr_SetString(PyExc_TypeError,
4698 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004699 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 return -1;
4701 }
4702}
4703/* ensure that *outobj is at least requiredsize characters long,
4704if not reallocate and adjust various state variables.
4705Return 0 on success, -1 on error */
4706static
Walter Dörwald4894c302003-10-24 14:25:28 +00004707int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004708 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004710 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004711 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004713 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004715 if (requiredsize < 2 * oldsize)
4716 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004717 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 return -1;
4719 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 }
4721 return 0;
4722}
4723/* lookup the character, put the result in the output string and adjust
4724 various state variables. Return a new reference to the object that
4725 was put in the output buffer in *result, or Py_None, if the mapping was
4726 undefined (in which case no character was written).
4727 The called must decref result.
4728 Return 0 on success, -1 on error. */
4729static
Walter Dörwald4894c302003-10-24 14:25:28 +00004730int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004731 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004732 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733{
Walter Dörwald4894c302003-10-24 14:25:28 +00004734 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 return -1;
4736 if (*res==NULL) {
4737 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004738 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 }
4740 else if (*res==Py_None)
4741 ;
4742 else if (PyInt_Check(*res)) {
4743 /* no overflow check, because we know that the space is enough */
4744 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4745 }
4746 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 if (repsize==1) {
4749 /* no overflow check, because we know that the space is enough */
4750 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4751 }
4752 else if (repsize!=0) {
4753 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004754 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004755 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004756 repsize - 1;
4757 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 return -1;
4759 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4760 *outp += repsize;
4761 }
4762 }
4763 else
4764 return -1;
4765 return 0;
4766}
4767
4768PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 PyObject *mapping,
4771 const char *errors)
4772{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 /* output object */
4774 PyObject *res = NULL;
4775 /* pointers to the beginning and end+1 of input */
4776 const Py_UNICODE *startp = p;
4777 const Py_UNICODE *endp = p + size;
4778 /* pointer into the output */
4779 Py_UNICODE *str;
4780 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 char *reason = "character maps to <undefined>";
4783 PyObject *errorHandler = NULL;
4784 PyObject *exc = NULL;
4785 /* the following variable is used for caching string comparisons
4786 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4787 * 3=ignore, 4=xmlcharrefreplace */
4788 int known_errorHandler = -1;
4789
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 if (mapping == NULL) {
4791 PyErr_BadArgument();
4792 return NULL;
4793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794
4795 /* allocate enough for a simple 1:1 translation without
4796 replacements, if we need more, we'll resize */
4797 res = PyUnicode_FromUnicode(NULL, size);
4798 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004799 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 return res;
4802 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 while (p<endp) {
4805 /* try to encode it */
4806 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004807 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 goto onError;
4810 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004811 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 if (x!=Py_None) /* it worked => adjust input pointer */
4813 ++p;
4814 else { /* untranslatable character */
4815 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004816 Py_ssize_t repsize;
4817 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 Py_UNICODE *uni2;
4819 /* startpos for collecting untranslatable chars */
4820 const Py_UNICODE *collstart = p;
4821 const Py_UNICODE *collend = p+1;
4822 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 /* find all untranslatable characters */
4825 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004826 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 goto onError;
4828 Py_XDECREF(x);
4829 if (x!=Py_None)
4830 break;
4831 ++collend;
4832 }
4833 /* cache callback name lookup
4834 * (if not done yet, i.e. it's the first error) */
4835 if (known_errorHandler==-1) {
4836 if ((errors==NULL) || (!strcmp(errors, "strict")))
4837 known_errorHandler = 1;
4838 else if (!strcmp(errors, "replace"))
4839 known_errorHandler = 2;
4840 else if (!strcmp(errors, "ignore"))
4841 known_errorHandler = 3;
4842 else if (!strcmp(errors, "xmlcharrefreplace"))
4843 known_errorHandler = 4;
4844 else
4845 known_errorHandler = 0;
4846 }
4847 switch (known_errorHandler) {
4848 case 1: /* strict */
4849 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4850 goto onError;
4851 case 2: /* replace */
4852 /* No need to check for space, this is a 1:1 replacement */
4853 for (coll = collstart; coll<collend; ++coll)
4854 *str++ = '?';
4855 /* fall through */
4856 case 3: /* ignore */
4857 p = collend;
4858 break;
4859 case 4: /* xmlcharrefreplace */
4860 /* generate replacement (temporarily (mis)uses p) */
4861 for (p = collstart; p < collend; ++p) {
4862 char buffer[2+29+1+1];
4863 char *cp;
4864 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004865 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4867 goto onError;
4868 for (cp = buffer; *cp; ++cp)
4869 *str++ = *cp;
4870 }
4871 p = collend;
4872 break;
4873 default:
4874 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4875 reason, startp, size, &exc,
4876 collstart-startp, collend-startp, &newpos);
4877 if (repunicode == NULL)
4878 goto onError;
4879 /* generate replacement */
4880 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004881 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4883 Py_DECREF(repunicode);
4884 goto onError;
4885 }
4886 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4887 *str++ = *uni2;
4888 p = startp + newpos;
4889 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 }
4891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 /* Resize if we allocated to much */
4894 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004895 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004896 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004897 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 }
4899 Py_XDECREF(exc);
4900 Py_XDECREF(errorHandler);
4901 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 onError:
4904 Py_XDECREF(res);
4905 Py_XDECREF(exc);
4906 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 return NULL;
4908}
4909
4910PyObject *PyUnicode_Translate(PyObject *str,
4911 PyObject *mapping,
4912 const char *errors)
4913{
4914 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004915
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 str = PyUnicode_FromObject(str);
4917 if (str == NULL)
4918 goto onError;
4919 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4920 PyUnicode_GET_SIZE(str),
4921 mapping,
4922 errors);
4923 Py_DECREF(str);
4924 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004925
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 onError:
4927 Py_XDECREF(str);
4928 return NULL;
4929}
Tim Petersced69f82003-09-16 20:30:58 +00004930
Guido van Rossum9e896b32000-04-05 20:11:21 +00004931/* --- Decimal Encoder ---------------------------------------------------- */
4932
4933int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004934 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004935 char *output,
4936 const char *errors)
4937{
4938 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 PyObject *errorHandler = NULL;
4940 PyObject *exc = NULL;
4941 const char *encoding = "decimal";
4942 const char *reason = "invalid decimal Unicode string";
4943 /* the following variable is used for caching string comparisons
4944 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4945 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004946
4947 if (output == NULL) {
4948 PyErr_BadArgument();
4949 return -1;
4950 }
4951
4952 p = s;
4953 end = s + length;
4954 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004956 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004958 Py_ssize_t repsize;
4959 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004960 Py_UNICODE *uni2;
4961 Py_UNICODE *collstart;
4962 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004963
Guido van Rossum9e896b32000-04-05 20:11:21 +00004964 if (Py_UNICODE_ISSPACE(ch)) {
4965 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004967 continue;
4968 }
4969 decimal = Py_UNICODE_TODECIMAL(ch);
4970 if (decimal >= 0) {
4971 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004972 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004973 continue;
4974 }
Guido van Rossumba477042000-04-06 18:18:10 +00004975 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004976 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004978 continue;
4979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980 /* All other characters are considered unencodable */
4981 collstart = p;
4982 collend = p+1;
4983 while (collend < end) {
4984 if ((0 < *collend && *collend < 256) ||
4985 !Py_UNICODE_ISSPACE(*collend) ||
4986 Py_UNICODE_TODECIMAL(*collend))
4987 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004988 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 /* cache callback name lookup
4990 * (if not done yet, i.e. it's the first error) */
4991 if (known_errorHandler==-1) {
4992 if ((errors==NULL) || (!strcmp(errors, "strict")))
4993 known_errorHandler = 1;
4994 else if (!strcmp(errors, "replace"))
4995 known_errorHandler = 2;
4996 else if (!strcmp(errors, "ignore"))
4997 known_errorHandler = 3;
4998 else if (!strcmp(errors, "xmlcharrefreplace"))
4999 known_errorHandler = 4;
5000 else
5001 known_errorHandler = 0;
5002 }
5003 switch (known_errorHandler) {
5004 case 1: /* strict */
5005 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5006 goto onError;
5007 case 2: /* replace */
5008 for (p = collstart; p < collend; ++p)
5009 *output++ = '?';
5010 /* fall through */
5011 case 3: /* ignore */
5012 p = collend;
5013 break;
5014 case 4: /* xmlcharrefreplace */
5015 /* generate replacement (temporarily (mis)uses p) */
5016 for (p = collstart; p < collend; ++p)
5017 output += sprintf(output, "&#%d;", (int)*p);
5018 p = collend;
5019 break;
5020 default:
5021 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5022 encoding, reason, s, length, &exc,
5023 collstart-s, collend-s, &newpos);
5024 if (repunicode == NULL)
5025 goto onError;
5026 /* generate replacement */
5027 repsize = PyUnicode_GET_SIZE(repunicode);
5028 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5029 Py_UNICODE ch = *uni2;
5030 if (Py_UNICODE_ISSPACE(ch))
5031 *output++ = ' ';
5032 else {
5033 decimal = Py_UNICODE_TODECIMAL(ch);
5034 if (decimal >= 0)
5035 *output++ = '0' + decimal;
5036 else if (0 < ch && ch < 256)
5037 *output++ = (char)ch;
5038 else {
5039 Py_DECREF(repunicode);
5040 raise_encode_exception(&exc, encoding,
5041 s, length, collstart-s, collend-s, reason);
5042 goto onError;
5043 }
5044 }
5045 }
5046 p = s + newpos;
5047 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005048 }
5049 }
5050 /* 0-terminate the output string */
5051 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005052 Py_XDECREF(exc);
5053 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005054 return 0;
5055
5056 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 Py_XDECREF(exc);
5058 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005059 return -1;
5060}
5061
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062/* --- Helpers ------------------------------------------------------------ */
5063
Eric Smitha9f7d622008-02-17 19:46:49 +00005064#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005065
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005066#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005067
Fredrik Lundha50d2012006-05-26 17:04:58 +00005068#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005069
5070#include "stringlib/count.h"
5071#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005072#include "stringlib/partition.h"
5073
/* Helper macro to fix up start/end slice values.

   Operates on the variables named `start` and `end` in the expanding
   scope, using (obj)->length as the sequence length:
     - a negative start has the length added once, then is clamped to 0;
     - an end beyond the length is clamped to the length;
     - a negative end has the length added once, then is clamped to 0.
   start is not clamped against the length.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement and stays correct under an unbraced if/else.  Use it as
   `FIX_START_END(obj);`. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5086
Martin v. Löwis18e16552006-02-15 17:27:45 +00005087Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005088 PyObject *substr,
5089 Py_ssize_t start,
5090 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005092 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005093 PyUnicodeObject* str_obj;
5094 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005095
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005096 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5097 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005099 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5100 if (!sub_obj) {
5101 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 return -1;
5103 }
Tim Petersced69f82003-09-16 20:30:58 +00005104
Fredrik Lundhc8162812006-05-26 19:33:03 +00005105 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005106
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005107 result = stringlib_count(
5108 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5109 );
5110
5111 Py_DECREF(sub_obj);
5112 Py_DECREF(str_obj);
5113
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 return result;
5115}
5116
Martin v. Löwis18e16552006-02-15 17:27:45 +00005117Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005118 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005119 Py_ssize_t start,
5120 Py_ssize_t end,
5121 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005123 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005124
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005125 str = PyUnicode_FromObject(str);
5126 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005127 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005128 sub = PyUnicode_FromObject(sub);
5129 if (!sub) {
5130 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005131 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 }
Tim Petersced69f82003-09-16 20:30:58 +00005133
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005134 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005135 result = stringlib_find_slice(
5136 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5137 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5138 start, end
5139 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005140 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005141 result = stringlib_rfind_slice(
5142 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5143 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5144 start, end
5145 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005146
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005147 Py_DECREF(str);
5148 Py_DECREF(sub);
5149
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 return result;
5151}
5152
Tim Petersced69f82003-09-16 20:30:58 +00005153static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154int tailmatch(PyUnicodeObject *self,
5155 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005156 Py_ssize_t start,
5157 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 int direction)
5159{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 if (substring->length == 0)
5161 return 1;
5162
Fredrik Lundhc8162812006-05-26 19:33:03 +00005163 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164
5165 end -= substring->length;
5166 if (end < start)
5167 return 0;
5168
5169 if (direction > 0) {
5170 if (Py_UNICODE_MATCH(self, end, substring))
5171 return 1;
5172 } else {
5173 if (Py_UNICODE_MATCH(self, start, substring))
5174 return 1;
5175 }
5176
5177 return 0;
5178}
5179
Martin v. Löwis18e16552006-02-15 17:27:45 +00005180Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005182 Py_ssize_t start,
5183 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 int direction)
5185{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005186 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005187
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 str = PyUnicode_FromObject(str);
5189 if (str == NULL)
5190 return -1;
5191 substr = PyUnicode_FromObject(substr);
5192 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005193 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 return -1;
5195 }
Tim Petersced69f82003-09-16 20:30:58 +00005196
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 result = tailmatch((PyUnicodeObject *)str,
5198 (PyUnicodeObject *)substr,
5199 start, end, direction);
5200 Py_DECREF(str);
5201 Py_DECREF(substr);
5202 return result;
5203}
5204
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205/* Apply fixfct filter to the Unicode object self and return a
5206 reference to the modified object */
5207
Tim Petersced69f82003-09-16 20:30:58 +00005208static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209PyObject *fixup(PyUnicodeObject *self,
5210 int (*fixfct)(PyUnicodeObject *s))
5211{
5212
5213 PyUnicodeObject *u;
5214
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005215 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 if (u == NULL)
5217 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005218
5219 Py_UNICODE_COPY(u->str, self->str, self->length);
5220
Tim Peters7a29bd52001-09-12 03:03:31 +00005221 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 /* fixfct should return TRUE if it modified the buffer. If
5223 FALSE, return a reference to the original buffer instead
5224 (to save space, not time) */
5225 Py_INCREF(self);
5226 Py_DECREF(u);
5227 return (PyObject*) self;
5228 }
5229 return (PyObject*) u;
5230}
5231
Tim Petersced69f82003-09-16 20:30:58 +00005232static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233int fixupper(PyUnicodeObject *self)
5234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 Py_UNICODE *s = self->str;
5237 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005238
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 while (len-- > 0) {
5240 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005241
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 ch = Py_UNICODE_TOUPPER(*s);
5243 if (ch != *s) {
5244 status = 1;
5245 *s = ch;
5246 }
5247 s++;
5248 }
5249
5250 return status;
5251}
5252
Tim Petersced69f82003-09-16 20:30:58 +00005253static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254int fixlower(PyUnicodeObject *self)
5255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005256 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 Py_UNICODE *s = self->str;
5258 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005259
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 while (len-- > 0) {
5261 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005262
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 ch = Py_UNICODE_TOLOWER(*s);
5264 if (ch != *s) {
5265 status = 1;
5266 *s = ch;
5267 }
5268 s++;
5269 }
5270
5271 return status;
5272}
5273
Tim Petersced69f82003-09-16 20:30:58 +00005274static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275int fixswapcase(PyUnicodeObject *self)
5276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005277 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 Py_UNICODE *s = self->str;
5279 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 while (len-- > 0) {
5282 if (Py_UNICODE_ISUPPER(*s)) {
5283 *s = Py_UNICODE_TOLOWER(*s);
5284 status = 1;
5285 } else if (Py_UNICODE_ISLOWER(*s)) {
5286 *s = Py_UNICODE_TOUPPER(*s);
5287 status = 1;
5288 }
5289 s++;
5290 }
5291
5292 return status;
5293}
5294
Tim Petersced69f82003-09-16 20:30:58 +00005295static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296int fixcapitalize(PyUnicodeObject *self)
5297{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005298 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005299 Py_UNICODE *s = self->str;
5300 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005301
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005302 if (len == 0)
5303 return 0;
5304 if (Py_UNICODE_ISLOWER(*s)) {
5305 *s = Py_UNICODE_TOUPPER(*s);
5306 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005308 s++;
5309 while (--len > 0) {
5310 if (Py_UNICODE_ISUPPER(*s)) {
5311 *s = Py_UNICODE_TOLOWER(*s);
5312 status = 1;
5313 }
5314 s++;
5315 }
5316 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317}
5318
5319static
5320int fixtitle(PyUnicodeObject *self)
5321{
5322 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5323 register Py_UNICODE *e;
5324 int previous_is_cased;
5325
5326 /* Shortcut for single character strings */
5327 if (PyUnicode_GET_SIZE(self) == 1) {
5328 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5329 if (*p != ch) {
5330 *p = ch;
5331 return 1;
5332 }
5333 else
5334 return 0;
5335 }
Tim Petersced69f82003-09-16 20:30:58 +00005336
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 e = p + PyUnicode_GET_SIZE(self);
5338 previous_is_cased = 0;
5339 for (; p < e; p++) {
5340 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005341
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 if (previous_is_cased)
5343 *p = Py_UNICODE_TOLOWER(ch);
5344 else
5345 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005346
5347 if (Py_UNICODE_ISLOWER(ch) ||
5348 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 Py_UNICODE_ISTITLE(ch))
5350 previous_is_cased = 1;
5351 else
5352 previous_is_cased = 0;
5353 }
5354 return 1;
5355}
5356
Tim Peters8ce9f162004-08-27 01:49:32 +00005357PyObject *
5358PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359{
Tim Peters8ce9f162004-08-27 01:49:32 +00005360 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005361 const Py_UNICODE blank = ' ';
5362 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005363 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005364 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005365 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5366 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005367 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5368 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005370 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005371 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372
Tim Peters05eba1f2004-08-27 21:32:02 +00005373 fseq = PySequence_Fast(seq, "");
5374 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005375 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005376 }
5377
Tim Peters91879ab2004-08-27 22:35:44 +00005378 /* Grrrr. A codec may be invoked to convert str objects to
5379 * Unicode, and so it's possible to call back into Python code
5380 * during PyUnicode_FromObject(), and so it's possible for a sick
5381 * codec to change the size of fseq (if seq is a list). Therefore
5382 * we have to keep refetching the size -- can't assume seqlen
5383 * is invariant.
5384 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005385 seqlen = PySequence_Fast_GET_SIZE(fseq);
5386 /* If empty sequence, return u"". */
5387 if (seqlen == 0) {
5388 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5389 goto Done;
5390 }
5391 /* If singleton sequence with an exact Unicode, return that. */
5392 if (seqlen == 1) {
5393 item = PySequence_Fast_GET_ITEM(fseq, 0);
5394 if (PyUnicode_CheckExact(item)) {
5395 Py_INCREF(item);
5396 res = (PyUnicodeObject *)item;
5397 goto Done;
5398 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005399 }
5400
Tim Peters05eba1f2004-08-27 21:32:02 +00005401 /* At least two items to join, or one that isn't exact Unicode. */
5402 if (seqlen > 1) {
5403 /* Set up sep and seplen -- they're needed. */
5404 if (separator == NULL) {
5405 sep = &blank;
5406 seplen = 1;
5407 }
5408 else {
5409 internal_separator = PyUnicode_FromObject(separator);
5410 if (internal_separator == NULL)
5411 goto onError;
5412 sep = PyUnicode_AS_UNICODE(internal_separator);
5413 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005414 /* In case PyUnicode_FromObject() mutated seq. */
5415 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005416 }
5417 }
5418
5419 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005420 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005422 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005423 res_p = PyUnicode_AS_UNICODE(res);
5424 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005425
Tim Peters05eba1f2004-08-27 21:32:02 +00005426 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005427 Py_ssize_t itemlen;
5428 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005429
5430 item = PySequence_Fast_GET_ITEM(fseq, i);
5431 /* Convert item to Unicode. */
5432 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5433 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005434 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005435 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005436 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005437 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005438 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005439 item = PyUnicode_FromObject(item);
5440 if (item == NULL)
5441 goto onError;
5442 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005443
Tim Peters91879ab2004-08-27 22:35:44 +00005444 /* In case PyUnicode_FromObject() mutated seq. */
5445 seqlen = PySequence_Fast_GET_SIZE(fseq);
5446
Tim Peters8ce9f162004-08-27 01:49:32 +00005447 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005449 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005450 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005451 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005452 if (i < seqlen - 1) {
5453 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005454 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005455 goto Overflow;
5456 }
5457 if (new_res_used > res_alloc) {
5458 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005459 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005460 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005461 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005462 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005463 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005464 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005465 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005467 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005468 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005470
5471 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005472 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005473 res_p += itemlen;
5474 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005475 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005476 res_p += seplen;
5477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005479 res_used = new_res_used;
5480 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005481
Tim Peters05eba1f2004-08-27 21:32:02 +00005482 /* Shrink res to match the used area; this probably can't fail,
5483 * but it's cheap to check.
5484 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005485 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005486 goto onError;
5487
5488 Done:
5489 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005490 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 return (PyObject *)res;
5492
Tim Peters8ce9f162004-08-27 01:49:32 +00005493 Overflow:
5494 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005495 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005496 Py_DECREF(item);
5497 /* fall through */
5498
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005500 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005501 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005502 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 return NULL;
5504}
5505
Tim Petersced69f82003-09-16 20:30:58 +00005506static
5507PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005508 Py_ssize_t left,
5509 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 Py_UNICODE fill)
5511{
5512 PyUnicodeObject *u;
5513
5514 if (left < 0)
5515 left = 0;
5516 if (right < 0)
5517 right = 0;
5518
Tim Peters7a29bd52001-09-12 03:03:31 +00005519 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 Py_INCREF(self);
5521 return self;
5522 }
5523
5524 u = _PyUnicode_New(left + self->length + right);
5525 if (u) {
5526 if (left)
5527 Py_UNICODE_FILL(u->str, fill, left);
5528 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5529 if (right)
5530 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5531 }
5532
5533 return u;
5534}
5535
/* Append the slice data[left:right] to the list being built.

   Relies on three names in the expanding function: a `PyObject *str`
   scratch variable, the result `list`, and an `onError` label that is
   jumped to when either the slice object cannot be created or
   PyList_Append() reports failure.  The temporary slice object is
   released in both the success and the failure case.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement; use it as `SPLIT_APPEND(data, left, right);`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5546
5547static
5548PyObject *split_whitespace(PyUnicodeObject *self,
5549 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005550 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 register Py_ssize_t i;
5553 register Py_ssize_t j;
5554 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005556 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
5558 for (i = j = 0; i < len; ) {
5559 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005560 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 i++;
5562 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005563 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 i++;
5565 if (j < i) {
5566 if (maxcount-- <= 0)
5567 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005568 SPLIT_APPEND(buf, j, i);
5569 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 i++;
5571 j = i;
5572 }
5573 }
5574 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005575 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 }
5577 return list;
5578
5579 onError:
5580 Py_DECREF(list);
5581 return NULL;
5582}
5583
5584PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005585 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005587 register Py_ssize_t i;
5588 register Py_ssize_t j;
5589 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 PyObject *list;
5591 PyObject *str;
5592 Py_UNICODE *data;
5593
5594 string = PyUnicode_FromObject(string);
5595 if (string == NULL)
5596 return NULL;
5597 data = PyUnicode_AS_UNICODE(string);
5598 len = PyUnicode_GET_SIZE(string);
5599
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 list = PyList_New(0);
5601 if (!list)
5602 goto onError;
5603
5604 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005605 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005608 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
5611 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005612 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 if (i < len) {
5614 if (data[i] == '\r' && i + 1 < len &&
5615 data[i+1] == '\n')
5616 i += 2;
5617 else
5618 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005619 if (keepends)
5620 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 }
Guido van Rossum86662912000-04-11 15:38:46 +00005622 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 j = i;
5624 }
5625 if (j < len) {
5626 SPLIT_APPEND(data, j, len);
5627 }
5628
5629 Py_DECREF(string);
5630 return list;
5631
5632 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005633 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 Py_DECREF(string);
5635 return NULL;
5636}
5637
Tim Petersced69f82003-09-16 20:30:58 +00005638static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639PyObject *split_char(PyUnicodeObject *self,
5640 PyObject *list,
5641 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005644 register Py_ssize_t i;
5645 register Py_ssize_t j;
5646 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005648 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
5650 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005651 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 if (maxcount-- <= 0)
5653 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005654 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 i = j = i + 1;
5656 } else
5657 i++;
5658 }
5659 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005660 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 }
5662 return list;
5663
5664 onError:
5665 Py_DECREF(list);
5666 return NULL;
5667}
5668
Tim Petersced69f82003-09-16 20:30:58 +00005669static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670PyObject *split_substring(PyUnicodeObject *self,
5671 PyObject *list,
5672 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005673 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005675 register Py_ssize_t i;
5676 register Py_ssize_t j;
5677 Py_ssize_t len = self->length;
5678 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 PyObject *str;
5680
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005681 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 if (Py_UNICODE_MATCH(self, i, substring)) {
5683 if (maxcount-- <= 0)
5684 break;
5685 SPLIT_APPEND(self->str, j, i);
5686 i = j = i + sublen;
5687 } else
5688 i++;
5689 }
5690 if (j <= len) {
5691 SPLIT_APPEND(self->str, j, len);
5692 }
5693 return list;
5694
5695 onError:
5696 Py_DECREF(list);
5697 return NULL;
5698}
5699
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005700static
5701PyObject *rsplit_whitespace(PyUnicodeObject *self,
5702 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005704{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005705 register Py_ssize_t i;
5706 register Py_ssize_t j;
5707 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005708 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005709 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005710
5711 for (i = j = len - 1; i >= 0; ) {
5712 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005713 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005714 i--;
5715 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005716 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005717 i--;
5718 if (j > i) {
5719 if (maxcount-- <= 0)
5720 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005721 SPLIT_APPEND(buf, i + 1, j + 1);
5722 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005723 i--;
5724 j = i;
5725 }
5726 }
5727 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005728 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005729 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005730 if (PyList_Reverse(list) < 0)
5731 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005732 return list;
5733
5734 onError:
5735 Py_DECREF(list);
5736 return NULL;
5737}
5738
5739static
5740PyObject *rsplit_char(PyUnicodeObject *self,
5741 PyObject *list,
5742 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005743 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005744{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005745 register Py_ssize_t i;
5746 register Py_ssize_t j;
5747 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005748 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005749 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005750
5751 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005752 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005753 if (maxcount-- <= 0)
5754 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005755 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005756 j = i = i - 1;
5757 } else
5758 i--;
5759 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005760 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005761 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005762 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005763 if (PyList_Reverse(list) < 0)
5764 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765 return list;
5766
5767 onError:
5768 Py_DECREF(list);
5769 return NULL;
5770}
5771
5772static
5773PyObject *rsplit_substring(PyUnicodeObject *self,
5774 PyObject *list,
5775 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005776 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005778 register Py_ssize_t i;
5779 register Py_ssize_t j;
5780 Py_ssize_t len = self->length;
5781 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782 PyObject *str;
5783
5784 for (i = len - sublen, j = len; i >= 0; ) {
5785 if (Py_UNICODE_MATCH(self, i, substring)) {
5786 if (maxcount-- <= 0)
5787 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005788 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005789 j = i;
5790 i -= sublen;
5791 } else
5792 i--;
5793 }
5794 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005795 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005796 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005797 if (PyList_Reverse(list) < 0)
5798 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799 return list;
5800
5801 onError:
5802 Py_DECREF(list);
5803 return NULL;
5804}
5805
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806#undef SPLIT_APPEND
5807
5808static
5809PyObject *split(PyUnicodeObject *self,
5810 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005811 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812{
5813 PyObject *list;
5814
5815 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005816 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
5818 list = PyList_New(0);
5819 if (!list)
5820 return NULL;
5821
5822 if (substring == NULL)
5823 return split_whitespace(self,list,maxcount);
5824
5825 else if (substring->length == 1)
5826 return split_char(self,list,substring->str[0],maxcount);
5827
5828 else if (substring->length == 0) {
5829 Py_DECREF(list);
5830 PyErr_SetString(PyExc_ValueError, "empty separator");
5831 return NULL;
5832 }
5833 else
5834 return split_substring(self,list,substring,maxcount);
5835}
5836
Tim Petersced69f82003-09-16 20:30:58 +00005837static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838PyObject *rsplit(PyUnicodeObject *self,
5839 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005840 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005841{
5842 PyObject *list;
5843
5844 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005845 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005846
5847 list = PyList_New(0);
5848 if (!list)
5849 return NULL;
5850
5851 if (substring == NULL)
5852 return rsplit_whitespace(self,list,maxcount);
5853
5854 else if (substring->length == 1)
5855 return rsplit_char(self,list,substring->str[0],maxcount);
5856
5857 else if (substring->length == 0) {
5858 Py_DECREF(list);
5859 PyErr_SetString(PyExc_ValueError, "empty separator");
5860 return NULL;
5861 }
5862 else
5863 return rsplit_substring(self,list,substring,maxcount);
5864}
5865
5866static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867PyObject *replace(PyUnicodeObject *self,
5868 PyUnicodeObject *str1,
5869 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871{
5872 PyUnicodeObject *u;
5873
5874 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005875 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876
Fredrik Lundh347ee272006-05-24 16:35:18 +00005877 if (str1->length == str2->length) {
5878 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005879 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005880 if (str1->length == 1) {
5881 /* replace characters */
5882 Py_UNICODE u1, u2;
5883 if (!findchar(self->str, self->length, str1->str[0]))
5884 goto nothing;
5885 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5886 if (!u)
5887 return NULL;
5888 Py_UNICODE_COPY(u->str, self->str, self->length);
5889 u1 = str1->str[0];
5890 u2 = str2->str[0];
5891 for (i = 0; i < u->length; i++)
5892 if (u->str[i] == u1) {
5893 if (--maxcount < 0)
5894 break;
5895 u->str[i] = u2;
5896 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005898 i = fastsearch(
5899 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005901 if (i < 0)
5902 goto nothing;
5903 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5904 if (!u)
5905 return NULL;
5906 Py_UNICODE_COPY(u->str, self->str, self->length);
5907 while (i <= self->length - str1->length)
5908 if (Py_UNICODE_MATCH(self, i, str1)) {
5909 if (--maxcount < 0)
5910 break;
5911 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5912 i += str1->length;
5913 } else
5914 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005917
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005918 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005919 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 Py_UNICODE *p;
5921
5922 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005923 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 if (n > maxcount)
5925 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005926 if (n == 0)
5927 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005928 /* new_size = self->length + n * (str2->length - str1->length)); */
5929 delta = (str2->length - str1->length);
5930 if (delta == 0) {
5931 new_size = self->length;
5932 } else {
5933 product = n * (str2->length - str1->length);
5934 if ((product / (str2->length - str1->length)) != n) {
5935 PyErr_SetString(PyExc_OverflowError,
5936 "replace string is too long");
5937 return NULL;
5938 }
5939 new_size = self->length + product;
5940 if (new_size < 0) {
5941 PyErr_SetString(PyExc_OverflowError,
5942 "replace string is too long");
5943 return NULL;
5944 }
5945 }
5946 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005947 if (!u)
5948 return NULL;
5949 i = 0;
5950 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005951 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005952 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005953 while (n-- > 0) {
5954 /* look for next match */
5955 j = i;
5956 while (j <= e) {
5957 if (Py_UNICODE_MATCH(self, j, str1))
5958 break;
5959 j++;
5960 }
5961 if (j > i) {
5962 if (j > e)
5963 break;
5964 /* copy unchanged part [i:j] */
5965 Py_UNICODE_COPY(p, self->str+i, j-i);
5966 p += j - i;
5967 }
5968 /* copy substitution string */
5969 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005970 Py_UNICODE_COPY(p, str2->str, str2->length);
5971 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005972 }
5973 i = j + str1->length;
5974 }
5975 if (i < self->length)
5976 /* copy tail [i:] */
5977 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005978 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005979 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005980 while (n > 0) {
5981 Py_UNICODE_COPY(p, str2->str, str2->length);
5982 p += str2->length;
5983 if (--n <= 0)
5984 break;
5985 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005987 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
5989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991
5992nothing:
5993 /* nothing to replace; return original string (when possible) */
5994 if (PyUnicode_CheckExact(self)) {
5995 Py_INCREF(self);
5996 return (PyObject *) self;
5997 }
5998 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999}
6000
6001/* --- Unicode Object Methods --------------------------------------------- */
6002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006003PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004"S.title() -> unicode\n\
6005\n\
6006Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006007characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
6009static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006010unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 return fixup(self, fixtitle);
6013}
6014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006015PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016"S.capitalize() -> unicode\n\
6017\n\
6018Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006022unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 return fixup(self, fixcapitalize);
6025}
6026
#if 0
/* Disabled: this section is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word in place in the list,
   then join the words back into one string.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t idx = 0;

    /* Split into words */
    list = split(self, NULL, -1);
    if (list == NULL)
        return NULL;

    /* Capitalize each word */
    while (idx < PyList_GET_SIZE(list)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the old word, then store the capitalized one in its slot */
        Py_DECREF(PyList_GET_ITEM(list, idx));
        PyList_SET_ITEM(list, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

onError:
    /* item is NULL here when a step above failed */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6064
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006065/* Argument converter. Coerces to a single unicode character */
6066
6067static int
6068convert_uc(PyObject *obj, void *addr)
6069{
6070 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6071 PyObject *uniobj;
6072 Py_UNICODE *unistr;
6073
6074 uniobj = PyUnicode_FromObject(obj);
6075 if (uniobj == NULL) {
6076 PyErr_SetString(PyExc_TypeError,
6077 "The fill character cannot be converted to Unicode");
6078 return 0;
6079 }
6080 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6081 PyErr_SetString(PyExc_TypeError,
6082 "The fill character must be exactly one character long");
6083 Py_DECREF(uniobj);
6084 return 0;
6085 }
6086 unistr = PyUnicode_AS_UNICODE(uniobj);
6087 *fillcharloc = unistr[0];
6088 Py_DECREF(uniobj);
6089 return 1;
6090}
6091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006092PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006093"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006095Return S centered in a Unicode string of length width. Padding is\n\
6096done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097
6098static PyObject *
6099unicode_center(PyUnicodeObject *self, PyObject *args)
6100{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 Py_ssize_t marg, left;
6102 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006103 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104
Thomas Woutersde017742006-02-16 19:34:37 +00006105 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 return NULL;
6107
Tim Peters7a29bd52001-09-12 03:03:31 +00006108 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 Py_INCREF(self);
6110 return (PyObject*) self;
6111 }
6112
6113 marg = width - self->length;
6114 left = marg / 2 + (marg & width & 1);
6115
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006116 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117}
6118
Marc-André Lemburge5034372000-08-08 08:04:29 +00006119#if 0
6120
6121/* This code should go into some future Unicode collation support
6122 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006123 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006124
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006125/* speedy UTF-16 code point order comparison */
6126/* gleaned from: */
6127/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6128
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006129static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006130{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006131 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006132 0, 0, 0, 0, 0, 0, 0, 0,
6133 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006134 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006135};
6136
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137static int
6138unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6139{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006140 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006141
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 Py_UNICODE *s1 = str1->str;
6143 Py_UNICODE *s2 = str2->str;
6144
6145 len1 = str1->length;
6146 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006147
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006149 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006150
6151 c1 = *s1++;
6152 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006153
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006154 if (c1 > (1<<11) * 26)
6155 c1 += utf16Fixup[c1>>11];
6156 if (c2 > (1<<11) * 26)
6157 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006158 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006159
6160 if (c1 != c2)
6161 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006162
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006163 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 }
6165
6166 return (len1 < len2) ? -1 : (len1 != len2);
6167}
6168
Marc-André Lemburge5034372000-08-08 08:04:29 +00006169#else
6170
6171static int
6172unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6173{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006174 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006175
6176 Py_UNICODE *s1 = str1->str;
6177 Py_UNICODE *s2 = str2->str;
6178
6179 len1 = str1->length;
6180 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006181
Marc-André Lemburge5034372000-08-08 08:04:29 +00006182 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006183 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006184
Fredrik Lundh45714e92001-06-26 16:39:36 +00006185 c1 = *s1++;
6186 c2 = *s2++;
6187
6188 if (c1 != c2)
6189 return (c1 < c2) ? -1 : 1;
6190
Marc-André Lemburge5034372000-08-08 08:04:29 +00006191 len1--; len2--;
6192 }
6193
6194 return (len1 < len2) ? -1 : (len1 != len2);
6195}
6196
6197#endif
6198
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199int PyUnicode_Compare(PyObject *left,
6200 PyObject *right)
6201{
6202 PyUnicodeObject *u = NULL, *v = NULL;
6203 int result;
6204
6205 /* Coerce the two arguments */
6206 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6207 if (u == NULL)
6208 goto onError;
6209 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6210 if (v == NULL)
6211 goto onError;
6212
Thomas Wouters7e474022000-07-16 12:04:32 +00006213 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 if (v == u) {
6215 Py_DECREF(u);
6216 Py_DECREF(v);
6217 return 0;
6218 }
6219
6220 result = unicode_compare(u, v);
6221
6222 Py_DECREF(u);
6223 Py_DECREF(v);
6224 return result;
6225
6226onError:
6227 Py_XDECREF(u);
6228 Py_XDECREF(v);
6229 return -1;
6230}
6231
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006232PyObject *PyUnicode_RichCompare(PyObject *left,
6233 PyObject *right,
6234 int op)
6235{
6236 int result;
6237
6238 result = PyUnicode_Compare(left, right);
6239 if (result == -1 && PyErr_Occurred())
6240 goto onError;
6241
6242 /* Convert the return value to a Boolean */
6243 switch (op) {
6244 case Py_EQ:
6245 result = (result == 0);
6246 break;
6247 case Py_NE:
6248 result = (result != 0);
6249 break;
6250 case Py_LE:
6251 result = (result <= 0);
6252 break;
6253 case Py_GE:
6254 result = (result >= 0);
6255 break;
6256 case Py_LT:
6257 result = (result == -1);
6258 break;
6259 case Py_GT:
6260 result = (result == 1);
6261 break;
6262 }
6263 return PyBool_FromLong(result);
6264
6265 onError:
6266
6267 /* Standard case
6268
6269 Type errors mean that PyUnicode_FromObject() could not convert
6270 one of the arguments (usually the right hand side) to Unicode,
6271 ie. we can't handle the comparison request. However, it is
6272 possible that the other object knows a comparison method, which
6273 is why we return Py_NotImplemented to give the other object a
6274 chance.
6275
6276 */
6277 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6278 PyErr_Clear();
6279 Py_INCREF(Py_NotImplemented);
6280 return Py_NotImplemented;
6281 }
6282 if (op != Py_EQ && op != Py_NE)
6283 return NULL;
6284
6285 /* Equality comparison.
6286
6287 This is a special case: we silence any PyExc_UnicodeDecodeError
6288 and instead turn it into a PyErr_UnicodeWarning.
6289
6290 */
6291 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6292 return NULL;
6293 PyErr_Clear();
6294 if (PyErr_Warn(PyExc_UnicodeWarning,
6295 (op == Py_EQ) ?
6296 "Unicode equal comparison "
6297 "failed to convert both arguments to Unicode - "
6298 "interpreting them as being unequal" :
6299 "Unicode unequal comparison "
6300 "failed to convert both arguments to Unicode - "
6301 "interpreting them as being unequal"
6302 ) < 0)
6303 return NULL;
6304 result = (op == Py_NE);
6305 return PyBool_FromLong(result);
6306}
6307
Guido van Rossum403d68b2000-03-13 15:55:09 +00006308int PyUnicode_Contains(PyObject *container,
6309 PyObject *element)
6310{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006311 PyObject *str, *sub;
6312 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006313
6314 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006315 sub = PyUnicode_FromObject(element);
6316 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006317 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006318 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006319 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006320 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006321
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006322 str = PyUnicode_FromObject(container);
6323 if (!str) {
6324 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006325 return -1;
6326 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006327
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006328 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006329
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006330 Py_DECREF(str);
6331 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006332
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006333 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006334}
6335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336/* Concat to string or Unicode object giving a new Unicode object. */
6337
6338PyObject *PyUnicode_Concat(PyObject *left,
6339 PyObject *right)
6340{
6341 PyUnicodeObject *u = NULL, *v = NULL, *w;
6342
6343 /* Coerce the two arguments */
6344 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6345 if (u == NULL)
6346 goto onError;
6347 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6348 if (v == NULL)
6349 goto onError;
6350
6351 /* Shortcuts */
6352 if (v == unicode_empty) {
6353 Py_DECREF(v);
6354 return (PyObject *)u;
6355 }
6356 if (u == unicode_empty) {
6357 Py_DECREF(u);
6358 return (PyObject *)v;
6359 }
6360
6361 /* Concat the two Unicode strings */
6362 w = _PyUnicode_New(u->length + v->length);
6363 if (w == NULL)
6364 goto onError;
6365 Py_UNICODE_COPY(w->str, u->str, u->length);
6366 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6367
6368 Py_DECREF(u);
6369 Py_DECREF(v);
6370 return (PyObject *)w;
6371
6372onError:
6373 Py_XDECREF(u);
6374 Py_XDECREF(v);
6375 return NULL;
6376}
6377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006378PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379"S.count(sub[, start[, end]]) -> int\n\
6380\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006381Return the number of non-overlapping occurrences of substring sub in\n\
6382Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006383interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
6385static PyObject *
6386unicode_count(PyUnicodeObject *self, PyObject *args)
6387{
6388 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006389 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006390 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 PyObject *result;
6392
Guido van Rossumb8872e62000-05-09 14:14:27 +00006393 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6394 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 return NULL;
6396
6397 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006398 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 if (substring == NULL)
6400 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006401
Fredrik Lundhc8162812006-05-26 19:33:03 +00006402 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006404 result = PyInt_FromSsize_t(
6405 stringlib_count(self->str + start, end - start,
6406 substring->str, substring->length)
6407 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
6409 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006410
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 return result;
6412}
6413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006414PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006415"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006417Encodes S using the codec registered for encoding. encoding defaults\n\
6418to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006419handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6421'xmlcharrefreplace' as well as any other name registered with\n\
6422codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
6424static PyObject *
6425unicode_encode(PyUnicodeObject *self, PyObject *args)
6426{
6427 char *encoding = NULL;
6428 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006429 PyObject *v;
6430
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6432 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006433 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006434 if (v == NULL)
6435 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006436 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6437 PyErr_Format(PyExc_TypeError,
6438 "encoder did not return a string/unicode object "
6439 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006440 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006441 Py_DECREF(v);
6442 return NULL;
6443 }
6444 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006445
6446 onError:
6447 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006448}
6449
6450PyDoc_STRVAR(decode__doc__,
6451"S.decode([encoding[,errors]]) -> string or unicode\n\
6452\n\
6453Decodes S using the codec registered for encoding. encoding defaults\n\
6454to the default encoding. errors may be given to set a different error\n\
6455handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6456a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6457as well as any other name registerd with codecs.register_error that is\n\
6458able to handle UnicodeDecodeErrors.");
6459
6460static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006461unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006462{
6463 char *encoding = NULL;
6464 char *errors = NULL;
6465 PyObject *v;
6466
6467 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6468 return NULL;
6469 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006470 if (v == NULL)
6471 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006472 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6473 PyErr_Format(PyExc_TypeError,
6474 "decoder did not return a string/unicode object "
6475 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006476 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006477 Py_DECREF(v);
6478 return NULL;
6479 }
6480 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006481
6482 onError:
6483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484}
6485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006486PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487"S.expandtabs([tabsize]) -> unicode\n\
6488\n\
6489Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006490If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
6492static PyObject*
6493unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6494{
6495 Py_UNICODE *e;
6496 Py_UNICODE *p;
6497 Py_UNICODE *q;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006498 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 PyUnicodeObject *u;
6500 int tabsize = 8;
6501
6502 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6503 return NULL;
6504
Thomas Wouters7e474022000-07-16 12:04:32 +00006505 /* First pass: determine size of output string */
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006506 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 e = self->str + self->length;
6508 for (p = self->str; p < e; p++)
6509 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006510 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 j += tabsize - (j % tabsize);
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006512 if (old_j > j) {
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00006513 PyErr_SetString(PyExc_OverflowError,
6514 "new string is too long");
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006515 return NULL;
6516 }
6517 old_j = j;
6518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 }
6520 else {
6521 j++;
6522 if (*p == '\n' || *p == '\r') {
6523 i += j;
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00006524 old_j = j = 0;
6525 if (i < 0) {
6526 PyErr_SetString(PyExc_OverflowError,
6527 "new string is too long");
6528 return NULL;
6529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
6531 }
6532
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006533 if ((i + j) < 0) {
6534 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6535 return NULL;
6536 }
6537
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 /* Second pass: create output string and fill it */
6539 u = _PyUnicode_New(i + j);
6540 if (!u)
6541 return NULL;
6542
6543 j = 0;
6544 q = u->str;
6545
6546 for (p = self->str; p < e; p++)
6547 if (*p == '\t') {
6548 if (tabsize > 0) {
6549 i = tabsize - (j % tabsize);
6550 j += i;
6551 while (i--)
6552 *q++ = ' ';
6553 }
6554 }
6555 else {
6556 j++;
6557 *q++ = *p;
6558 if (*p == '\n' || *p == '\r')
6559 j = 0;
6560 }
6561
6562 return (PyObject*) u;
6563}
6564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006565PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566"S.find(sub [,start [,end]]) -> int\n\
6567\n\
6568Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006569such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570arguments start and end are interpreted as in slice notation.\n\
6571\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006572Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573
6574static PyObject *
6575unicode_find(PyUnicodeObject *self, PyObject *args)
6576{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006577 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006578 Py_ssize_t start;
6579 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006580 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
Facundo Batista57d56692007-11-16 18:04:14 +00006582 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006585 result = stringlib_find_slice(
6586 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6587 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6588 start, end
6589 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
6591 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006592
6593 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594}
6595
6596static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006597unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598{
6599 if (index < 0 || index >= self->length) {
6600 PyErr_SetString(PyExc_IndexError, "string index out of range");
6601 return NULL;
6602 }
6603
6604 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6605}
6606
6607static long
6608unicode_hash(PyUnicodeObject *self)
6609{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006610 /* Since Unicode objects compare equal to their ASCII string
6611 counterparts, they should use the individual character values
6612 as basis for their hash value. This is needed to assure that
6613 strings and Unicode objects behave in the same way as
6614 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615
Martin v. Löwis18e16552006-02-15 17:27:45 +00006616 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006617 register Py_UNICODE *p;
6618 register long x;
6619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 if (self->hash != -1)
6621 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006622 len = PyUnicode_GET_SIZE(self);
6623 p = PyUnicode_AS_UNICODE(self);
6624 x = *p << 7;
6625 while (--len >= 0)
6626 x = (1000003*x) ^ *p++;
6627 x ^= PyUnicode_GET_SIZE(self);
6628 if (x == -1)
6629 x = -2;
6630 self->hash = x;
6631 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632}
6633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006634PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635"S.index(sub [,start [,end]]) -> int\n\
6636\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006637Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638
6639static PyObject *
6640unicode_index(PyUnicodeObject *self, PyObject *args)
6641{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006642 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006643 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006644 Py_ssize_t start;
6645 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
Facundo Batista57d56692007-11-16 18:04:14 +00006647 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006650 result = stringlib_find_slice(
6651 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6652 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6653 start, end
6654 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
6656 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006657
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 if (result < 0) {
6659 PyErr_SetString(PyExc_ValueError, "substring not found");
6660 return NULL;
6661 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006662
Martin v. Löwis18e16552006-02-15 17:27:45 +00006663 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664}
6665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006667"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006669Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006673unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674{
6675 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6676 register const Py_UNICODE *e;
6677 int cased;
6678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 /* Shortcut for single character strings */
6680 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006681 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006683 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006684 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006685 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006686
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 e = p + PyUnicode_GET_SIZE(self);
6688 cased = 0;
6689 for (; p < e; p++) {
6690 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006691
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006693 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 else if (!cased && Py_UNICODE_ISLOWER(ch))
6695 cased = 1;
6696 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006697 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698}
6699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006700PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006701"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006703Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006704at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705
6706static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006707unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
6709 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6710 register const Py_UNICODE *e;
6711 int cased;
6712
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 /* Shortcut for single character strings */
6714 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006715 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006717 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006718 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006719 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006720
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 e = p + PyUnicode_GET_SIZE(self);
6722 cased = 0;
6723 for (; p < e; p++) {
6724 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006725
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006727 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 else if (!cased && Py_UNICODE_ISUPPER(ch))
6729 cased = 1;
6730 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006731 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732}
6733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006734PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006735"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006737Return True if S is a titlecased string and there is at least one\n\
6738character in S, i.e. upper- and titlecase characters may only\n\
6739follow uncased characters and lowercase characters only cased ones.\n\
6740Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741
6742static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006743unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744{
6745 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6746 register const Py_UNICODE *e;
6747 int cased, previous_is_cased;
6748
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 /* Shortcut for single character strings */
6750 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006751 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6752 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006754 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006755 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006756 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006757
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 e = p + PyUnicode_GET_SIZE(self);
6759 cased = 0;
6760 previous_is_cased = 0;
6761 for (; p < e; p++) {
6762 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6765 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006766 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 previous_is_cased = 1;
6768 cased = 1;
6769 }
6770 else if (Py_UNICODE_ISLOWER(ch)) {
6771 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 previous_is_cased = 1;
6774 cased = 1;
6775 }
6776 else
6777 previous_is_cased = 0;
6778 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006779 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780}
6781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006785Return True if all characters in S are whitespace\n\
6786and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
6788static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006789unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790{
6791 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6792 register const Py_UNICODE *e;
6793
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 /* Shortcut for single character strings */
6795 if (PyUnicode_GET_SIZE(self) == 1 &&
6796 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006797 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006799 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006800 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006801 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006802
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 e = p + PyUnicode_GET_SIZE(self);
6804 for (; p < e; p++) {
6805 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006808 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809}
6810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006811PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006813\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006814Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006815and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006816
6817static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006818unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006819{
6820 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6821 register const Py_UNICODE *e;
6822
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006823 /* Shortcut for single character strings */
6824 if (PyUnicode_GET_SIZE(self) == 1 &&
6825 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006827
6828 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006829 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006830 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006831
6832 e = p + PyUnicode_GET_SIZE(self);
6833 for (; p < e; p++) {
6834 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006835 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006836 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006837 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006838}
6839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006840PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006841"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006842\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006843Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006844and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006845
6846static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006847unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006848{
6849 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6850 register const Py_UNICODE *e;
6851
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006852 /* Shortcut for single character strings */
6853 if (PyUnicode_GET_SIZE(self) == 1 &&
6854 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856
6857 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006858 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006859 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006860
6861 e = p + PyUnicode_GET_SIZE(self);
6862 for (; p < e; p++) {
6863 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006865 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006866 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006867}
6868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006869PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874
6875static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006876unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877{
6878 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6879 register const Py_UNICODE *e;
6880
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 /* Shortcut for single character strings */
6882 if (PyUnicode_GET_SIZE(self) == 1 &&
6883 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006886 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006887 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006888 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006889
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 e = p + PyUnicode_GET_SIZE(self);
6891 for (; p < e; p++) {
6892 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006895 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006898PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006901Return True if all characters in S are digits\n\
6902and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903
6904static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006905unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906{
6907 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6908 register const Py_UNICODE *e;
6909
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 /* Shortcut for single character strings */
6911 if (PyUnicode_GET_SIZE(self) == 1 &&
6912 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006913 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006915 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006916 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006917 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006918
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 e = p + PyUnicode_GET_SIZE(self);
6920 for (; p < e; p++) {
6921 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006924 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006927PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006931False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932
6933static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006934unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935{
6936 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6937 register const Py_UNICODE *e;
6938
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 /* Shortcut for single character strings */
6940 if (PyUnicode_GET_SIZE(self) == 1 &&
6941 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006942 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006944 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006945 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006946 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006947
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 e = p + PyUnicode_GET_SIZE(self);
6949 for (; p < e; p++) {
6950 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006951 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954}
6955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006956PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957"S.join(sequence) -> unicode\n\
6958\n\
6959Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006960sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961
6962static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006963unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006965 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966}
6967
Martin v. Löwis18e16552006-02-15 17:27:45 +00006968static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969unicode_length(PyUnicodeObject *self)
6970{
6971 return self->length;
6972}
6973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006974PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006975"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976\n\
6977Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006978done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979
6980static PyObject *
6981unicode_ljust(PyUnicodeObject *self, PyObject *args)
6982{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006983 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006984 Py_UNICODE fillchar = ' ';
6985
Martin v. Löwis412fb672006-04-13 06:34:32 +00006986 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 return NULL;
6988
Tim Peters7a29bd52001-09-12 03:03:31 +00006989 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 Py_INCREF(self);
6991 return (PyObject*) self;
6992 }
6993
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006994 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995}
6996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006997PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998"S.lower() -> unicode\n\
6999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007000Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001
7002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007003unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 return fixup(self, fixlower);
7006}
7007
/* Strip modes.  The values double as indices into stripformat[] below,
   so the two must be kept in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format string for each strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for a strip mode: the format string minus its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7016
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007017/* externally visible for str.strip(unicode) */
7018PyObject *
7019_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7020{
7021 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007022 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007023 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007024 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7025 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007026
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007027 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7028
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007029 i = 0;
7030 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007031 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7032 i++;
7033 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007034 }
7035
7036 j = len;
7037 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007038 do {
7039 j--;
7040 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7041 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007042 }
7043
7044 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007045 Py_INCREF(self);
7046 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007047 }
7048 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007049 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007050}
7051
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052
7053static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007056 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007057 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058
7059 i = 0;
7060 if (striptype != RIGHTSTRIP) {
7061 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7062 i++;
7063 }
7064 }
7065
7066 j = len;
7067 if (striptype != LEFTSTRIP) {
7068 do {
7069 j--;
7070 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7071 j++;
7072 }
7073
7074 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7075 Py_INCREF(self);
7076 return (PyObject*)self;
7077 }
7078 else
7079 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080}
7081
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082
7083static PyObject *
7084do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7085{
7086 PyObject *sep = NULL;
7087
7088 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7089 return NULL;
7090
7091 if (sep != NULL && sep != Py_None) {
7092 if (PyUnicode_Check(sep))
7093 return _PyUnicode_XStrip(self, striptype, sep);
7094 else if (PyString_Check(sep)) {
7095 PyObject *res;
7096 sep = PyUnicode_FromObject(sep);
7097 if (sep==NULL)
7098 return NULL;
7099 res = _PyUnicode_XStrip(self, striptype, sep);
7100 Py_DECREF(sep);
7101 return res;
7102 }
7103 else {
7104 PyErr_Format(PyExc_TypeError,
7105 "%s arg must be None, unicode or str",
7106 STRIPNAME(striptype));
7107 return NULL;
7108 }
7109 }
7110
7111 return do_strip(self, striptype);
7112}
7113
7114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007115PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007116"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117\n\
7118Return a copy of the string S with leading and trailing\n\
7119whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007120If chars is given and not None, remove characters in chars instead.\n\
7121If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007122
7123static PyObject *
7124unicode_strip(PyUnicodeObject *self, PyObject *args)
7125{
7126 if (PyTuple_GET_SIZE(args) == 0)
7127 return do_strip(self, BOTHSTRIP); /* Common case */
7128 else
7129 return do_argstrip(self, BOTHSTRIP, args);
7130}
7131
7132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007133PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007134"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007135\n\
7136Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007137If chars is given and not None, remove characters in chars instead.\n\
7138If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007139
7140static PyObject *
7141unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7142{
7143 if (PyTuple_GET_SIZE(args) == 0)
7144 return do_strip(self, LEFTSTRIP); /* Common case */
7145 else
7146 return do_argstrip(self, LEFTSTRIP, args);
7147}
7148
7149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007150PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007151"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007152\n\
7153Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007154If chars is given and not None, remove characters in chars instead.\n\
7155If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007156
7157static PyObject *
7158unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7159{
7160 if (PyTuple_GET_SIZE(args) == 0)
7161 return do_strip(self, RIGHTSTRIP); /* Common case */
7162 else
7163 return do_argstrip(self, RIGHTSTRIP, args);
7164}
7165
7166
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007168unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169{
7170 PyUnicodeObject *u;
7171 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007172 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007173 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174
7175 if (len < 0)
7176 len = 0;
7177
Tim Peters7a29bd52001-09-12 03:03:31 +00007178 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 /* no repeat, return original string */
7180 Py_INCREF(str);
7181 return (PyObject*) str;
7182 }
Tim Peters8f422462000-09-09 06:13:41 +00007183
7184 /* ensure # of chars needed doesn't overflow int and # of bytes
7185 * needed doesn't overflow size_t
7186 */
7187 nchars = len * str->length;
7188 if (len && nchars / len != str->length) {
7189 PyErr_SetString(PyExc_OverflowError,
7190 "repeated string is too long");
7191 return NULL;
7192 }
7193 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7194 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7195 PyErr_SetString(PyExc_OverflowError,
7196 "repeated string is too long");
7197 return NULL;
7198 }
7199 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 if (!u)
7201 return NULL;
7202
7203 p = u->str;
7204
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007205 if (str->length == 1 && len > 0) {
7206 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007207 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007208 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007209 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007210 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007211 done = str->length;
7212 }
7213 while (done < nchars) {
7214 int n = (done <= nchars-done) ? done : nchars-done;
7215 Py_UNICODE_COPY(p+done, p, n);
7216 done += n;
7217 }
7218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219
7220 return (PyObject*) u;
7221}
7222
7223PyObject *PyUnicode_Replace(PyObject *obj,
7224 PyObject *subobj,
7225 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007226 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227{
7228 PyObject *self;
7229 PyObject *str1;
7230 PyObject *str2;
7231 PyObject *result;
7232
7233 self = PyUnicode_FromObject(obj);
7234 if (self == NULL)
7235 return NULL;
7236 str1 = PyUnicode_FromObject(subobj);
7237 if (str1 == NULL) {
7238 Py_DECREF(self);
7239 return NULL;
7240 }
7241 str2 = PyUnicode_FromObject(replobj);
7242 if (str2 == NULL) {
7243 Py_DECREF(self);
7244 Py_DECREF(str1);
7245 return NULL;
7246 }
Tim Petersced69f82003-09-16 20:30:58 +00007247 result = replace((PyUnicodeObject *)self,
7248 (PyUnicodeObject *)str1,
7249 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 maxcount);
7251 Py_DECREF(self);
7252 Py_DECREF(str1);
7253 Py_DECREF(str2);
7254 return result;
7255}
7256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007257PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258"S.replace (old, new[, maxsplit]) -> unicode\n\
7259\n\
7260Return a copy of S with all occurrences of substring\n\
7261old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007262given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
7264static PyObject*
7265unicode_replace(PyUnicodeObject *self, PyObject *args)
7266{
7267 PyUnicodeObject *str1;
7268 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007269 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 PyObject *result;
7271
Martin v. Löwis18e16552006-02-15 17:27:45 +00007272 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 return NULL;
7274 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7275 if (str1 == NULL)
7276 return NULL;
7277 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007278 if (str2 == NULL) {
7279 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007281 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282
7283 result = replace(self, str1, str2, maxcount);
7284
7285 Py_DECREF(str1);
7286 Py_DECREF(str2);
7287 return result;
7288}
7289
7290static
7291PyObject *unicode_repr(PyObject *unicode)
7292{
7293 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7294 PyUnicode_GET_SIZE(unicode),
7295 1);
7296}
7297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007298PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299"S.rfind(sub [,start [,end]]) -> int\n\
7300\n\
7301Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007302such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303arguments start and end are interpreted as in slice notation.\n\
7304\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007305Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
7307static PyObject *
7308unicode_rfind(PyUnicodeObject *self, PyObject *args)
7309{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007310 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007311 Py_ssize_t start;
7312 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007313 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
Facundo Batista57d56692007-11-16 18:04:14 +00007315 if (!_ParseTupleFinds(args, &substring, &start, &end))
7316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007318 result = stringlib_rfind_slice(
7319 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7320 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7321 start, end
7322 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
7324 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007325
7326 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327}
7328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007329PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330"S.rindex(sub [,start [,end]]) -> int\n\
7331\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007332Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333
7334static PyObject *
7335unicode_rindex(PyUnicodeObject *self, PyObject *args)
7336{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007337 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007338 Py_ssize_t start;
7339 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007340 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
Facundo Batista57d56692007-11-16 18:04:14 +00007342 if (!_ParseTupleFinds(args, &substring, &start, &end))
7343 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007345 result = stringlib_rfind_slice(
7346 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7347 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7348 start, end
7349 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350
7351 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007352
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 if (result < 0) {
7354 PyErr_SetString(PyExc_ValueError, "substring not found");
7355 return NULL;
7356 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358}
7359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007360PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007361"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362\n\
7363Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007364done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365
7366static PyObject *
7367unicode_rjust(PyUnicodeObject *self, PyObject *args)
7368{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007369 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007370 Py_UNICODE fillchar = ' ';
7371
Martin v. Löwis412fb672006-04-13 06:34:32 +00007372 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 return NULL;
7374
Tim Peters7a29bd52001-09-12 03:03:31 +00007375 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 Py_INCREF(self);
7377 return (PyObject*) self;
7378 }
7379
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007380 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381}
7382
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007384unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385{
7386 /* standard clamping */
7387 if (start < 0)
7388 start = 0;
7389 if (end < 0)
7390 end = 0;
7391 if (end > self->length)
7392 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007393 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394 /* full slice, return original string */
7395 Py_INCREF(self);
7396 return (PyObject*) self;
7397 }
7398 if (start > end)
7399 start = end;
7400 /* copy slice */
7401 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7402 end - start);
7403}
7404
7405PyObject *PyUnicode_Split(PyObject *s,
7406 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007407 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408{
7409 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007410
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 s = PyUnicode_FromObject(s);
7412 if (s == NULL)
7413 return NULL;
7414 if (sep != NULL) {
7415 sep = PyUnicode_FromObject(sep);
7416 if (sep == NULL) {
7417 Py_DECREF(s);
7418 return NULL;
7419 }
7420 }
7421
7422 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7423
7424 Py_DECREF(s);
7425 Py_XDECREF(sep);
7426 return result;
7427}
7428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007429PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430"S.split([sep [,maxsplit]]) -> list of strings\n\
7431\n\
7432Return a list of the words in S, using sep as the\n\
7433delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007434splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007435any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436
7437static PyObject*
7438unicode_split(PyUnicodeObject *self, PyObject *args)
7439{
7440 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
Martin v. Löwis18e16552006-02-15 17:27:45 +00007443 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 return NULL;
7445
7446 if (substring == Py_None)
7447 return split(self, NULL, maxcount);
7448 else if (PyUnicode_Check(substring))
7449 return split(self, (PyUnicodeObject *)substring, maxcount);
7450 else
7451 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7452}
7453
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007454PyObject *
7455PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7456{
7457 PyObject* str_obj;
7458 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007459 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007460
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007461 str_obj = PyUnicode_FromObject(str_in);
7462 if (!str_obj)
7463 return NULL;
7464 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007465 if (!sep_obj) {
7466 Py_DECREF(str_obj);
7467 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007468 }
7469
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007470 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007471 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7472 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7473 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007474
Fredrik Lundhb9479482006-05-26 17:22:38 +00007475 Py_DECREF(sep_obj);
7476 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007477
7478 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007479}
7480
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007481
7482PyObject *
7483PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7484{
7485 PyObject* str_obj;
7486 PyObject* sep_obj;
7487 PyObject* out;
7488
7489 str_obj = PyUnicode_FromObject(str_in);
7490 if (!str_obj)
7491 return NULL;
7492 sep_obj = PyUnicode_FromObject(sep_in);
7493 if (!sep_obj) {
7494 Py_DECREF(str_obj);
7495 return NULL;
7496 }
7497
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007498 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007499 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7500 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7501 );
7502
7503 Py_DECREF(sep_obj);
7504 Py_DECREF(str_obj);
7505
7506 return out;
7507}
7508
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007509PyDoc_STRVAR(partition__doc__,
7510"S.partition(sep) -> (head, sep, tail)\n\
7511\n\
7512Searches for the separator sep in S, and returns the part before it,\n\
7513the separator itself, and the part after it. If the separator is not\n\
7514found, returns S and two empty strings.");
7515
7516static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007517unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007518{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007519 return PyUnicode_Partition((PyObject *)self, separator);
7520}
7521
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007522PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007523"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007524\n\
7525Searches for the separator sep in S, starting at the end of S, and returns\n\
7526the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007527separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007528
7529static PyObject*
7530unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7531{
7532 return PyUnicode_RPartition((PyObject *)self, separator);
7533}
7534
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007535PyObject *PyUnicode_RSplit(PyObject *s,
7536 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007537 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007538{
7539 PyObject *result;
7540
7541 s = PyUnicode_FromObject(s);
7542 if (s == NULL)
7543 return NULL;
7544 if (sep != NULL) {
7545 sep = PyUnicode_FromObject(sep);
7546 if (sep == NULL) {
7547 Py_DECREF(s);
7548 return NULL;
7549 }
7550 }
7551
7552 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7553
7554 Py_DECREF(s);
7555 Py_XDECREF(sep);
7556 return result;
7557}
7558
7559PyDoc_STRVAR(rsplit__doc__,
7560"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7561\n\
7562Return a list of the words in S, using sep as the\n\
7563delimiter string, starting at the end of the string and\n\
7564working to the front. If maxsplit is given, at most maxsplit\n\
7565splits are done. If sep is not specified, any whitespace string\n\
7566is a separator.");
7567
7568static PyObject*
7569unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7570{
7571 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007572 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007573
Martin v. Löwis18e16552006-02-15 17:27:45 +00007574 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007575 return NULL;
7576
7577 if (substring == Py_None)
7578 return rsplit(self, NULL, maxcount);
7579 else if (PyUnicode_Check(substring))
7580 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7581 else
7582 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7583}
7584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007585PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007586"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587\n\
7588Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007589Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007590is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
7592static PyObject*
7593unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7594{
Guido van Rossum86662912000-04-11 15:38:46 +00007595 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596
Guido van Rossum86662912000-04-11 15:38:46 +00007597 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 return NULL;
7599
Guido van Rossum86662912000-04-11 15:38:46 +00007600 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601}
7602
7603static
7604PyObject *unicode_str(PyUnicodeObject *self)
7605{
Fred Drakee4315f52000-05-09 19:53:39 +00007606 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607}
7608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007609PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610"S.swapcase() -> unicode\n\
7611\n\
7612Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007613and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614
7615static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007616unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618 return fixup(self, fixswapcase);
7619}
7620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007621PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622"S.translate(table) -> unicode\n\
7623\n\
7624Return a copy of the string S, where all characters have been mapped\n\
7625through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007626Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7627Unmapped characters are left untouched. Characters mapped to None\n\
7628are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
7630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007631unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632{
Tim Petersced69f82003-09-16 20:30:58 +00007633 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007635 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 "ignore");
7637}
7638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007639PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640"S.upper() -> unicode\n\
7641\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007642Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643
7644static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007645unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 return fixup(self, fixupper);
7648}
7649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651"S.zfill(width) -> unicode\n\
7652\n\
7653Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007654of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
7656static PyObject *
7657unicode_zfill(PyUnicodeObject *self, PyObject *args)
7658{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007659 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660 PyUnicodeObject *u;
7661
Martin v. Löwis18e16552006-02-15 17:27:45 +00007662 Py_ssize_t width;
7663 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664 return NULL;
7665
7666 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007667 if (PyUnicode_CheckExact(self)) {
7668 Py_INCREF(self);
7669 return (PyObject*) self;
7670 }
7671 else
7672 return PyUnicode_FromUnicode(
7673 PyUnicode_AS_UNICODE(self),
7674 PyUnicode_GET_SIZE(self)
7675 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 }
7677
7678 fill = width - self->length;
7679
7680 u = pad(self, fill, 0, '0');
7681
Walter Dörwald068325e2002-04-15 13:36:47 +00007682 if (u == NULL)
7683 return NULL;
7684
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685 if (u->str[fill] == '+' || u->str[fill] == '-') {
7686 /* move sign to beginning of string */
7687 u->str[0] = u->str[fill];
7688 u->str[fill] = '0';
7689 }
7690
7691 return (PyObject*) u;
7692}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
#if 0
/* Compiled-out debugging aid: returns the file-level counter `numfree`
   as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong((long) numfree);
}
#endif
7701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007702PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007703"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007705Return True if S starts with the specified prefix, False otherwise.\n\
7706With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007707With optional end, stop comparing S at that position.\n\
7708prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709
7710static PyObject *
7711unicode_startswith(PyUnicodeObject *self,
7712 PyObject *args)
7713{
Georg Brandl24250812006-06-09 18:45:48 +00007714 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007716 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007717 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007718 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
Georg Brandl24250812006-06-09 18:45:48 +00007720 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007721 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007723 if (PyTuple_Check(subobj)) {
7724 Py_ssize_t i;
7725 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7726 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7727 PyTuple_GET_ITEM(subobj, i));
7728 if (substring == NULL)
7729 return NULL;
7730 result = tailmatch(self, substring, start, end, -1);
7731 Py_DECREF(substring);
7732 if (result) {
7733 Py_RETURN_TRUE;
7734 }
7735 }
7736 /* nothing matched */
7737 Py_RETURN_FALSE;
7738 }
7739 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007741 return NULL;
7742 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007744 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745}
7746
7747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007749"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007751Return True if S ends with the specified suffix, False otherwise.\n\
7752With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007753With optional end, stop comparing S at that position.\n\
7754suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject *
7757unicode_endswith(PyUnicodeObject *self,
7758 PyObject *args)
7759{
Georg Brandl24250812006-06-09 18:45:48 +00007760 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007762 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007763 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007764 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
Georg Brandl24250812006-06-09 18:45:48 +00007766 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7767 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007769 if (PyTuple_Check(subobj)) {
7770 Py_ssize_t i;
7771 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7772 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7773 PyTuple_GET_ITEM(subobj, i));
7774 if (substring == NULL)
7775 return NULL;
7776 result = tailmatch(self, substring, start, end, +1);
7777 Py_DECREF(substring);
7778 if (result) {
7779 Py_RETURN_TRUE;
7780 }
7781 }
7782 Py_RETURN_FALSE;
7783 }
7784 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787
Georg Brandl24250812006-06-09 18:45:48 +00007788 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007790 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791}
7792
7793
Eric Smitha9f7d622008-02-17 19:46:49 +00007794/* Implements do_string_format, which is unicode because of stringlib */
7795#include "stringlib/string_format.h"
7796
7797PyDoc_STRVAR(format__doc__,
7798"S.format(*args, **kwargs) -> unicode\n\
7799\n\
7800");
7801
7802PyDoc_STRVAR(p_format__doc__,
7803"S.__format__(format_spec) -> unicode\n\
7804\n\
7805");
7806
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007807
7808static PyObject *
7809unicode_getnewargs(PyUnicodeObject *v)
7810{
7811 return Py_BuildValue("(u#)", v->str, v->length);
7812}
7813
7814
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815static PyMethodDef unicode_methods[] = {
7816
7817 /* Order is according to common usage: often used methods should
7818 appear first, since lookup is done sequentially. */
7819
Georg Brandlecdc0a92006-03-30 12:19:07 +00007820 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007821 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7822 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007823 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007824 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7825 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7826 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7827 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7828 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7829 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7830 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007831 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007832 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7833 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7834 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007835 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007836 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007837/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7838 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7839 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7840 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007841 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007842 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007843 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007844 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007845 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7846 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7847 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7848 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7849 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7850 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7851 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7852 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7853 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7854 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7855 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7856 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7857 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7858 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007859 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007860 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7861 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7862 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7863 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00007864#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007865 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866#endif
7867
7868#if 0
7869 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007870 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871#endif
7872
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007873 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 {NULL, NULL}
7875};
7876
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007877static PyObject *
7878unicode_mod(PyObject *v, PyObject *w)
7879{
7880 if (!PyUnicode_Check(v)) {
7881 Py_INCREF(Py_NotImplemented);
7882 return Py_NotImplemented;
7883 }
7884 return PyUnicode_Format(v, w);
7885}
7886
7887static PyNumberMethods unicode_as_number = {
7888 0, /*nb_add*/
7889 0, /*nb_subtract*/
7890 0, /*nb_multiply*/
7891 0, /*nb_divide*/
7892 unicode_mod, /*nb_remainder*/
7893};
7894
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007896 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007897 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007898 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7899 (ssizeargfunc) unicode_getitem, /* sq_item */
7900 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 0, /* sq_ass_item */
7902 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007903 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904};
7905
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007906static PyObject*
7907unicode_subscript(PyUnicodeObject* self, PyObject* item)
7908{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007909 if (PyIndex_Check(item)) {
7910 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007911 if (i == -1 && PyErr_Occurred())
7912 return NULL;
7913 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007914 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007915 return unicode_getitem(self, i);
7916 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007917 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007918 Py_UNICODE* source_buf;
7919 Py_UNICODE* result_buf;
7920 PyObject* result;
7921
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007922 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007923 &start, &stop, &step, &slicelength) < 0) {
7924 return NULL;
7925 }
7926
7927 if (slicelength <= 0) {
7928 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007929 } else if (start == 0 && step == 1 && slicelength == self->length &&
7930 PyUnicode_CheckExact(self)) {
7931 Py_INCREF(self);
7932 return (PyObject *)self;
7933 } else if (step == 1) {
7934 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007935 } else {
7936 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007937 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7938 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007939
7940 if (result_buf == NULL)
7941 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007942
7943 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7944 result_buf[i] = source_buf[cur];
7945 }
Tim Petersced69f82003-09-16 20:30:58 +00007946
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007947 result = PyUnicode_FromUnicode(result_buf, slicelength);
7948 PyMem_FREE(result_buf);
7949 return result;
7950 }
7951 } else {
7952 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7953 return NULL;
7954 }
7955}
7956
7957static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007958 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007959 (binaryfunc)unicode_subscript, /* mp_subscript */
7960 (objobjargproc)0, /* mp_ass_subscript */
7961};
7962
Martin v. Löwis18e16552006-02-15 17:27:45 +00007963static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007965 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 const void **ptr)
7967{
7968 if (index != 0) {
7969 PyErr_SetString(PyExc_SystemError,
7970 "accessing non-existent unicode segment");
7971 return -1;
7972 }
7973 *ptr = (void *) self->str;
7974 return PyUnicode_GET_DATA_SIZE(self);
7975}
7976
Martin v. Löwis18e16552006-02-15 17:27:45 +00007977static Py_ssize_t
7978unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 const void **ptr)
7980{
7981 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007982 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 return -1;
7984}
7985
7986static int
7987unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007988 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989{
7990 if (lenp)
7991 *lenp = PyUnicode_GET_DATA_SIZE(self);
7992 return 1;
7993}
7994
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007995static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007997 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 const void **ptr)
7999{
8000 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008001
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 if (index != 0) {
8003 PyErr_SetString(PyExc_SystemError,
8004 "accessing non-existent unicode segment");
8005 return -1;
8006 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008007 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 if (str == NULL)
8009 return -1;
8010 *ptr = (void *) PyString_AS_STRING(str);
8011 return PyString_GET_SIZE(str);
8012}
8013
8014/* Helpers for PyUnicode_Format() */
8015
8016static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008017getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008019 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 if (argidx < arglen) {
8021 (*p_argidx)++;
8022 if (arglen < 0)
8023 return args;
8024 else
8025 return PyTuple_GetItem(args, argidx);
8026 }
8027 PyErr_SetString(PyExc_TypeError,
8028 "not enough arguments for format string");
8029 return NULL;
8030}
8031
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST (1<<0)  /* '-' flag, or a negative '*' width */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
8037
Martin v. Löwis18e16552006-02-15 17:27:45 +00008038static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008039strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008041 register Py_ssize_t i;
8042 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 for (i = len - 1; i >= 0; i--)
8044 buffer[i] = (Py_UNICODE) charbuffer[i];
8045
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 return len;
8047}
8048
Neal Norwitzfc76d632006-01-10 06:03:13 +00008049static int
8050doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8051{
Tim Peters15231542006-02-16 01:08:01 +00008052 Py_ssize_t result;
8053
Neal Norwitzfc76d632006-01-10 06:03:13 +00008054 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008055 result = strtounicode(buffer, (char *)buffer);
8056 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008057}
8058
8059static int
8060longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8061{
Tim Peters15231542006-02-16 01:08:01 +00008062 Py_ssize_t result;
8063
Neal Norwitzfc76d632006-01-10 06:03:13 +00008064 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008065 result = strtounicode(buffer, (char *)buffer);
8066 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008067}
8068
Guido van Rossum078151d2002-08-11 04:24:12 +00008069/* XXX To save some code duplication, formatfloat/long/int could have been
8070 shared with stringobject.c, converting from 8-bit to Unicode after the
8071 formatting is done. */
8072
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073static int
8074formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008075 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 int flags,
8077 int prec,
8078 int type,
8079 PyObject *v)
8080{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008081 /* fmt = '%#.' + `prec` + `type`
8082 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 char fmt[20];
8084 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008085
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 x = PyFloat_AsDouble(v);
8087 if (x == -1.0 && PyErr_Occurred())
8088 return -1;
8089 if (prec < 0)
8090 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8092 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008093 /* Worst case length calc to ensure no buffer overrun:
8094
8095 'g' formats:
8096 fmt = %#.<prec>g
8097 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8098 for any double rep.)
8099 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8100
8101 'f' formats:
8102 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8103 len = 1 + 50 + 1 + prec = 52 + prec
8104
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008105 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008106 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008107
8108 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008109 if (((type == 'g' || type == 'G') &&
8110 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008111 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008112 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008113 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008114 return -1;
8115 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008116 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8117 (flags&F_ALT) ? "#" : "",
8118 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008119 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120}
8121
Tim Peters38fd5b62000-09-21 05:43:11 +00008122static PyObject*
8123formatlong(PyObject *val, int flags, int prec, int type)
8124{
8125 char *buf;
8126 int i, len;
8127 PyObject *str; /* temporary string object. */
8128 PyUnicodeObject *result;
8129
8130 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8131 if (!str)
8132 return NULL;
8133 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008134 if (!result) {
8135 Py_DECREF(str);
8136 return NULL;
8137 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008138 for (i = 0; i < len; i++)
8139 result->str[i] = buf[i];
8140 result->str[len] = 0;
8141 Py_DECREF(str);
8142 return (PyObject*)result;
8143}
8144
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145static int
8146formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008147 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 int flags,
8149 int prec,
8150 int type,
8151 PyObject *v)
8152{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008153 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008154 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8155 * + 1 + 1
8156 * = 24
8157 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008158 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008159 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 long x;
8161
8162 x = PyInt_AsLong(v);
8163 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008164 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008165 if (x < 0 && type == 'u') {
8166 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008167 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008168 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8169 sign = "-";
8170 else
8171 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008173 prec = 1;
8174
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008175 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8176 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008177 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008178 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008179 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008180 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008181 return -1;
8182 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008183
8184 if ((flags & F_ALT) &&
8185 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008186 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008187 * of issues that cause pain:
8188 * - when 0 is being converted, the C standard leaves off
8189 * the '0x' or '0X', which is inconsistent with other
8190 * %#x/%#X conversions and inconsistent with Python's
8191 * hex() function
8192 * - there are platforms that violate the standard and
8193 * convert 0 with the '0x' or '0X'
8194 * (Metrowerks, Compaq Tru64)
8195 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008196 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008197 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008198 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008199 * We can achieve the desired consistency by inserting our
8200 * own '0x' or '0X' prefix, and substituting %x/%X in place
8201 * of %#x/%#X.
8202 *
8203 * Note that this is the same approach as used in
8204 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008205 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008206 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8207 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008208 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008209 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008210 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8211 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008212 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008213 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008214 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008215 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008216 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008217 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218}
8219
8220static int
8221formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008222 size_t buflen,
8223 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008225 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008226 if (PyUnicode_Check(v)) {
8227 if (PyUnicode_GET_SIZE(v) != 1)
8228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008232 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008233 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008234 goto onError;
8235 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237
8238 else {
8239 /* Integer input truncated to a character */
8240 long x;
8241 x = PyInt_AsLong(v);
8242 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008243 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008244#ifdef Py_UNICODE_WIDE
8245 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008246 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008247 "%c arg not in range(0x110000) "
8248 "(wide Python build)");
8249 return -1;
8250 }
8251#else
8252 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008253 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008254 "%c arg not in range(0x10000) "
8255 "(narrow Python build)");
8256 return -1;
8257 }
8258#endif
8259 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
8261 buf[1] = '\0';
8262 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008263
8264 onError:
8265 PyErr_SetString(PyExc_TypeError,
8266 "%c requires int or char");
8267 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268}
8269
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008270/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8271
8272 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8273 chars are formatted. XXX This is a magic number. Each formatting
8274 routine does bounds checking to ensure no overflow, but a better
8275 solution may be to malloc a buffer of appropriate size for each
8276 format. For now, the current solution is sufficient.
8277*/
/* The expansion is fully parenthesized so the cast expression cannot
   combine with neighbouring operators at the point of use. */
#define FORMATBUFLEN ((size_t)120)
8279
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280PyObject *PyUnicode_Format(PyObject *format,
8281 PyObject *args)
8282{
8283 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008284 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 int args_owned = 0;
8286 PyUnicodeObject *result = NULL;
8287 PyObject *dict = NULL;
8288 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008289
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 if (format == NULL || args == NULL) {
8291 PyErr_BadInternalCall();
8292 return NULL;
8293 }
8294 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008295 if (uformat == NULL)
8296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 fmt = PyUnicode_AS_UNICODE(uformat);
8298 fmtcnt = PyUnicode_GET_SIZE(uformat);
8299
8300 reslen = rescnt = fmtcnt + 100;
8301 result = _PyUnicode_New(reslen);
8302 if (result == NULL)
8303 goto onError;
8304 res = PyUnicode_AS_UNICODE(result);
8305
8306 if (PyTuple_Check(args)) {
8307 arglen = PyTuple_Size(args);
8308 argidx = 0;
8309 }
8310 else {
8311 arglen = -1;
8312 argidx = -2;
8313 }
Christian Heimese93237d2007-12-19 02:37:44 +00008314 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008315 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 dict = args;
8317
8318 while (--fmtcnt >= 0) {
8319 if (*fmt != '%') {
8320 if (--rescnt < 0) {
8321 rescnt = fmtcnt + 100;
8322 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008323 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008324 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8326 --rescnt;
8327 }
8328 *res++ = *fmt++;
8329 }
8330 else {
8331 /* Got a format specifier */
8332 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008333 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335 Py_UNICODE c = '\0';
8336 Py_UNICODE fill;
8337 PyObject *v = NULL;
8338 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008339 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008341 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008342 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343
8344 fmt++;
8345 if (*fmt == '(') {
8346 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008347 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 PyObject *key;
8349 int pcount = 1;
8350
8351 if (dict == NULL) {
8352 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008353 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 goto onError;
8355 }
8356 ++fmt;
8357 --fmtcnt;
8358 keystart = fmt;
8359 /* Skip over balanced parentheses */
8360 while (pcount > 0 && --fmtcnt >= 0) {
8361 if (*fmt == ')')
8362 --pcount;
8363 else if (*fmt == '(')
8364 ++pcount;
8365 fmt++;
8366 }
8367 keylen = fmt - keystart - 1;
8368 if (fmtcnt < 0 || pcount > 0) {
8369 PyErr_SetString(PyExc_ValueError,
8370 "incomplete format key");
8371 goto onError;
8372 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008373#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008374 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 then looked up since Python uses strings to hold
8376 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008377 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 key = PyUnicode_EncodeUTF8(keystart,
8379 keylen,
8380 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008381#else
8382 key = PyUnicode_FromUnicode(keystart, keylen);
8383#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384 if (key == NULL)
8385 goto onError;
8386 if (args_owned) {
8387 Py_DECREF(args);
8388 args_owned = 0;
8389 }
8390 args = PyObject_GetItem(dict, key);
8391 Py_DECREF(key);
8392 if (args == NULL) {
8393 goto onError;
8394 }
8395 args_owned = 1;
8396 arglen = -1;
8397 argidx = -2;
8398 }
8399 while (--fmtcnt >= 0) {
8400 switch (c = *fmt++) {
8401 case '-': flags |= F_LJUST; continue;
8402 case '+': flags |= F_SIGN; continue;
8403 case ' ': flags |= F_BLANK; continue;
8404 case '#': flags |= F_ALT; continue;
8405 case '0': flags |= F_ZERO; continue;
8406 }
8407 break;
8408 }
8409 if (c == '*') {
8410 v = getnextarg(args, arglen, &argidx);
8411 if (v == NULL)
8412 goto onError;
8413 if (!PyInt_Check(v)) {
8414 PyErr_SetString(PyExc_TypeError,
8415 "* wants int");
8416 goto onError;
8417 }
8418 width = PyInt_AsLong(v);
8419 if (width < 0) {
8420 flags |= F_LJUST;
8421 width = -width;
8422 }
8423 if (--fmtcnt >= 0)
8424 c = *fmt++;
8425 }
8426 else if (c >= '0' && c <= '9') {
8427 width = c - '0';
8428 while (--fmtcnt >= 0) {
8429 c = *fmt++;
8430 if (c < '0' || c > '9')
8431 break;
8432 if ((width*10) / 10 != width) {
8433 PyErr_SetString(PyExc_ValueError,
8434 "width too big");
8435 goto onError;
8436 }
8437 width = width*10 + (c - '0');
8438 }
8439 }
8440 if (c == '.') {
8441 prec = 0;
8442 if (--fmtcnt >= 0)
8443 c = *fmt++;
8444 if (c == '*') {
8445 v = getnextarg(args, arglen, &argidx);
8446 if (v == NULL)
8447 goto onError;
8448 if (!PyInt_Check(v)) {
8449 PyErr_SetString(PyExc_TypeError,
8450 "* wants int");
8451 goto onError;
8452 }
8453 prec = PyInt_AsLong(v);
8454 if (prec < 0)
8455 prec = 0;
8456 if (--fmtcnt >= 0)
8457 c = *fmt++;
8458 }
8459 else if (c >= '0' && c <= '9') {
8460 prec = c - '0';
8461 while (--fmtcnt >= 0) {
8462 c = Py_CHARMASK(*fmt++);
8463 if (c < '0' || c > '9')
8464 break;
8465 if ((prec*10) / 10 != prec) {
8466 PyErr_SetString(PyExc_ValueError,
8467 "prec too big");
8468 goto onError;
8469 }
8470 prec = prec*10 + (c - '0');
8471 }
8472 }
8473 } /* prec */
8474 if (fmtcnt >= 0) {
8475 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 if (--fmtcnt >= 0)
8477 c = *fmt++;
8478 }
8479 }
8480 if (fmtcnt < 0) {
8481 PyErr_SetString(PyExc_ValueError,
8482 "incomplete format");
8483 goto onError;
8484 }
8485 if (c != '%') {
8486 v = getnextarg(args, arglen, &argidx);
8487 if (v == NULL)
8488 goto onError;
8489 }
8490 sign = 0;
8491 fill = ' ';
8492 switch (c) {
8493
8494 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008495 pbuf = formatbuf;
8496 /* presume that buffer length is at least 1 */
8497 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 len = 1;
8499 break;
8500
8501 case 's':
8502 case 'r':
8503 if (PyUnicode_Check(v) && c == 's') {
8504 temp = v;
8505 Py_INCREF(temp);
8506 }
8507 else {
8508 PyObject *unicode;
8509 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008510 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 else
8512 temp = PyObject_Repr(v);
8513 if (temp == NULL)
8514 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008515 if (PyUnicode_Check(temp))
8516 /* nothing to do */;
8517 else if (PyString_Check(temp)) {
8518 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008519 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008521 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008523 Py_DECREF(temp);
8524 temp = unicode;
8525 if (temp == NULL)
8526 goto onError;
8527 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008528 else {
8529 Py_DECREF(temp);
8530 PyErr_SetString(PyExc_TypeError,
8531 "%s argument has non-string str()");
8532 goto onError;
8533 }
8534 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008535 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 len = PyUnicode_GET_SIZE(temp);
8537 if (prec >= 0 && len > prec)
8538 len = prec;
8539 break;
8540
8541 case 'i':
8542 case 'd':
8543 case 'u':
8544 case 'o':
8545 case 'x':
8546 case 'X':
8547 if (c == 'i')
8548 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008549 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008550 temp = formatlong(v, flags, prec, c);
8551 if (!temp)
8552 goto onError;
8553 pbuf = PyUnicode_AS_UNICODE(temp);
8554 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008555 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008557 else {
8558 pbuf = formatbuf;
8559 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8560 flags, prec, c, v);
8561 if (len < 0)
8562 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008563 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008564 }
8565 if (flags & F_ZERO)
8566 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 break;
8568
8569 case 'e':
8570 case 'E':
8571 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008572 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 case 'g':
8574 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008575 if (c == 'F')
8576 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008577 pbuf = formatbuf;
8578 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8579 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 if (len < 0)
8581 goto onError;
8582 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008583 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 fill = '0';
8585 break;
8586
8587 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008588 pbuf = formatbuf;
8589 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 if (len < 0)
8591 goto onError;
8592 break;
8593
8594 default:
8595 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008596 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008597 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008598 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008599 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008600 (Py_ssize_t)(fmt - 1 -
8601 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 goto onError;
8603 }
8604 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008605 if (*pbuf == '-' || *pbuf == '+') {
8606 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 len--;
8608 }
8609 else if (flags & F_SIGN)
8610 sign = '+';
8611 else if (flags & F_BLANK)
8612 sign = ' ';
8613 else
8614 sign = 0;
8615 }
8616 if (width < len)
8617 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008618 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 reslen -= rescnt;
8620 rescnt = width + fmtcnt + 100;
8621 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008622 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008623 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008624 PyErr_NoMemory();
8625 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008626 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008627 if (_PyUnicode_Resize(&result, reslen) < 0) {
8628 Py_XDECREF(temp);
8629 goto onError;
8630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 res = PyUnicode_AS_UNICODE(result)
8632 + reslen - rescnt;
8633 }
8634 if (sign) {
8635 if (fill != ' ')
8636 *res++ = sign;
8637 rescnt--;
8638 if (width > len)
8639 width--;
8640 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008641 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8642 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008643 assert(pbuf[1] == c);
8644 if (fill != ' ') {
8645 *res++ = *pbuf++;
8646 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008647 }
Tim Petersfff53252001-04-12 18:38:48 +00008648 rescnt -= 2;
8649 width -= 2;
8650 if (width < 0)
8651 width = 0;
8652 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 if (width > len && !(flags & F_LJUST)) {
8655 do {
8656 --rescnt;
8657 *res++ = fill;
8658 } while (--width > len);
8659 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008660 if (fill == ' ') {
8661 if (sign)
8662 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008663 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008664 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008665 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008666 *res++ = *pbuf++;
8667 *res++ = *pbuf++;
8668 }
8669 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008670 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 res += len;
8672 rescnt -= len;
8673 while (--width >= len) {
8674 --rescnt;
8675 *res++ = ' ';
8676 }
8677 if (dict && (argidx < arglen) && c != '%') {
8678 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008679 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008680 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 goto onError;
8682 }
8683 Py_XDECREF(temp);
8684 } /* '%' */
8685 } /* until end */
8686 if (argidx < arglen && !dict) {
8687 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008688 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 goto onError;
8690 }
8691
Thomas Woutersa96affe2006-03-12 00:29:36 +00008692 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8693 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 if (args_owned) {
8695 Py_DECREF(args);
8696 }
8697 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 return (PyObject *)result;
8699
8700 onError:
8701 Py_XDECREF(result);
8702 Py_DECREF(uformat);
8703 if (args_owned) {
8704 Py_DECREF(args);
8705 }
8706 return NULL;
8707}
8708
8709static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008710 (readbufferproc) unicode_buffer_getreadbuf,
8711 (writebufferproc) unicode_buffer_getwritebuf,
8712 (segcountproc) unicode_buffer_getsegcount,
8713 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714};
8715
Jeremy Hylton938ace62002-07-17 16:30:39 +00008716static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008717unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8718
Tim Peters6d6c1a32001-08-02 04:15:00 +00008719static PyObject *
8720unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8721{
8722 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008723 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008724 char *encoding = NULL;
8725 char *errors = NULL;
8726
Guido van Rossume023fe02001-08-30 03:12:59 +00008727 if (type != &PyUnicode_Type)
8728 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008729 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8730 kwlist, &x, &encoding, &errors))
8731 return NULL;
8732 if (x == NULL)
8733 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008734 if (encoding == NULL && errors == NULL)
8735 return PyObject_Unicode(x);
8736 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008737 return PyUnicode_FromEncodedObject(x, encoding, errors);
8738}
8739
Guido van Rossume023fe02001-08-30 03:12:59 +00008740static PyObject *
8741unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8742{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008743 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008744 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008745
8746 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8747 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8748 if (tmp == NULL)
8749 return NULL;
8750 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008751 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008752 if (pnew == NULL) {
8753 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008754 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008755 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008756 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8757 if (pnew->str == NULL) {
8758 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008759 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008760 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008761 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008762 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008763 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8764 pnew->length = n;
8765 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008766 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008767 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008768}
8769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008770PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008771"unicode(string [, encoding[, errors]]) -> object\n\
8772\n\
8773Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008774encoding defaults to the current default string encoding.\n\
8775errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008776
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008778 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 "unicode", /* tp_name */
8780 sizeof(PyUnicodeObject), /* tp_size */
8781 0, /* tp_itemsize */
8782 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008783 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008785 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008787 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008788 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008789 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008791 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 (hashfunc) unicode_hash, /* tp_hash*/
8793 0, /* tp_call*/
8794 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008795 PyObject_GenericGetAttr, /* tp_getattro */
8796 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008798 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008799 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008800 unicode_doc, /* tp_doc */
8801 0, /* tp_traverse */
8802 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008803 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008804 0, /* tp_weaklistoffset */
8805 0, /* tp_iter */
8806 0, /* tp_iternext */
8807 unicode_methods, /* tp_methods */
8808 0, /* tp_members */
8809 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008810 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008811 0, /* tp_dict */
8812 0, /* tp_descr_get */
8813 0, /* tp_descr_set */
8814 0, /* tp_dictoffset */
8815 0, /* tp_init */
8816 0, /* tp_alloc */
8817 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008818 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819};
8820
8821/* Initialize the Unicode implementation */
8822
Thomas Wouters78890102000-07-22 19:25:51 +00008823void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008825 int i;
8826
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008827 /* XXX - move this array to unicodectype.c ? */
8828 Py_UNICODE linebreak[] = {
8829 0x000A, /* LINE FEED */
8830 0x000D, /* CARRIAGE RETURN */
8831 0x001C, /* FILE SEPARATOR */
8832 0x001D, /* GROUP SEPARATOR */
8833 0x001E, /* RECORD SEPARATOR */
8834 0x0085, /* NEXT LINE */
8835 0x2028, /* LINE SEPARATOR */
8836 0x2029, /* PARAGRAPH SEPARATOR */
8837 };
8838
Fred Drakee4315f52000-05-09 19:53:39 +00008839 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008840 free_list = NULL;
8841 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008843 if (!unicode_empty)
8844 return;
8845
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008846 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008847 for (i = 0; i < 256; i++)
8848 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008849 if (PyType_Ready(&PyUnicode_Type) < 0)
8850 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008851
8852 /* initialize the linebreak bloom filter */
8853 bloom_linebreak = make_bloom_mask(
8854 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8855 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008856
8857 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858}
8859
8860/* Finalize the Unicode implementation */
8861
Christian Heimes3b718a72008-02-14 12:47:33 +00008862int
8863PyUnicode_ClearFreeList(void)
8864{
8865 int freelist_size = numfree;
8866 PyUnicodeObject *u;
8867
8868 for (u = free_list; u != NULL;) {
8869 PyUnicodeObject *v = u;
8870 u = *(PyUnicodeObject **)u;
8871 if (v->str)
8872 PyMem_DEL(v->str);
8873 Py_XDECREF(v->defenc);
8874 PyObject_Del(v);
8875 numfree--;
8876 }
8877 free_list = NULL;
8878 assert(numfree == 0);
8879 return freelist_size;
8880}
8881
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882void
Thomas Wouters78890102000-07-22 19:25:51 +00008883_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008885 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008887 Py_XDECREF(unicode_empty);
8888 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008889
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008890 for (i = 0; i < 256; i++) {
8891 if (unicode_latin1[i]) {
8892 Py_DECREF(unicode_latin1[i]);
8893 unicode_latin1[i] = NULL;
8894 }
8895 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008896 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008898
Anthony Baxterac6bd462006-04-13 02:06:09 +00008899#ifdef __cplusplus
8900}
8901#endif
8902
8903
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008904/*
8905Local variables:
8906c-basic-offset: 4
8907indent-tabs-mode: nil
8908End:
8909*/