blob: 840efb9de3998f3993628e4d9492b33fb62a7754 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "keep alive" optimization.

   An object put on the free list keeps its character buffer allocated
   as long as its length is below this limit; longer buffers are released
   when the object is freed.  This saves malloc() calls for small Unicode
   objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Byte order switches: exactly one of the two macros gets defined.
   Little endian is assumed unless WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 when the 7-bit character c is whitespace.

   Entries set to 1:
     0x09 HORIZONTAL TABULATION   0x0A LINE FEED
     0x0B VERTICAL TABULATION     0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR          0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR        0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Lookup table for fast detection of line breaks: entry c is 1 when the
   7-bit character c is treated as a line break.

   Entries set to 1:
     0x0A LINE FEED         0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR    0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177 return 0x10FFFF;
178#else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* Support for simple "bloom filters" over Unicode characters.  To keep
   things simple, a single bitmask is used: the least significant 5 bits
   of each character select the bit that represents it.  A clear bit means
   "definitely not in the set"; a set bit means "maybe", so a positive
   answer must be confirmed by an exact test. */

/* The linebreak mask is set up by the Unicode initialization function
   (the globals comment above names _PyUnicode_Init()). */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit for character `ch` is set in `mask`.  The shifted
   constant is unsigned long so that bit 31 can be selected without
   overflowing a signed int. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if `ch` is a line break: table lookup for 7-bit characters,
   bloom filter followed by the exact check for everything else. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Non-zero if `chr` is a member of `set`: the cheap bloom filter test runs
   first, and the exact linear search only if the filter says "maybe".
   The expansion is parenthesized so the macro can be negated or combined
   with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
283/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000284 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
288
289*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
302 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000303 if (free_list) {
304 unicode = free_list;
305 free_list = *(PyUnicodeObject **)unicode;
306 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000308 /* Keep-Alive optimization: we only upsize the buffer,
309 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000310 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000311 unicode_resize(unicode, length) < 0) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000312 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000313 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314 }
315 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000316 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000317 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
318 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000319 }
320 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 }
322 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000323 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000324 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 if (unicode == NULL)
326 return NULL;
Neal Norwitz419fd492008-03-17 20:22:43 +0000327 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
328 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 }
330
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000331 if (!unicode->str) {
332 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000333 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000334 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000335 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000336 * the caller fails before initializing str -- unicode_resize()
337 * reads str[0], and the Keep-Alive optimization can keep memory
338 * allocated for str alive across a call to unicode_dealloc(unicode).
339 * We don't want unicode_resize to read uninitialized memory in
340 * that case.
341 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000342 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000344 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000346 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000348
349 onError:
350 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000351 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353}
354
355static
Guido van Rossum9475a232001-10-05 20:51:39 +0000356void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000358 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000359 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000360 /* Keep-Alive optimization */
361 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000362 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 unicode->str = NULL;
364 unicode->length = 0;
365 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000366 if (unicode->defenc) {
367 Py_DECREF(unicode->defenc);
368 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000369 }
370 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000371 *(PyUnicodeObject **)unicode = free_list;
372 free_list = unicode;
373 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 }
375 else {
Neal Norwitz419fd492008-03-17 20:22:43 +0000376 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000377 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000378 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 }
380}
381
Martin v. Löwis18e16552006-02-15 17:27:45 +0000382int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000383{
384 register PyUnicodeObject *v;
385
386 /* Argument checks */
387 if (unicode == NULL) {
388 PyErr_BadInternalCall();
389 return -1;
390 }
391 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000392 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000393 PyErr_BadInternalCall();
394 return -1;
395 }
396
397 /* Resizing unicode_empty and single character objects is not
398 possible since these are being shared. We simply return a fresh
399 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000400 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000401 (v == unicode_empty || v->length == 1)) {
402 PyUnicodeObject *w = _PyUnicode_New(length);
403 if (w == NULL)
404 return -1;
405 Py_UNICODE_COPY(w->str, v->str,
406 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000407 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 *unicode = (PyObject *)w;
409 return 0;
410 }
411
412 /* Note that we don't have to modify *unicode for unshared Unicode
413 objects, since we can modify them in-place. */
414 return unicode_resize(v, length);
415}
416
/* For use inside unicodeobject.c only: PyUnicode_Resize() for a variable
   that is declared as PyUnicodeObject * rather than PyObject *.
   `unicodevar` is the address of that variable. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
420
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000422 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423{
424 PyUnicodeObject *unicode;
425
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000426 /* If the Unicode data is known at construction time, we can apply
427 some optimizations which share commonly used objects. */
428 if (u != NULL) {
429
430 /* Optimization for empty strings */
431 if (size == 0 && unicode_empty != NULL) {
432 Py_INCREF(unicode_empty);
433 return (PyObject *)unicode_empty;
434 }
435
436 /* Single character Unicode objects in the Latin-1 range are
437 shared when using this constructor */
438 if (size == 1 && *u < 256) {
439 unicode = unicode_latin1[*u];
440 if (!unicode) {
441 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 if (!unicode)
443 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000444 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 unicode_latin1[*u] = unicode;
446 }
447 Py_INCREF(unicode);
448 return (PyObject *)unicode;
449 }
450 }
Tim Petersced69f82003-09-16 20:30:58 +0000451
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 unicode = _PyUnicode_New(size);
453 if (!unicode)
454 return NULL;
455
456 /* Copy the Unicode data into the new object */
457 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000458 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459
460 return (PyObject *)unicode;
461}
462
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000463PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
464{
465 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000466
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000467 if (size < 0) {
468 PyErr_SetString(PyExc_SystemError,
469 "Negative size passed to PyUnicode_FromStringAndSize");
470 return NULL;
471 }
472
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000473 /* If the Unicode data is known at construction time, we can apply
474 some optimizations which share commonly used objects.
475 Also, this means the input must be UTF-8, so fall back to the
476 UTF-8 decoder at the end. */
477 if (u != NULL) {
478
479 /* Optimization for empty strings */
480 if (size == 0 && unicode_empty != NULL) {
481 Py_INCREF(unicode_empty);
482 return (PyObject *)unicode_empty;
483 }
484
485 /* Single characters are shared when using this constructor.
486 Restrict to ASCII, since the input must be UTF-8. */
487 if (size == 1 && Py_CHARMASK(*u) < 128) {
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000488 unicode = unicode_latin1[Py_CHARMASK(*u)];
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000489 if (!unicode) {
490 unicode = _PyUnicode_New(1);
491 if (!unicode)
492 return NULL;
493 unicode->str[0] = Py_CHARMASK(*u);
Neal Norwitzd183bdd2008-03-28 04:58:51 +0000494 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000495 }
496 Py_INCREF(unicode);
497 return (PyObject *)unicode;
498 }
499
500 return PyUnicode_DecodeUTF8(u, size, NULL);
501 }
502
503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 return (PyObject *)unicode;
508}
509
510PyObject *PyUnicode_FromString(const char *u)
511{
512 size_t size = strlen(u);
513 if (size > PY_SSIZE_T_MAX) {
514 PyErr_SetString(PyExc_OverflowError, "input too long");
515 return NULL;
516 }
517
518 return PyUnicode_FromStringAndSize(u, size);
519}
520
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521#ifdef HAVE_WCHAR_H
522
523PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000524 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000525{
526 PyUnicodeObject *unicode;
527
528 if (w == NULL) {
529 PyErr_BadInternalCall();
530 return NULL;
531 }
532
533 unicode = _PyUnicode_New(size);
534 if (!unicode)
535 return NULL;
536
537 /* Copy the wchar_t data into the new object */
538#ifdef HAVE_USABLE_WCHAR_T
539 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000540#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541 {
542 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000543 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000545 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000546 *u++ = *w++;
547 }
548#endif
549
550 return (PyObject *)unicode;
551}
552
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000553static void
554makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
555{
556 *fmt++ = '%';
557 if (width) {
558 if (zeropad)
559 *fmt++ = '0';
560 fmt += sprintf(fmt, "%d", width);
561 }
562 if (precision)
563 fmt += sprintf(fmt, ".%d", precision);
564 if (longflag)
565 *fmt++ = 'l';
566 else if (size_tflag) {
567 char *f = PY_FORMAT_SIZE_T;
568 while (*f)
569 *fmt++ = *f++;
570 }
571 *fmt++ = c;
572 *fmt = '\0';
573}
574
/* Append the NUL-terminated char string `string` to the output, one
   character per output element.  Relies on two variables in the calling
   scope: `copy` (scratch pointer into the string) and `s` (output cursor,
   left pointing just past the last character written). */
#define appendstring(string) \
    {for (copy = (string); *copy != '\0';) *s++ = *copy++;}
576
577PyObject *
578PyUnicode_FromFormatV(const char *format, va_list vargs)
579{
580 va_list count;
581 Py_ssize_t callcount = 0;
582 PyObject **callresults = NULL;
583 PyObject **callresult = NULL;
584 Py_ssize_t n = 0;
585 int width = 0;
586 int precision = 0;
587 int zeropad;
588 const char* f;
589 Py_UNICODE *s;
590 PyObject *string;
591 /* used by sprintf */
592 char buffer[21];
593 /* use abuffer instead of buffer, if we need more space
594 * (which can happen if there's a format specifier with width). */
595 char *abuffer = NULL;
596 char *realbuffer;
597 Py_ssize_t abuffersize = 0;
598 char fmt[60]; /* should be enough for %0width.precisionld */
599 const char *copy;
600
601#ifdef VA_LIST_IS_ARRAY
602 Py_MEMCPY(count, vargs, sizeof(va_list));
603#else
604#ifdef __va_copy
605 __va_copy(count, vargs);
606#else
607 count = vargs;
608#endif
609#endif
610 /* step 1: count the number of %S/%R format specifications
611 * (we call PyObject_Str()/PyObject_Repr() for these objects
612 * once during step 3 and put the result in an array) */
613 for (f = format; *f; f++) {
614 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
615 ++callcount;
616 }
617 /* step 2: allocate memory for the results of
618 * PyObject_Str()/PyObject_Repr() calls */
619 if (callcount) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000620 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000621 if (!callresults) {
622 PyErr_NoMemory();
623 return NULL;
624 }
625 callresult = callresults;
626 }
627 /* step 3: figure out how large a buffer we need */
628 for (f = format; *f; f++) {
629 if (*f == '%') {
630 const char* p = f;
631 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000632 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000633 width = (width*10) + *f++ - '0';
Neal Norwitzade57d02008-03-23 06:19:57 +0000634 while (*++f && *f != '%' && !isalpha((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000635 ;
636
637 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
638 * they don't affect the amount of space we reserve.
639 */
640 if ((*f == 'l' || *f == 'z') &&
641 (f[1] == 'd' || f[1] == 'u'))
642 ++f;
643
644 switch (*f) {
645 case 'c':
646 (void)va_arg(count, int);
647 /* fall through... */
648 case '%':
649 n++;
650 break;
651 case 'd': case 'u': case 'i': case 'x':
652 (void) va_arg(count, int);
653 /* 20 bytes is enough to hold a 64-bit
654 integer. Decimal takes the most space.
655 This isn't enough for octal.
656 If a width is specified we need more
657 (which we allocate later). */
658 if (width < 20)
659 width = 20;
660 n += width;
661 if (abuffersize < width)
662 abuffersize = width;
663 break;
664 case 's':
665 {
666 /* UTF-8 */
667 unsigned char*s;
668 s = va_arg(count, unsigned char*);
669 while (*s) {
670 if (*s < 128) {
671 n++; s++;
672 } else if (*s < 0xc0) {
673 /* invalid UTF-8 */
674 n++; s++;
675 } else if (*s < 0xc0) {
676 n++;
677 s++; if(!*s)break;
678 s++;
679 } else if (*s < 0xe0) {
680 n++;
681 s++; if(!*s)break;
682 s++; if(!*s)break;
683 s++;
684 } else {
685 #ifdef Py_UNICODE_WIDE
686 n++;
687 #else
688 n+=2;
689 #endif
690 s++; if(!*s)break;
691 s++; if(!*s)break;
692 s++; if(!*s)break;
693 s++;
694 }
695 }
696 break;
697 }
698 case 'U':
699 {
700 PyObject *obj = va_arg(count, PyObject *);
701 assert(obj && PyUnicode_Check(obj));
702 n += PyUnicode_GET_SIZE(obj);
703 break;
704 }
705 case 'V':
706 {
707 PyObject *obj = va_arg(count, PyObject *);
708 const char *str = va_arg(count, const char *);
709 assert(obj || str);
710 assert(!obj || PyUnicode_Check(obj));
711 if (obj)
712 n += PyUnicode_GET_SIZE(obj);
713 else
714 n += strlen(str);
715 break;
716 }
717 case 'S':
718 {
719 PyObject *obj = va_arg(count, PyObject *);
720 PyObject *str;
721 assert(obj);
722 str = PyObject_Str(obj);
723 if (!str)
724 goto fail;
725 n += PyUnicode_GET_SIZE(str);
726 /* Remember the str and switch to the next slot */
727 *callresult++ = str;
728 break;
729 }
730 case 'R':
731 {
732 PyObject *obj = va_arg(count, PyObject *);
733 PyObject *repr;
734 assert(obj);
735 repr = PyObject_Repr(obj);
736 if (!repr)
737 goto fail;
738 n += PyUnicode_GET_SIZE(repr);
739 /* Remember the repr and switch to the next slot */
740 *callresult++ = repr;
741 break;
742 }
743 case 'p':
744 (void) va_arg(count, int);
745 /* maximum 64-bit pointer representation:
746 * 0xffffffffffffffff
747 * so 19 characters is enough.
748 * XXX I count 18 -- what's the extra for?
749 */
750 n += 19;
751 break;
752 default:
753 /* if we stumble upon an unknown
754 formatting code, copy the rest of
755 the format string to the output
756 string. (we cannot just skip the
757 code, since there's no way to know
758 what's in the argument list) */
759 n += strlen(p);
760 goto expand;
761 }
762 } else
763 n++;
764 }
765 expand:
766 if (abuffersize > 20) {
Neal Norwitz419fd492008-03-17 20:22:43 +0000767 abuffer = PyObject_Malloc(abuffersize);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000768 if (!abuffer) {
769 PyErr_NoMemory();
770 goto fail;
771 }
772 realbuffer = abuffer;
773 }
774 else
775 realbuffer = buffer;
776 /* step 4: fill the buffer */
777 /* Since we've analyzed how much space we need for the worst case,
778 we don't have to resize the string.
779 There can be no errors beyond this point. */
780 string = PyUnicode_FromUnicode(NULL, n);
781 if (!string)
782 goto fail;
783
784 s = PyUnicode_AS_UNICODE(string);
785 callresult = callresults;
786
787 for (f = format; *f; f++) {
788 if (*f == '%') {
789 const char* p = f++;
790 int longflag = 0;
791 int size_tflag = 0;
792 zeropad = (*f == '0');
793 /* parse the width.precision part */
794 width = 0;
Neal Norwitzade57d02008-03-23 06:19:57 +0000795 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000796 width = (width*10) + *f++ - '0';
797 precision = 0;
798 if (*f == '.') {
799 f++;
Neal Norwitzade57d02008-03-23 06:19:57 +0000800 while (isdigit((unsigned)*f))
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000801 precision = (precision*10) + *f++ - '0';
802 }
803 /* handle the long flag, but only for %ld and %lu.
804 others can be added when necessary. */
805 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
806 longflag = 1;
807 ++f;
808 }
809 /* handle the size_t flag. */
810 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
811 size_tflag = 1;
812 ++f;
813 }
814
815 switch (*f) {
816 case 'c':
817 *s++ = va_arg(vargs, int);
818 break;
819 case 'd':
820 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
821 if (longflag)
822 sprintf(realbuffer, fmt, va_arg(vargs, long));
823 else if (size_tflag)
824 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
825 else
826 sprintf(realbuffer, fmt, va_arg(vargs, int));
827 appendstring(realbuffer);
828 break;
829 case 'u':
830 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
831 if (longflag)
832 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
833 else if (size_tflag)
834 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
835 else
836 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
837 appendstring(realbuffer);
838 break;
839 case 'i':
840 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
841 sprintf(realbuffer, fmt, va_arg(vargs, int));
842 appendstring(realbuffer);
843 break;
844 case 'x':
845 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
846 sprintf(realbuffer, fmt, va_arg(vargs, int));
847 appendstring(realbuffer);
848 break;
849 case 's':
850 {
851 /* Parameter must be UTF-8 encoded.
852 In case of encoding errors, use
853 the replacement character. */
854 PyObject *u;
855 p = va_arg(vargs, char*);
856 u = PyUnicode_DecodeUTF8(p, strlen(p),
857 "replace");
858 if (!u)
859 goto fail;
860 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
861 PyUnicode_GET_SIZE(u));
862 s += PyUnicode_GET_SIZE(u);
863 Py_DECREF(u);
864 break;
865 }
866 case 'U':
867 {
868 PyObject *obj = va_arg(vargs, PyObject *);
869 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
870 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
871 s += size;
872 break;
873 }
874 case 'V':
875 {
876 PyObject *obj = va_arg(vargs, PyObject *);
877 const char *str = va_arg(vargs, const char *);
878 if (obj) {
879 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
880 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
881 s += size;
882 } else {
883 appendstring(str);
884 }
885 break;
886 }
887 case 'S':
888 case 'R':
889 {
890 Py_UNICODE *ucopy;
891 Py_ssize_t usize;
892 Py_ssize_t upos;
893 /* unused, since we already have the result */
894 (void) va_arg(vargs, PyObject *);
895 ucopy = PyUnicode_AS_UNICODE(*callresult);
896 usize = PyUnicode_GET_SIZE(*callresult);
897 for (upos = 0; upos<usize;)
898 *s++ = ucopy[upos++];
899 /* We're done with the unicode()/repr() => forget it */
900 Py_DECREF(*callresult);
901 /* switch to next unicode()/repr() result */
902 ++callresult;
903 break;
904 }
905 case 'p':
906 sprintf(buffer, "%p", va_arg(vargs, void*));
907 /* %p is ill-defined: ensure leading 0x. */
908 if (buffer[1] == 'X')
909 buffer[1] = 'x';
910 else if (buffer[1] != 'x') {
911 memmove(buffer+2, buffer, strlen(buffer)+1);
912 buffer[0] = '0';
913 buffer[1] = 'x';
914 }
915 appendstring(buffer);
916 break;
917 case '%':
918 *s++ = '%';
919 break;
920 default:
921 appendstring(p);
922 goto end;
923 }
924 } else
925 *s++ = *f;
926 }
927
928 end:
929 if (callresults)
Neal Norwitz419fd492008-03-17 20:22:43 +0000930 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000931 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000932 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000933 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
934 return string;
935 fail:
936 if (callresults) {
937 PyObject **callresult2 = callresults;
938 while (callresult2 < callresult) {
939 Py_DECREF(*callresult2);
940 ++callresult2;
941 }
Neal Norwitz419fd492008-03-17 20:22:43 +0000942 PyObject_Free(callresults);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000943 }
944 if (abuffer)
Neal Norwitz419fd492008-03-17 20:22:43 +0000945 PyObject_Free(abuffer);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000946 return NULL;
947}
948
949#undef appendstring
950
951PyObject *
952PyUnicode_FromFormat(const char *format, ...)
953{
954 PyObject* ret;
955 va_list vargs;
956
957#ifdef HAVE_STDARG_PROTOTYPES
958 va_start(vargs, format);
959#else
960 va_start(vargs);
961#endif
962 ret = PyUnicode_FromFormatV(format, vargs);
963 va_end(vargs);
964 return ret;
965}
966
Martin v. Löwis18e16552006-02-15 17:27:45 +0000967Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
968 wchar_t *w,
969 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000970{
971 if (unicode == NULL) {
972 PyErr_BadInternalCall();
973 return -1;
974 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000975
976 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000977 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000978 size = PyUnicode_GET_SIZE(unicode) + 1;
979
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980#ifdef HAVE_USABLE_WCHAR_T
981 memcpy(w, unicode->str, size * sizeof(wchar_t));
982#else
983 {
984 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000985 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000987 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000988 *w++ = *u++;
989 }
990#endif
991
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000992 if (size > PyUnicode_GET_SIZE(unicode))
993 return PyUnicode_GET_SIZE(unicode);
994 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000995 return size;
996}
997
998#endif
999
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001000PyObject *PyUnicode_FromOrdinal(int ordinal)
1001{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001002 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001003
1004#ifdef Py_UNICODE_WIDE
1005 if (ordinal < 0 || ordinal > 0x10ffff) {
1006 PyErr_SetString(PyExc_ValueError,
1007 "unichr() arg not in range(0x110000) "
1008 "(wide Python build)");
1009 return NULL;
1010 }
1011#else
1012 if (ordinal < 0 || ordinal > 0xffff) {
1013 PyErr_SetString(PyExc_ValueError,
1014 "unichr() arg not in range(0x10000) "
1015 "(narrow Python build)");
1016 return NULL;
1017 }
1018#endif
1019
Hye-Shik Chang40574832004-04-06 07:24:51 +00001020 s[0] = (Py_UNICODE)ordinal;
1021 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001022}
1023
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024PyObject *PyUnicode_FromObject(register PyObject *obj)
1025{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001026 /* XXX Perhaps we should make this API an alias of
1027 PyObject_Unicode() instead ?! */
1028 if (PyUnicode_CheckExact(obj)) {
1029 Py_INCREF(obj);
1030 return obj;
1031 }
1032 if (PyUnicode_Check(obj)) {
1033 /* For a Unicode subtype that's not a Unicode object,
1034 return a true Unicode object with the same data. */
1035 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1036 PyUnicode_GET_SIZE(obj));
1037 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001038 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1039}
1040
1041PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1042 const char *encoding,
1043 const char *errors)
1044{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001045 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001046 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001047 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001048
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 if (obj == NULL) {
1050 PyErr_BadInternalCall();
1051 return NULL;
1052 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001053
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001054#if 0
1055 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001056 that no encodings is given and then redirect to
1057 PyObject_Unicode() which then applies the additional logic for
1058 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001059
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001060 NOTE: This API should really only be used for object which
1061 represent *encoded* Unicode !
1062
1063 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001064 if (PyUnicode_Check(obj)) {
1065 if (encoding) {
1066 PyErr_SetString(PyExc_TypeError,
1067 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001068 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001069 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001070 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001071 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001072#else
1073 if (PyUnicode_Check(obj)) {
1074 PyErr_SetString(PyExc_TypeError,
1075 "decoding Unicode is not supported");
1076 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001077 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001078#endif
1079
1080 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001081 if (PyString_Check(obj)) {
1082 s = PyString_AS_STRING(obj);
1083 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001084 }
Christian Heimes3497f942008-05-26 12:29:14 +00001085 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001086 /* Python 2.x specific */
1087 PyErr_Format(PyExc_TypeError,
1088 "decoding bytearray is not supported");
1089 return NULL;
1090 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001091 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1092 /* Overwrite the error message with something more useful in
1093 case of a TypeError. */
1094 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001095 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001096 "coercing to Unicode: need string or buffer, "
1097 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001098 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001099 goto onError;
1100 }
Tim Petersced69f82003-09-16 20:30:58 +00001101
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 if (len == 0) {
1104 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001105 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106 }
Tim Petersced69f82003-09-16 20:30:58 +00001107 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001108 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001109
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001110 return v;
1111
1112 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114}
1115
1116PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001117 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 const char *encoding,
1119 const char *errors)
1120{
1121 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001122
1123 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001124 encoding = PyUnicode_GetDefaultEncoding();
1125
1126 /* Shortcuts for common default encodings */
1127 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001129 else if (strcmp(encoding, "latin-1") == 0)
1130 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001131#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1132 else if (strcmp(encoding, "mbcs") == 0)
1133 return PyUnicode_DecodeMBCS(s, size, errors);
1134#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001135 else if (strcmp(encoding, "ascii") == 0)
1136 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137
1138 /* Decode via the codec registry */
1139 buffer = PyBuffer_FromMemory((void *)s, size);
1140 if (buffer == NULL)
1141 goto onError;
1142 unicode = PyCodec_Decode(buffer, encoding, errors);
1143 if (unicode == NULL)
1144 goto onError;
1145 if (!PyUnicode_Check(unicode)) {
1146 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001147 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001148 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 Py_DECREF(unicode);
1150 goto onError;
1151 }
1152 Py_DECREF(buffer);
1153 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001154
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155 onError:
1156 Py_XDECREF(buffer);
1157 return NULL;
1158}
1159
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001160PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1161 const char *encoding,
1162 const char *errors)
1163{
1164 PyObject *v;
1165
1166 if (!PyUnicode_Check(unicode)) {
1167 PyErr_BadArgument();
1168 goto onError;
1169 }
1170
1171 if (encoding == NULL)
1172 encoding = PyUnicode_GetDefaultEncoding();
1173
1174 /* Decode via the codec registry */
1175 v = PyCodec_Decode(unicode, encoding, errors);
1176 if (v == NULL)
1177 goto onError;
1178 return v;
1179
1180 onError:
1181 return NULL;
1182}
1183
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001185 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186 const char *encoding,
1187 const char *errors)
1188{
1189 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001190
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 unicode = PyUnicode_FromUnicode(s, size);
1192 if (unicode == NULL)
1193 return NULL;
1194 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1195 Py_DECREF(unicode);
1196 return v;
1197}
1198
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001199PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1200 const char *encoding,
1201 const char *errors)
1202{
1203 PyObject *v;
1204
1205 if (!PyUnicode_Check(unicode)) {
1206 PyErr_BadArgument();
1207 goto onError;
1208 }
1209
1210 if (encoding == NULL)
1211 encoding = PyUnicode_GetDefaultEncoding();
1212
1213 /* Encode via the codec registry */
1214 v = PyCodec_Encode(unicode, encoding, errors);
1215 if (v == NULL)
1216 goto onError;
1217 return v;
1218
1219 onError:
1220 return NULL;
1221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1224 const char *encoding,
1225 const char *errors)
1226{
1227 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001228
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 if (!PyUnicode_Check(unicode)) {
1230 PyErr_BadArgument();
1231 goto onError;
1232 }
Fred Drakee4315f52000-05-09 19:53:39 +00001233
Tim Petersced69f82003-09-16 20:30:58 +00001234 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001235 encoding = PyUnicode_GetDefaultEncoding();
1236
1237 /* Shortcuts for common default encodings */
1238 if (errors == NULL) {
1239 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001240 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001241 else if (strcmp(encoding, "latin-1") == 0)
1242 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001243#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1244 else if (strcmp(encoding, "mbcs") == 0)
1245 return PyUnicode_AsMBCSString(unicode);
1246#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001247 else if (strcmp(encoding, "ascii") == 0)
1248 return PyUnicode_AsASCIIString(unicode);
1249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250
1251 /* Encode via the codec registry */
1252 v = PyCodec_Encode(unicode, encoding, errors);
1253 if (v == NULL)
1254 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001255 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001257 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001258 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 Py_DECREF(v);
1260 goto onError;
1261 }
1262 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001263
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 onError:
1265 return NULL;
1266}
1267
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001268PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1269 const char *errors)
1270{
1271 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1272
1273 if (v)
1274 return v;
1275 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1276 if (v && errors == NULL)
1277 ((PyUnicodeObject *)unicode)->defenc = v;
1278 return v;
1279}
1280
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1282{
1283 if (!PyUnicode_Check(unicode)) {
1284 PyErr_BadArgument();
1285 goto onError;
1286 }
1287 return PyUnicode_AS_UNICODE(unicode);
1288
1289 onError:
1290 return NULL;
1291}
1292
Martin v. Löwis18e16552006-02-15 17:27:45 +00001293Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294{
1295 if (!PyUnicode_Check(unicode)) {
1296 PyErr_BadArgument();
1297 goto onError;
1298 }
1299 return PyUnicode_GET_SIZE(unicode);
1300
1301 onError:
1302 return -1;
1303}
1304
Thomas Wouters78890102000-07-22 19:25:51 +00001305const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001306{
1307 return unicode_default_encoding;
1308}
1309
1310int PyUnicode_SetDefaultEncoding(const char *encoding)
1311{
1312 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001313
Fred Drakee4315f52000-05-09 19:53:39 +00001314 /* Make sure the encoding is valid. As side effect, this also
1315 loads the encoding into the codec registry cache. */
1316 v = _PyCodec_Lookup(encoding);
1317 if (v == NULL)
1318 goto onError;
1319 Py_DECREF(v);
1320 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001321 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001322 sizeof(unicode_default_encoding));
1323 return 0;
1324
1325 onError:
1326 return -1;
1327}
1328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329/* error handling callback helper:
1330 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001331 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 and adjust various state variables.
1333 return 0 on success, -1 on error
1334*/
1335
1336static
1337int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1338 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001339 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1340 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001341 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001342{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001343 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344
1345 PyObject *restuple = NULL;
1346 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001347 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1348 Py_ssize_t requiredsize;
1349 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001350 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001351 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001352 int res = -1;
1353
1354 if (*errorHandler == NULL) {
1355 *errorHandler = PyCodec_LookupError(errors);
1356 if (*errorHandler == NULL)
1357 goto onError;
1358 }
1359
1360 if (*exceptionObject == NULL) {
1361 *exceptionObject = PyUnicodeDecodeError_Create(
1362 encoding, input, insize, *startinpos, *endinpos, reason);
1363 if (*exceptionObject == NULL)
1364 goto onError;
1365 }
1366 else {
1367 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1368 goto onError;
1369 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1370 goto onError;
1371 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1372 goto onError;
1373 }
1374
1375 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1376 if (restuple == NULL)
1377 goto onError;
1378 if (!PyTuple_Check(restuple)) {
1379 PyErr_Format(PyExc_TypeError, &argparse[4]);
1380 goto onError;
1381 }
1382 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1383 goto onError;
1384 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001385 newpos = insize+newpos;
1386 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001387 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001388 goto onError;
1389 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390
1391 /* need more space? (at least enough for what we
1392 have+the replacement+the rest of the string (starting
1393 at the new input position), so we won't have to check space
1394 when there are no errors in the rest of the string) */
1395 repptr = PyUnicode_AS_UNICODE(repunicode);
1396 repsize = PyUnicode_GET_SIZE(repunicode);
1397 requiredsize = *outpos + repsize + insize-newpos;
1398 if (requiredsize > outsize) {
1399 if (requiredsize<2*outsize)
1400 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001401 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001402 goto onError;
1403 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1404 }
1405 *endinpos = newpos;
1406 *inptr = input + newpos;
1407 Py_UNICODE_COPY(*outptr, repptr, repsize);
1408 *outptr += repsize;
1409 *outpos += repsize;
1410 /* we made it! */
1411 res = 0;
1412
1413 onError:
1414 Py_XDECREF(restuple);
1415 return res;
1416}
1417
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001418/* --- UTF-7 Codec -------------------------------------------------------- */
1419
1420/* see RFC2152 for details */
1421
/* Classification of the 128 ASCII code points for the UTF-7 codec:
   indicates whether a character is special, i.e. cannot be directly
   encoded.

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is indexed by character code; one row per 16 code points. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */  2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1440
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true if c must not appear directly in
   UTF-7 output/input: anything outside 1..127, anything classified 1 in
   utf7_special, whitespace (class 2) when encodeWS is set, and Set O
   characters (class 3) when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c is a base64 digit.  isalnum() is only defined
   for values representable as unsigned char (or EOF), but callers pass
   Py_UNICODE values, so the call is restricted to the ASCII range;
   anything outside 1..127 is never a base64 digit. */
#define B64CHAR(c) \
    (((c) > 0 && (c) < 128 && isalnum(c)) || (c) == '+' || (c) == '/')
/* UB64(c): 6-bit value of base64 digit c (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit
   the top 6 as a base64 digit through out and reduce bits by 6.
   Expands to a bare while statement; out and bits are modified. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, take the top 16 as one code unit and store it through out.
   Expands to a bare while statement and relies on the expansion site
   providing a variable `errmsg` and a label `utf7Error`. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001483
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001484PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001485 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001486 const char *errors)
1487{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001488 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1489}
1490
1491PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1492 Py_ssize_t size,
1493 const char *errors,
1494 Py_ssize_t *consumed)
1495{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001497 Py_ssize_t startinpos;
1498 Py_ssize_t endinpos;
1499 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001500 const char *e;
1501 PyUnicodeObject *unicode;
1502 Py_UNICODE *p;
1503 const char *errmsg = "";
1504 int inShift = 0;
1505 unsigned int bitsleft = 0;
1506 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 int surrogate = 0;
1508 PyObject *errorHandler = NULL;
1509 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001510
1511 unicode = _PyUnicode_New(size);
1512 if (!unicode)
1513 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001514 if (size == 0) {
1515 if (consumed)
1516 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001517 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001518 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519
1520 p = unicode->str;
1521 e = s + size;
1522
1523 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 Py_UNICODE ch;
1525 restart:
1526 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001527
1528 if (inShift) {
1529 if ((ch == '-') || !B64CHAR(ch)) {
1530 inShift = 0;
1531 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001532
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001533 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1534 if (bitsleft >= 6) {
1535 /* The shift sequence has a partial character in it. If
1536 bitsleft < 6 then we could just classify it as padding
1537 but that is not the case here */
1538
1539 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001540 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001541 }
1542 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001543 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544 here so indicate the potential of a misencoded character. */
1545
1546 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1547 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1548 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001549 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 }
1551
1552 if (ch == '-') {
1553 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001554 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001555 inShift = 1;
1556 }
1557 } else if (SPECIAL(ch,0,0)) {
1558 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001559 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560 } else {
1561 *p++ = ch;
1562 }
1563 } else {
1564 charsleft = (charsleft << 6) | UB64(ch);
1565 bitsleft += 6;
1566 s++;
1567 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1568 }
1569 }
1570 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001571 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 s++;
1573 if (s < e && *s == '-') {
1574 s++;
1575 *p++ = '+';
1576 } else
1577 {
1578 inShift = 1;
1579 bitsleft = 0;
1580 }
1581 }
1582 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001583 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584 errmsg = "unexpected special character";
1585 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001586 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 }
1588 else {
1589 *p++ = ch;
1590 s++;
1591 }
1592 continue;
1593 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001594 outpos = p-PyUnicode_AS_UNICODE(unicode);
1595 endinpos = s-starts;
1596 if (unicode_decode_call_errorhandler(
1597 errors, &errorHandler,
1598 "utf7", errmsg,
1599 starts, size, &startinpos, &endinpos, &exc, &s,
1600 (PyObject **)&unicode, &outpos, &p))
1601 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001602 }
1603
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001604 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 outpos = p-PyUnicode_AS_UNICODE(unicode);
1606 endinpos = size;
1607 if (unicode_decode_call_errorhandler(
1608 errors, &errorHandler,
1609 "utf7", "unterminated shift sequence",
1610 starts, size, &startinpos, &endinpos, &exc, &s,
1611 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001612 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 if (s < e)
1614 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001616 if (consumed) {
1617 if(inShift)
1618 *consumed = startinpos;
1619 else
1620 *consumed = s-starts;
1621 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001623 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624 goto onError;
1625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001628 return (PyObject *)unicode;
1629
1630onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001631 Py_XDECREF(errorHandler);
1632 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001633 Py_DECREF(unicode);
1634 return NULL;
1635}
1636
1637
1638PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001639 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 int encodeSetO,
1641 int encodeWhiteSpace,
1642 const char *errors)
1643{
1644 PyObject *v;
1645 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001646 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001648 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 unsigned int bitsleft = 0;
1650 unsigned long charsleft = 0;
1651 char * out;
1652 char * start;
1653
1654 if (size == 0)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001655 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001657 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 if (v == NULL)
1659 return NULL;
1660
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001661 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 for (;i < size; ++i) {
1663 Py_UNICODE ch = s[i];
1664
1665 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001666 if (ch == '+') {
1667 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668 *out++ = '-';
1669 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1670 charsleft = ch;
1671 bitsleft = 16;
1672 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001673 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001674 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001675 } else {
1676 *out++ = (char) ch;
1677 }
1678 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1680 *out++ = B64(charsleft << (6-bitsleft));
1681 charsleft = 0;
1682 bitsleft = 0;
1683 /* Characters not in the BASE64 set implicitly unshift the sequence
1684 so no '-' is required, except if the character is itself a '-' */
1685 if (B64CHAR(ch) || ch == '-') {
1686 *out++ = '-';
1687 }
1688 inShift = 0;
1689 *out++ = (char) ch;
1690 } else {
1691 bitsleft += 16;
1692 charsleft = (charsleft << 16) | ch;
1693 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1694
1695 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001696 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001697 or '-' then the shift sequence will be terminated implicitly and we
1698 don't have to insert a '-'. */
1699
1700 if (bitsleft == 0) {
1701 if (i + 1 < size) {
1702 Py_UNICODE ch2 = s[i+1];
1703
1704 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001705
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001706 } else if (B64CHAR(ch2) || ch2 == '-') {
1707 *out++ = '-';
1708 inShift = 0;
1709 } else {
1710 inShift = 0;
1711 }
1712
1713 }
1714 else {
1715 *out++ = '-';
1716 inShift = 0;
1717 }
1718 }
Tim Petersced69f82003-09-16 20:30:58 +00001719 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001720 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001721 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001722 if (bitsleft) {
1723 *out++= B64(charsleft << (6-bitsleft) );
1724 *out++ = '-';
1725 }
1726
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001727 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001728 return v;
1729}
1730
/* Remove the helper macros of the UTF-7 codec above.  SPECIAL, B64,
   B64CHAR and ENCODE are used by PyUnicode_EncodeUTF7; UB64 and DECODE
   are presumably used by the UTF-7 decoder -- TODO confirm. */
1731#undef SPECIAL
1732#undef B64
1733#undef B64CHAR
1734#undef UB64
1735#undef ENCODE
1736#undef DECODE
1737
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738/* --- UTF-8 Codec -------------------------------------------------------- */
1739
/* Sequence length implied by the first byte of a UTF-8 sequence, indexed
   by that byte's value.  A zero entry marks a byte that may not start a
   sequence.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: not valid as a first byte */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
1761
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001763 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 const char *errors)
1765{
Walter Dörwald69652032004-09-07 20:24:22 +00001766 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1767}
1768
1769PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001770 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001771 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001773{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001774 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 const char *e;
1780 PyUnicodeObject *unicode;
1781 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001782 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783 PyObject *errorHandler = NULL;
1784 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785
1786 /* Note: size will always be longer than the resulting Unicode
1787 character count */
1788 unicode = _PyUnicode_New(size);
1789 if (!unicode)
1790 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001791 if (size == 0) {
1792 if (consumed)
1793 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796
1797 /* Unpack UTF-8 encoded data */
1798 p = unicode->str;
1799 e = s + size;
1800
1801 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001802 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
1804 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001805 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 s++;
1807 continue;
1808 }
1809
1810 n = utf8_code_length[ch];
1811
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001812 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001813 if (consumed)
1814 break;
1815 else {
1816 errmsg = "unexpected end of data";
1817 startinpos = s-starts;
1818 endinpos = size;
1819 goto utf8Error;
1820 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822
1823 switch (n) {
1824
1825 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001826 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 startinpos = s-starts;
1828 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830
1831 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001832 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 startinpos = s-starts;
1834 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001835 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836
1837 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001838 if ((s[1] & 0xc0) != 0x80) {
1839 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001840 startinpos = s-starts;
1841 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001842 goto utf8Error;
1843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001845 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001846 startinpos = s-starts;
1847 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 errmsg = "illegal encoding";
1849 goto utf8Error;
1850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 break;
1854
1855 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001856 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 (s[2] & 0xc0) != 0x80) {
1858 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001859 startinpos = s-starts;
1860 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001861 goto utf8Error;
1862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001864 if (ch < 0x0800) {
1865 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001866 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001867
1868 XXX For wide builds (UCS-4) we should probably try
1869 to recombine the surrogates into a single code
1870 unit.
1871 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001872 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 startinpos = s-starts;
1874 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001875 goto utf8Error;
1876 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001878 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001879 break;
1880
1881 case 4:
1882 if ((s[1] & 0xc0) != 0x80 ||
1883 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001884 (s[3] & 0xc0) != 0x80) {
1885 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886 startinpos = s-starts;
1887 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001888 goto utf8Error;
1889 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001890 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1891 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1892 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001893 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001894 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001895 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001896 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001897 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001898 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001899 startinpos = s-starts;
1900 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001901 goto utf8Error;
1902 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001903#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001904 *p++ = (Py_UNICODE)ch;
1905#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001906 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001907
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001908 /* translate from 10000..10FFFF to 0..FFFF */
1909 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001910
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001911 /* high surrogate = top 10 bits added to D800 */
1912 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001913
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001914 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001915 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001916#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 break;
1918
1919 default:
1920 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001921 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 startinpos = s-starts;
1923 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001924 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 }
1926 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001927 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001928
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001929 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001930 outpos = p-PyUnicode_AS_UNICODE(unicode);
1931 if (unicode_decode_call_errorhandler(
1932 errors, &errorHandler,
1933 "utf8", errmsg,
1934 starts, size, &startinpos, &endinpos, &exc, &s,
1935 (PyObject **)&unicode, &outpos, &p))
1936 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 }
Walter Dörwald69652032004-09-07 20:24:22 +00001938 if (consumed)
1939 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940
1941 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001942 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 goto onError;
1944
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001945 Py_XDECREF(errorHandler);
1946 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 return (PyObject *)unicode;
1948
1949onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001950 Py_XDECREF(errorHandler);
1951 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 Py_DECREF(unicode);
1953 return NULL;
1954}
1955
Tim Peters602f7402002-04-27 18:03:26 +00001956/* Allocation strategy: if the string is short, convert into a stack buffer
1957 and allocate exactly as much space needed at the end. Else allocate the
1958 maximum possible needed (4 result bytes per Unicode character), and return
1959 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001960*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001961PyObject *
1962PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001963 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001964 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965{
Tim Peters602f7402002-04-27 18:03:26 +00001966#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001967
Martin v. Löwis18e16552006-02-15 17:27:45 +00001968 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001969 PyObject *v; /* result string object */
1970 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001971 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001972 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001973 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001974
Tim Peters602f7402002-04-27 18:03:26 +00001975 assert(s != NULL);
1976 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
Tim Peters602f7402002-04-27 18:03:26 +00001978 if (size <= MAX_SHORT_UNICHARS) {
1979 /* Write into the stack buffer; nallocated can't overflow.
1980 * At the end, we'll allocate exactly as much heap space as it
1981 * turns out we need.
1982 */
1983 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1984 v = NULL; /* will allocate after we're done */
1985 p = stackbuf;
1986 }
1987 else {
1988 /* Overallocate on the heap, and give the excess back at the end. */
1989 nallocated = size * 4;
1990 if (nallocated / 4 != size) /* overflow! */
1991 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001992 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001993 if (v == NULL)
1994 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001995 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001996 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001997
Tim Peters602f7402002-04-27 18:03:26 +00001998 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001999 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002000
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002001 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002002 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002004
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002006 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002007 *p++ = (char)(0xc0 | (ch >> 6));
2008 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002009 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002010 else {
Tim Peters602f7402002-04-27 18:03:26 +00002011 /* Encode UCS2 Unicode ordinals */
2012 if (ch < 0x10000) {
2013 /* Special case: check for high surrogate */
2014 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2015 Py_UCS4 ch2 = s[i];
2016 /* Check for low surrogate and combine the two to
2017 form a UCS4 value */
2018 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002019 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002020 i++;
2021 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002022 }
Tim Peters602f7402002-04-27 18:03:26 +00002023 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002024 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002025 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002026 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2027 *p++ = (char)(0x80 | (ch & 0x3f));
2028 continue;
2029 }
2030encodeUCS4:
2031 /* Encode UCS4 Unicode ordinals */
2032 *p++ = (char)(0xf0 | (ch >> 18));
2033 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2034 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2035 *p++ = (char)(0x80 | (ch & 0x3f));
2036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002038
Tim Peters602f7402002-04-27 18:03:26 +00002039 if (v == NULL) {
2040 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002041 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002042 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002043 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002044 }
2045 else {
2046 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002047 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002048 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002049 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002052
Tim Peters602f7402002-04-27 18:03:26 +00002053#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054}
2055
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2057{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 if (!PyUnicode_Check(unicode)) {
2059 PyErr_BadArgument();
2060 return NULL;
2061 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002062 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2063 PyUnicode_GET_SIZE(unicode),
2064 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065}
2066
Walter Dörwald6e390802007-08-17 16:41:28 +00002067/* --- UTF-32 Codec ------------------------------------------------------- */
2068
2069PyObject *
2070PyUnicode_DecodeUTF32(const char *s,
2071 Py_ssize_t size,
2072 const char *errors,
2073 int *byteorder)
2074{
2075 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2076}
2077
2078PyObject *
2079PyUnicode_DecodeUTF32Stateful(const char *s,
2080 Py_ssize_t size,
2081 const char *errors,
2082 int *byteorder,
2083 Py_ssize_t *consumed)
2084{
2085 const char *starts = s;
2086 Py_ssize_t startinpos;
2087 Py_ssize_t endinpos;
2088 Py_ssize_t outpos;
2089 PyUnicodeObject *unicode;
2090 Py_UNICODE *p;
2091#ifndef Py_UNICODE_WIDE
2092 int i, pairs;
2093#else
2094 const int pairs = 0;
2095#endif
2096 const unsigned char *q, *e;
2097 int bo = 0; /* assume native ordering by default */
2098 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002099 /* Offsets from q for retrieving bytes in the right order. */
2100#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2101 int iorder[] = {0, 1, 2, 3};
2102#else
2103 int iorder[] = {3, 2, 1, 0};
2104#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002105 PyObject *errorHandler = NULL;
2106 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002107 /* On narrow builds we split characters outside the BMP into two
2108 codepoints => count how much extra space we need. */
2109#ifndef Py_UNICODE_WIDE
2110 for (i = pairs = 0; i < size/4; i++)
2111 if (((Py_UCS4 *)s)[i] >= 0x10000)
2112 pairs++;
2113#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002114
2115 /* This might be one to much, because of a BOM */
2116 unicode = _PyUnicode_New((size+3)/4+pairs);
2117 if (!unicode)
2118 return NULL;
2119 if (size == 0)
2120 return (PyObject *)unicode;
2121
2122 /* Unpack UTF-32 encoded data */
2123 p = unicode->str;
2124 q = (unsigned char *)s;
2125 e = q + size;
2126
2127 if (byteorder)
2128 bo = *byteorder;
2129
2130 /* Check for BOM marks (U+FEFF) in the input and adjust current
2131 byte order setting accordingly. In native mode, the leading BOM
2132 mark is skipped, in all other modes, it is copied to the output
2133 stream as-is (giving a ZWNBSP character). */
2134 if (bo == 0) {
2135 if (size >= 4) {
2136 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2137 (q[iorder[1]] << 8) | q[iorder[0]];
2138#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2139 if (bom == 0x0000FEFF) {
2140 q += 4;
2141 bo = -1;
2142 }
2143 else if (bom == 0xFFFE0000) {
2144 q += 4;
2145 bo = 1;
2146 }
2147#else
2148 if (bom == 0x0000FEFF) {
2149 q += 4;
2150 bo = 1;
2151 }
2152 else if (bom == 0xFFFE0000) {
2153 q += 4;
2154 bo = -1;
2155 }
2156#endif
2157 }
2158 }
2159
2160 if (bo == -1) {
2161 /* force LE */
2162 iorder[0] = 0;
2163 iorder[1] = 1;
2164 iorder[2] = 2;
2165 iorder[3] = 3;
2166 }
2167 else if (bo == 1) {
2168 /* force BE */
2169 iorder[0] = 3;
2170 iorder[1] = 2;
2171 iorder[2] = 1;
2172 iorder[3] = 0;
2173 }
2174
2175 while (q < e) {
2176 Py_UCS4 ch;
2177 /* remaining bytes at the end? (size should be divisible by 4) */
2178 if (e-q<4) {
2179 if (consumed)
2180 break;
2181 errmsg = "truncated data";
2182 startinpos = ((const char *)q)-starts;
2183 endinpos = ((const char *)e)-starts;
2184 goto utf32Error;
2185 /* The remaining input chars are ignored if the callback
2186 chooses to skip the input */
2187 }
2188 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2189 (q[iorder[1]] << 8) | q[iorder[0]];
2190
2191 if (ch >= 0x110000)
2192 {
2193 errmsg = "codepoint not in range(0x110000)";
2194 startinpos = ((const char *)q)-starts;
2195 endinpos = startinpos+4;
2196 goto utf32Error;
2197 }
2198#ifndef Py_UNICODE_WIDE
2199 if (ch >= 0x10000)
2200 {
2201 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2202 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2203 }
2204 else
2205#endif
2206 *p++ = ch;
2207 q += 4;
2208 continue;
2209 utf32Error:
2210 outpos = p-PyUnicode_AS_UNICODE(unicode);
2211 if (unicode_decode_call_errorhandler(
2212 errors, &errorHandler,
2213 "utf32", errmsg,
2214 starts, size, &startinpos, &endinpos, &exc, &s,
2215 (PyObject **)&unicode, &outpos, &p))
2216 goto onError;
2217 }
2218
2219 if (byteorder)
2220 *byteorder = bo;
2221
2222 if (consumed)
2223 *consumed = (const char *)q-starts;
2224
2225 /* Adjust length */
2226 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2227 goto onError;
2228
2229 Py_XDECREF(errorHandler);
2230 Py_XDECREF(exc);
2231 return (PyObject *)unicode;
2232
2233onError:
2234 Py_DECREF(unicode);
2235 Py_XDECREF(errorHandler);
2236 Py_XDECREF(exc);
2237 return NULL;
2238}
2239
2240PyObject *
2241PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2242 Py_ssize_t size,
2243 const char *errors,
2244 int byteorder)
2245{
2246 PyObject *v;
2247 unsigned char *p;
2248#ifndef Py_UNICODE_WIDE
2249 int i, pairs;
2250#else
2251 const int pairs = 0;
2252#endif
2253 /* Offsets from p for storing byte pairs in the right order. */
2254#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2255 int iorder[] = {0, 1, 2, 3};
2256#else
2257 int iorder[] = {3, 2, 1, 0};
2258#endif
2259
2260#define STORECHAR(CH) \
2261 do { \
2262 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2263 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2264 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2265 p[iorder[0]] = (CH) & 0xff; \
2266 p += 4; \
2267 } while(0)
2268
2269 /* In narrow builds we can output surrogate pairs as one codepoint,
2270 so we need less space. */
2271#ifndef Py_UNICODE_WIDE
2272 for (i = pairs = 0; i < size-1; i++)
2273 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2274 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2275 pairs++;
2276#endif
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002277 v = PyString_FromStringAndSize(NULL,
Walter Dörwald6e390802007-08-17 16:41:28 +00002278 4 * (size - pairs + (byteorder == 0)));
2279 if (v == NULL)
2280 return NULL;
2281
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002282 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002283 if (byteorder == 0)
2284 STORECHAR(0xFEFF);
2285 if (size == 0)
2286 return v;
2287
2288 if (byteorder == -1) {
2289 /* force LE */
2290 iorder[0] = 0;
2291 iorder[1] = 1;
2292 iorder[2] = 2;
2293 iorder[3] = 3;
2294 }
2295 else if (byteorder == 1) {
2296 /* force BE */
2297 iorder[0] = 3;
2298 iorder[1] = 2;
2299 iorder[2] = 1;
2300 iorder[3] = 0;
2301 }
2302
2303 while (size-- > 0) {
2304 Py_UCS4 ch = *s++;
2305#ifndef Py_UNICODE_WIDE
2306 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2307 Py_UCS4 ch2 = *s;
2308 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2309 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2310 s++;
2311 size--;
2312 }
2313 }
2314#endif
2315 STORECHAR(ch);
2316 }
2317 return v;
2318#undef STORECHAR
2319}
2320
2321PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2322{
2323 if (!PyUnicode_Check(unicode)) {
2324 PyErr_BadArgument();
2325 return NULL;
2326 }
2327 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2328 PyUnicode_GET_SIZE(unicode),
2329 NULL,
2330 0);
2331}
2332
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333/* --- UTF-16 Codec ------------------------------------------------------- */
2334
Tim Peters772747b2001-08-09 22:21:55 +00002335PyObject *
2336PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002337 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002338 const char *errors,
2339 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340{
Walter Dörwald69652032004-09-07 20:24:22 +00002341 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2342}
2343
2344PyObject *
2345PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002346 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002347 const char *errors,
2348 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002349 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t startinpos;
2353 Py_ssize_t endinpos;
2354 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 PyUnicodeObject *unicode;
2356 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002357 const unsigned char *q, *e;
2358 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002359 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002360 /* Offsets from q for retrieving byte pairs in the right order. */
2361#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2362 int ihi = 1, ilo = 0;
2363#else
2364 int ihi = 0, ilo = 1;
2365#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002366 PyObject *errorHandler = NULL;
2367 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368
2369 /* Note: size will always be longer than the resulting Unicode
2370 character count */
2371 unicode = _PyUnicode_New(size);
2372 if (!unicode)
2373 return NULL;
2374 if (size == 0)
2375 return (PyObject *)unicode;
2376
2377 /* Unpack UTF-16 encoded data */
2378 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002379 q = (unsigned char *)s;
2380 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
2382 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002383 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002385 /* Check for BOM marks (U+FEFF) in the input and adjust current
2386 byte order setting accordingly. In native mode, the leading BOM
2387 mark is skipped, in all other modes, it is copied to the output
2388 stream as-is (giving a ZWNBSP character). */
2389 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002390 if (size >= 2) {
2391 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002392#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002393 if (bom == 0xFEFF) {
2394 q += 2;
2395 bo = -1;
2396 }
2397 else if (bom == 0xFFFE) {
2398 q += 2;
2399 bo = 1;
2400 }
Tim Petersced69f82003-09-16 20:30:58 +00002401#else
Walter Dörwald69652032004-09-07 20:24:22 +00002402 if (bom == 0xFEFF) {
2403 q += 2;
2404 bo = 1;
2405 }
2406 else if (bom == 0xFFFE) {
2407 q += 2;
2408 bo = -1;
2409 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002410#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002411 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413
Tim Peters772747b2001-08-09 22:21:55 +00002414 if (bo == -1) {
2415 /* force LE */
2416 ihi = 1;
2417 ilo = 0;
2418 }
2419 else if (bo == 1) {
2420 /* force BE */
2421 ihi = 0;
2422 ilo = 1;
2423 }
2424
2425 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002426 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002427 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002428 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002429 if (consumed)
2430 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002431 errmsg = "truncated data";
2432 startinpos = ((const char *)q)-starts;
2433 endinpos = ((const char *)e)-starts;
2434 goto utf16Error;
2435 /* The remaining input chars are ignored if the callback
2436 chooses to skip the input */
2437 }
2438 ch = (q[ihi] << 8) | q[ilo];
2439
Tim Peters772747b2001-08-09 22:21:55 +00002440 q += 2;
2441
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 if (ch < 0xD800 || ch > 0xDFFF) {
2443 *p++ = ch;
2444 continue;
2445 }
2446
2447 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002448 if (q >= e) {
2449 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002450 startinpos = (((const char *)q)-2)-starts;
2451 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002452 goto utf16Error;
2453 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002454 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002455 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2456 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002457 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002458#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002459 *p++ = ch;
2460 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002461#else
2462 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002463#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002464 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002465 }
2466 else {
2467 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 startinpos = (((const char *)q)-4)-starts;
2469 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002470 goto utf16Error;
2471 }
2472
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002474 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 startinpos = (((const char *)q)-2)-starts;
2476 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002477 /* Fall through to report the error */
2478
2479 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 outpos = p-PyUnicode_AS_UNICODE(unicode);
2481 if (unicode_decode_call_errorhandler(
2482 errors, &errorHandler,
2483 "utf16", errmsg,
2484 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2485 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002486 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 }
2488
2489 if (byteorder)
2490 *byteorder = bo;
2491
Walter Dörwald69652032004-09-07 20:24:22 +00002492 if (consumed)
2493 *consumed = (const char *)q-starts;
2494
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002496 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 goto onError;
2498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 Py_XDECREF(errorHandler);
2500 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 return (PyObject *)unicode;
2502
2503onError:
2504 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002505 Py_XDECREF(errorHandler);
2506 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 return NULL;
2508}
2509
Tim Peters772747b2001-08-09 22:21:55 +00002510PyObject *
2511PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002512 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002513 const char *errors,
2514 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515{
2516 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002517 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002518#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002519 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002520#else
2521 const int pairs = 0;
2522#endif
Tim Peters772747b2001-08-09 22:21:55 +00002523 /* Offsets from p for storing byte pairs in the right order. */
2524#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2525 int ihi = 1, ilo = 0;
2526#else
2527 int ihi = 0, ilo = 1;
2528#endif
2529
2530#define STORECHAR(CH) \
2531 do { \
2532 p[ihi] = ((CH) >> 8) & 0xff; \
2533 p[ilo] = (CH) & 0xff; \
2534 p += 2; \
2535 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002537#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002538 for (i = pairs = 0; i < size; i++)
2539 if (s[i] >= 0x10000)
2540 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002541#endif
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002542 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002543 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 if (v == NULL)
2545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002547 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002549 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002550 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002551 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002552
2553 if (byteorder == -1) {
2554 /* force LE */
2555 ihi = 1;
2556 ilo = 0;
2557 }
2558 else if (byteorder == 1) {
2559 /* force BE */
2560 ihi = 0;
2561 ilo = 1;
2562 }
2563
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002564 while (size-- > 0) {
2565 Py_UNICODE ch = *s++;
2566 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002567#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002568 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002569 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2570 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002572#endif
Tim Peters772747b2001-08-09 22:21:55 +00002573 STORECHAR(ch);
2574 if (ch2)
2575 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002578#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579}
2580
2581PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2582{
2583 if (!PyUnicode_Check(unicode)) {
2584 PyErr_BadArgument();
2585 return NULL;
2586 }
2587 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2588 PyUnicode_GET_SIZE(unicode),
2589 NULL,
2590 0);
2591}
2592
2593/* --- Unicode Escape Codec ----------------------------------------------- */
2594
Fredrik Lundh06d12682001-01-24 07:59:11 +00002595static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002596
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002598 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 const char *errors)
2600{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002601 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002602 Py_ssize_t startinpos;
2603 Py_ssize_t endinpos;
2604 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002605 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002609 char* message;
2610 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002611 PyObject *errorHandler = NULL;
2612 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002613
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 /* Escaped strings will always be longer than the resulting
2615 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 length after conversion to the true value.
2617 (but if the error callback returns a long replacement string
2618 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 v = _PyUnicode_New(size);
2620 if (v == NULL)
2621 goto onError;
2622 if (size == 0)
2623 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002627
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 while (s < end) {
2629 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002630 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632
2633 /* Non-escape characters are interpreted as Unicode ordinals */
2634 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002635 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 continue;
2637 }
2638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002639 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 /* \ - Escapes */
2641 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002642 c = *s++;
2643 if (s > end)
2644 c = '\0'; /* Invalid after \ */
2645 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646
2647 /* \x escapes */
2648 case '\n': break;
2649 case '\\': *p++ = '\\'; break;
2650 case '\'': *p++ = '\''; break;
2651 case '\"': *p++ = '\"'; break;
2652 case 'b': *p++ = '\b'; break;
2653 case 'f': *p++ = '\014'; break; /* FF */
2654 case 't': *p++ = '\t'; break;
2655 case 'n': *p++ = '\n'; break;
2656 case 'r': *p++ = '\r'; break;
2657 case 'v': *p++ = '\013'; break; /* VT */
2658 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2659
2660 /* \OOO (octal) escapes */
2661 case '0': case '1': case '2': case '3':
2662 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002663 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002664 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002665 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002666 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002667 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002669 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 break;
2671
Fredrik Lundhccc74732001-02-18 22:13:49 +00002672 /* hex escapes */
2673 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002675 digits = 2;
2676 message = "truncated \\xXX escape";
2677 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678
Fredrik Lundhccc74732001-02-18 22:13:49 +00002679 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002681 digits = 4;
2682 message = "truncated \\uXXXX escape";
2683 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684
Fredrik Lundhccc74732001-02-18 22:13:49 +00002685 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002686 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002687 digits = 8;
2688 message = "truncated \\UXXXXXXXX escape";
2689 hexescape:
2690 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002691 outpos = p-PyUnicode_AS_UNICODE(v);
2692 if (s+digits>end) {
2693 endinpos = size;
2694 if (unicode_decode_call_errorhandler(
2695 errors, &errorHandler,
2696 "unicodeescape", "end of string in escape sequence",
2697 starts, size, &startinpos, &endinpos, &exc, &s,
2698 (PyObject **)&v, &outpos, &p))
2699 goto onError;
2700 goto nextByte;
2701 }
2702 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002703 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002704 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 endinpos = (s+i+1)-starts;
2706 if (unicode_decode_call_errorhandler(
2707 errors, &errorHandler,
2708 "unicodeescape", message,
2709 starts, size, &startinpos, &endinpos, &exc, &s,
2710 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002711 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002713 }
2714 chr = (chr<<4) & ~0xF;
2715 if (c >= '0' && c <= '9')
2716 chr += c - '0';
2717 else if (c >= 'a' && c <= 'f')
2718 chr += 10 + c - 'a';
2719 else
2720 chr += 10 + c - 'A';
2721 }
2722 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002723 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 /* _decoding_error will have already written into the
2725 target buffer. */
2726 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002727 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002728 /* when we get here, chr is a 32-bit unicode character */
2729 if (chr <= 0xffff)
2730 /* UCS-2 character */
2731 *p++ = (Py_UNICODE) chr;
2732 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002733 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002734 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002735#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002736 *p++ = chr;
2737#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002738 chr -= 0x10000L;
2739 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002740 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002741#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002742 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 endinpos = s-starts;
2744 outpos = p-PyUnicode_AS_UNICODE(v);
2745 if (unicode_decode_call_errorhandler(
2746 errors, &errorHandler,
2747 "unicodeescape", "illegal Unicode character",
2748 starts, size, &startinpos, &endinpos, &exc, &s,
2749 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002750 goto onError;
2751 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002752 break;
2753
2754 /* \N{name} */
2755 case 'N':
2756 message = "malformed \\N character escape";
2757 if (ucnhash_CAPI == NULL) {
2758 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002759 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002760 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002761 if (m == NULL)
2762 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002763 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002764 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002765 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002766 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002767 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002768 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002769 if (ucnhash_CAPI == NULL)
2770 goto ucnhashError;
2771 }
2772 if (*s == '{') {
2773 const char *start = s+1;
2774 /* look for the closing brace */
2775 while (*s != '}' && s < end)
2776 s++;
2777 if (s > start && s < end && *s == '}') {
2778 /* found a name. look it up in the unicode database */
2779 message = "unknown Unicode character name";
2780 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002781 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002782 goto store;
2783 }
2784 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 endinpos = s-starts;
2786 outpos = p-PyUnicode_AS_UNICODE(v);
2787 if (unicode_decode_call_errorhandler(
2788 errors, &errorHandler,
2789 "unicodeescape", message,
2790 starts, size, &startinpos, &endinpos, &exc, &s,
2791 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002792 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 break;
2794
2795 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002796 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002797 message = "\\ at end of string";
2798 s--;
2799 endinpos = s-starts;
2800 outpos = p-PyUnicode_AS_UNICODE(v);
2801 if (unicode_decode_call_errorhandler(
2802 errors, &errorHandler,
2803 "unicodeescape", message,
2804 starts, size, &startinpos, &endinpos, &exc, &s,
2805 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002806 goto onError;
2807 }
2808 else {
2809 *p++ = '\\';
2810 *p++ = (unsigned char)s[-1];
2811 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 nextByte:
2815 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002817 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002819 Py_XDECREF(errorHandler);
2820 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002822
Fredrik Lundhccc74732001-02-18 22:13:49 +00002823ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002824 PyErr_SetString(
2825 PyExc_UnicodeError,
2826 "\\N escapes not supported (can't load unicodedata module)"
2827 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002828 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 Py_XDECREF(errorHandler);
2830 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002831 return NULL;
2832
Fredrik Lundhccc74732001-02-18 22:13:49 +00002833onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 Py_XDECREF(errorHandler);
2836 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 return NULL;
2838}
2839
2840/* Return a Unicode-Escape string version of the Unicode object.
2841
2842 If quotes is true, the string is enclosed in u"" or u'' quotes as
2843 appropriate.
2844
2845*/
2846
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002847Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002848 Py_ssize_t size,
2849 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002850{
2851 /* like wcschr, but doesn't stop at NULL characters */
2852
2853 while (size-- > 0) {
2854 if (*s == ch)
2855 return s;
2856 s++;
2857 }
2858
2859 return NULL;
2860}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002861
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862static
2863PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 int quotes)
2866{
2867 PyObject *repr;
2868 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002870 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871
Neal Norwitz17753ec2006-08-21 22:21:19 +00002872 /* XXX(nnorwitz): rather than over-allocating, it would be
2873 better to choose a different scheme. Perhaps scan the
2874 first N-chars of the string and allocate based on that size.
2875 */
2876 /* Initial allocation is based on the longest-possible unichr
2877 escape.
2878
2879 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2880 unichr, so in this case it's the longest unichr escape. In
2881 narrow (UTF-16) builds this is five chars per source unichr
2882 since there are two unichrs in the surrogate pair, so in narrow
2883 (UTF-16) builds it's not the longest unichr escape.
2884
2885 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2886 so in the narrow (UTF-16) build case it's the longest unichr
2887 escape.
2888 */
2889
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002890 repr = PyString_FromStringAndSize(NULL,
Neal Norwitz17753ec2006-08-21 22:21:19 +00002891 2
2892#ifdef Py_UNICODE_WIDE
2893 + 10*size
2894#else
2895 + 6*size
2896#endif
2897 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 if (repr == NULL)
2899 return NULL;
2900
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002901 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902
2903 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002905 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 !findchar(s, size, '"')) ? '"' : '\'';
2907 }
2908 while (size-- > 0) {
2909 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002910
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002911 /* Escape quotes and backslashes */
2912 if ((quotes &&
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002913 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914 *p++ = '\\';
2915 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002916 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002917 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002918
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002919#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002920 /* Map 21-bit characters to '\U00xxxxxx' */
2921 else if (ch >= 0x10000) {
2922 *p++ = '\\';
2923 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002924 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2925 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2926 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2927 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2928 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2929 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2930 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002931 *p++ = hexdigit[ch & 0x0000000F];
2932 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002933 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002934#else
2935 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002936 else if (ch >= 0xD800 && ch < 0xDC00) {
2937 Py_UNICODE ch2;
2938 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002939
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002940 ch2 = *s++;
2941 size--;
2942 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2943 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2944 *p++ = '\\';
2945 *p++ = 'U';
2946 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2947 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2948 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2949 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2950 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2951 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2952 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2953 *p++ = hexdigit[ucs & 0x0000000F];
2954 continue;
2955 }
2956 /* Fall through: isolated surrogates are copied as-is */
2957 s--;
2958 size++;
2959 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002960#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002961
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002963 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 *p++ = '\\';
2965 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002966 *p++ = hexdigit[(ch >> 12) & 0x000F];
2967 *p++ = hexdigit[(ch >> 8) & 0x000F];
2968 *p++ = hexdigit[(ch >> 4) & 0x000F];
2969 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002971
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002972 /* Map special whitespace to '\t', \n', '\r' */
2973 else if (ch == '\t') {
2974 *p++ = '\\';
2975 *p++ = 't';
2976 }
2977 else if (ch == '\n') {
2978 *p++ = '\\';
2979 *p++ = 'n';
2980 }
2981 else if (ch == '\r') {
2982 *p++ = '\\';
2983 *p++ = 'r';
2984 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002985
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002986 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002987 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002989 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002990 *p++ = hexdigit[(ch >> 4) & 0x000F];
2991 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002992 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002993
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 /* Copy everything else as-is */
2995 else
2996 *p++ = (char) ch;
2997 }
2998 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002999 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000
3001 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003002 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 return repr;
3004}
3005
3006PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003007 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008{
3009 return unicodeescape_string(s, size, 0);
3010}
3011
3012PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3013{
3014 if (!PyUnicode_Check(unicode)) {
3015 PyErr_BadArgument();
3016 return NULL;
3017 }
3018 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3019 PyUnicode_GET_SIZE(unicode));
3020}
3021
3022/* --- Raw Unicode Escape Codec ------------------------------------------- */
3023
3024PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003025 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026 const char *errors)
3027{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003028 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003029 Py_ssize_t startinpos;
3030 Py_ssize_t endinpos;
3031 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 const char *end;
3035 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 PyObject *errorHandler = NULL;
3037 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003038
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 /* Escaped strings will always be longer than the resulting
3040 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 length after conversion to the true value. (But decoding error
3042 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 v = _PyUnicode_New(size);
3044 if (v == NULL)
3045 goto onError;
3046 if (size == 0)
3047 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 end = s + size;
3050 while (s < end) {
3051 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003052 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003054 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055
3056 /* Non-escape characters are interpreted as Unicode ordinals */
3057 if (*s != '\\') {
3058 *p++ = (unsigned char)*s++;
3059 continue;
3060 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062
3063 /* \u-escapes are only interpreted iff the number of leading
3064 backslashes if odd */
3065 bs = s;
3066 for (;s < end;) {
3067 if (*s != '\\')
3068 break;
3069 *p++ = (unsigned char)*s++;
3070 }
3071 if (((s - bs) & 1) == 0 ||
3072 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003073 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 continue;
3075 }
3076 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003077 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 s++;
3079
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003080 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003082 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 endinpos = s-starts;
3086 if (unicode_decode_call_errorhandler(
3087 errors, &errorHandler,
3088 "rawunicodeescape", "truncated \\uXXXX",
3089 starts, size, &startinpos, &endinpos, &exc, &s,
3090 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
3094 x = (x<<4) & ~0xF;
3095 if (c >= '0' && c <= '9')
3096 x += c - '0';
3097 else if (c >= 'a' && c <= 'f')
3098 x += 10 + c - 'a';
3099 else
3100 x += 10 + c - 'A';
3101 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003102 if (x <= 0xffff)
3103 /* UCS-2 character */
3104 *p++ = (Py_UNICODE) x;
3105 else if (x <= 0x10ffff) {
3106 /* UCS-4 character. Either store directly, or as
3107 surrogate pair. */
3108#ifdef Py_UNICODE_WIDE
Amaury Forgeot d'Arcfac02fa2008-03-24 21:04:10 +00003109 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003110#else
3111 x -= 0x10000L;
3112 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3113 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3114#endif
3115 } else {
3116 endinpos = s-starts;
3117 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003118 if (unicode_decode_call_errorhandler(
3119 errors, &errorHandler,
3120 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3121 starts, size, &startinpos, &endinpos, &exc, &s,
3122 (PyObject **)&v, &outpos, &p))
3123 goto onError;
3124 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003125 nextByte:
3126 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003128 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 Py_XDECREF(errorHandler);
3131 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003133
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 onError:
3135 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 Py_XDECREF(errorHandler);
3137 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 return NULL;
3139}
3140
3141PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003142 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143{
3144 PyObject *repr;
3145 char *p;
3146 char *q;
3147
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003148 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003150#ifdef Py_UNICODE_WIDE
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003151 repr = PyString_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003152#else
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003153 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003154#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 if (repr == NULL)
3156 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003157 if (size == 0)
3158 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003160 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 while (size-- > 0) {
3162 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003163#ifdef Py_UNICODE_WIDE
3164 /* Map 32-bit characters to '\Uxxxxxxxx' */
3165 if (ch >= 0x10000) {
3166 *p++ = '\\';
3167 *p++ = 'U';
3168 *p++ = hexdigit[(ch >> 28) & 0xf];
3169 *p++ = hexdigit[(ch >> 24) & 0xf];
3170 *p++ = hexdigit[(ch >> 20) & 0xf];
3171 *p++ = hexdigit[(ch >> 16) & 0xf];
3172 *p++ = hexdigit[(ch >> 12) & 0xf];
3173 *p++ = hexdigit[(ch >> 8) & 0xf];
3174 *p++ = hexdigit[(ch >> 4) & 0xf];
3175 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003176 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003177 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003178#else
3179 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3180 if (ch >= 0xD800 && ch < 0xDC00) {
3181 Py_UNICODE ch2;
3182 Py_UCS4 ucs;
3183
3184 ch2 = *s++;
3185 size--;
3186 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3187 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3188 *p++ = '\\';
3189 *p++ = 'U';
3190 *p++ = hexdigit[(ucs >> 28) & 0xf];
3191 *p++ = hexdigit[(ucs >> 24) & 0xf];
3192 *p++ = hexdigit[(ucs >> 20) & 0xf];
3193 *p++ = hexdigit[(ucs >> 16) & 0xf];
3194 *p++ = hexdigit[(ucs >> 12) & 0xf];
3195 *p++ = hexdigit[(ucs >> 8) & 0xf];
3196 *p++ = hexdigit[(ucs >> 4) & 0xf];
3197 *p++ = hexdigit[ucs & 0xf];
3198 continue;
3199 }
3200 /* Fall through: isolated surrogates are copied as-is */
3201 s--;
3202 size++;
3203 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003204#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 /* Map 16-bit characters to '\uxxxx' */
3206 if (ch >= 256) {
3207 *p++ = '\\';
3208 *p++ = 'u';
3209 *p++ = hexdigit[(ch >> 12) & 0xf];
3210 *p++ = hexdigit[(ch >> 8) & 0xf];
3211 *p++ = hexdigit[(ch >> 4) & 0xf];
3212 *p++ = hexdigit[ch & 15];
3213 }
3214 /* Copy everything else as-is */
3215 else
3216 *p++ = (char) ch;
3217 }
3218 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003219 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 return repr;
3221}
3222
3223PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3224{
3225 if (!PyUnicode_Check(unicode)) {
3226 PyErr_BadArgument();
3227 return NULL;
3228 }
3229 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3230 PyUnicode_GET_SIZE(unicode));
3231}
3232
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003233/* --- Unicode Internal Codec ------------------------------------------- */
3234
3235PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003236 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003237 const char *errors)
3238{
3239 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003240 Py_ssize_t startinpos;
3241 Py_ssize_t endinpos;
3242 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003243 PyUnicodeObject *v;
3244 Py_UNICODE *p;
3245 const char *end;
3246 const char *reason;
3247 PyObject *errorHandler = NULL;
3248 PyObject *exc = NULL;
3249
Neal Norwitzd43069c2006-01-08 01:12:10 +00003250#ifdef Py_UNICODE_WIDE
3251 Py_UNICODE unimax = PyUnicode_GetMax();
3252#endif
3253
Armin Rigo7ccbca92006-10-04 12:17:45 +00003254 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003255 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3256 if (v == NULL)
3257 goto onError;
3258 if (PyUnicode_GetSize((PyObject *)v) == 0)
3259 return (PyObject *)v;
3260 p = PyUnicode_AS_UNICODE(v);
3261 end = s + size;
3262
3263 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003264 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003265 /* We have to sanity check the raw data, otherwise doom looms for
3266 some malformed UCS-4 data. */
3267 if (
3268 #ifdef Py_UNICODE_WIDE
3269 *p > unimax || *p < 0 ||
3270 #endif
3271 end-s < Py_UNICODE_SIZE
3272 )
3273 {
3274 startinpos = s - starts;
3275 if (end-s < Py_UNICODE_SIZE) {
3276 endinpos = end-starts;
3277 reason = "truncated input";
3278 }
3279 else {
3280 endinpos = s - starts + Py_UNICODE_SIZE;
3281 reason = "illegal code point (> 0x10FFFF)";
3282 }
3283 outpos = p - PyUnicode_AS_UNICODE(v);
3284 if (unicode_decode_call_errorhandler(
3285 errors, &errorHandler,
3286 "unicode_internal", reason,
3287 starts, size, &startinpos, &endinpos, &exc, &s,
3288 (PyObject **)&v, &outpos, &p)) {
3289 goto onError;
3290 }
3291 }
3292 else {
3293 p++;
3294 s += Py_UNICODE_SIZE;
3295 }
3296 }
3297
Martin v. Löwis412fb672006-04-13 06:34:32 +00003298 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003299 goto onError;
3300 Py_XDECREF(errorHandler);
3301 Py_XDECREF(exc);
3302 return (PyObject *)v;
3303
3304 onError:
3305 Py_XDECREF(v);
3306 Py_XDECREF(errorHandler);
3307 Py_XDECREF(exc);
3308 return NULL;
3309}
3310
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311/* --- Latin-1 Codec ------------------------------------------------------ */
3312
3313PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003314 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 const char *errors)
3316{
3317 PyUnicodeObject *v;
3318 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003321 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003322 Py_UNICODE r = *(unsigned char*)s;
3323 return PyUnicode_FromUnicode(&r, 1);
3324 }
3325
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 v = _PyUnicode_New(size);
3327 if (v == NULL)
3328 goto onError;
3329 if (size == 0)
3330 return (PyObject *)v;
3331 p = PyUnicode_AS_UNICODE(v);
3332 while (size-- > 0)
3333 *p++ = (unsigned char)*s++;
3334 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003335
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 onError:
3337 Py_XDECREF(v);
3338 return NULL;
3339}
3340
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341/* create or adjust a UnicodeEncodeError */
3342static void make_encode_exception(PyObject **exceptionObject,
3343 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003344 const Py_UNICODE *unicode, Py_ssize_t size,
3345 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 if (*exceptionObject == NULL) {
3349 *exceptionObject = PyUnicodeEncodeError_Create(
3350 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 }
3352 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003353 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3354 goto onError;
3355 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3356 goto onError;
3357 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3358 goto onError;
3359 return;
3360 onError:
3361 Py_DECREF(*exceptionObject);
3362 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 }
3364}
3365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366/* raises a UnicodeEncodeError */
3367static void raise_encode_exception(PyObject **exceptionObject,
3368 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003369 const Py_UNICODE *unicode, Py_ssize_t size,
3370 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 const char *reason)
3372{
3373 make_encode_exception(exceptionObject,
3374 encoding, unicode, size, startpos, endpos, reason);
3375 if (*exceptionObject != NULL)
3376 PyCodec_StrictErrors(*exceptionObject);
3377}
3378
3379/* error handling callback helper:
3380 build arguments, call the callback and check the arguments,
3381 put the result into newpos and return the replacement string, which
3382 has to be freed by the caller */
3383static PyObject *unicode_encode_call_errorhandler(const char *errors,
3384 PyObject **errorHandler,
3385 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003386 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3387 Py_ssize_t startpos, Py_ssize_t endpos,
3388 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003389{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003390 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391
3392 PyObject *restuple;
3393 PyObject *resunicode;
3394
3395 if (*errorHandler == NULL) {
3396 *errorHandler = PyCodec_LookupError(errors);
3397 if (*errorHandler == NULL)
3398 return NULL;
3399 }
3400
3401 make_encode_exception(exceptionObject,
3402 encoding, unicode, size, startpos, endpos, reason);
3403 if (*exceptionObject == NULL)
3404 return NULL;
3405
3406 restuple = PyObject_CallFunctionObjArgs(
3407 *errorHandler, *exceptionObject, NULL);
3408 if (restuple == NULL)
3409 return NULL;
3410 if (!PyTuple_Check(restuple)) {
3411 PyErr_Format(PyExc_TypeError, &argparse[4]);
3412 Py_DECREF(restuple);
3413 return NULL;
3414 }
3415 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3416 &resunicode, newpos)) {
3417 Py_DECREF(restuple);
3418 return NULL;
3419 }
3420 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003421 *newpos = size+*newpos;
3422 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003423 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003424 Py_DECREF(restuple);
3425 return NULL;
3426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 Py_INCREF(resunicode);
3428 Py_DECREF(restuple);
3429 return resunicode;
3430}
3431
/* Shared encoder for the Latin-1 and ASCII codecs.

   Encodes the size code units at p into a new string object, one byte per
   code unit, for every code unit below limit (256 selects the "latin-1"
   codec name and error message, any other value the "ascii" ones).
   Runs of unencodable code units are handled according to errors:
   "strict" (or NULL) raises, "replace" writes '?', "ignore" drops them,
   "xmlcharrefreplace" writes "&#N;" references, and any other name is
   looked up as an error handler callback.

   Returns the new string object, or NULL with an exception set. */
static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     const char *errors,
                                     int limit)
{
    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output */
    char *str;
    /* current output position */
    Py_ssize_t respos = 0;
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    res = PyString_FromStringAndSize(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyString_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            /* NOTE(review): this declaration shadows the function-level
               respos; the outer one is only assigned after the loop */
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* one '?' per unencodable code unit; this never needs more
                   room than the input itself provided */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str-PyString_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p):
                   "&#" + decimal digits + ";" for each code unit */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                    else
                        repsize += 2+5+1;
#else
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
#endif
                }
                /* room for what is written so far + the references + the
                   remaining input; grow to at least double when resizing */
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyString_Resize(&res, requiredsize))
                        goto onError;
                    /* the buffer may have moved: recompute the write pointer */
                    str = PyString_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                    encoding, reason, startp, size, &exc,
                    collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                /* NOTE(review): the estimate below counts the input after
                   collend, while encoding resumes at newpos (set by the
                   handler) -- confirm this holds when newpos < collend */
                respos = str-PyString_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyString_Resize(&res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyString_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        /* the replacement itself is not encodable: report
                           the original position as the error */
                        raise_encode_exception(&exc, encoding, startp, size,
                            unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                /* continue encoding where the handler told us to */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyString_AS_STRING(res);
    if (respos<ressize)
        /* If this fails res will be NULL */
        _PyString_Resize(&res, respos);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
3606
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003608 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 const char *errors)
3610{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612}
3613
3614PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3615{
3616 if (!PyUnicode_Check(unicode)) {
3617 PyErr_BadArgument();
3618 return NULL;
3619 }
3620 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3621 PyUnicode_GET_SIZE(unicode),
3622 NULL);
3623}
3624
3625/* --- 7-bit ASCII Codec -------------------------------------------------- */
3626
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003628 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 const char *errors)
3630{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632 PyUnicodeObject *v;
3633 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003634 Py_ssize_t startinpos;
3635 Py_ssize_t endinpos;
3636 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 const char *e;
3638 PyObject *errorHandler = NULL;
3639 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003640
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003642 if (size == 1 && *(unsigned char*)s < 128) {
3643 Py_UNICODE r = *(unsigned char*)s;
3644 return PyUnicode_FromUnicode(&r, 1);
3645 }
Tim Petersced69f82003-09-16 20:30:58 +00003646
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 v = _PyUnicode_New(size);
3648 if (v == NULL)
3649 goto onError;
3650 if (size == 0)
3651 return (PyObject *)v;
3652 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 e = s + size;
3654 while (s < e) {
3655 register unsigned char c = (unsigned char)*s;
3656 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 ++s;
3659 }
3660 else {
3661 startinpos = s-starts;
3662 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003663 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 if (unicode_decode_call_errorhandler(
3665 errors, &errorHandler,
3666 "ascii", "ordinal not in range(128)",
3667 starts, size, &startinpos, &endinpos, &exc, &s,
3668 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003672 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003673 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003674 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_XDECREF(errorHandler);
3676 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003678
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 onError:
3680 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 Py_XDECREF(errorHandler);
3682 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 return NULL;
3684}
3685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003687 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 const char *errors)
3689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691}
3692
3693PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3694{
3695 if (!PyUnicode_Check(unicode)) {
3696 PyErr_BadArgument();
3697 return NULL;
3698 }
3699 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3700 PyUnicode_GET_SIZE(unicode),
3701 NULL);
3702}
3703
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003704#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003705
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003706/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003707
Martin v. Löwisd8251432006-06-14 05:21:04 +00003708#if SIZEOF_INT < SIZEOF_SSIZE_T
3709#define NEED_RETRY
3710#endif
3711
3712/* XXX This code is limited to "true" double-byte encodings, as
3713 a) it assumes an incomplete character consists of a single byte, and
3714 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3715 encodings, see IsDBCSLeadByteEx documentation. */
3716
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte, else 0.
   A byte that IsDBCSLeadByte() accepts only counts if CharPrev() finds no
   earlier character, the previous character does not start with a lead
   byte, or the previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (candidate - before == 2);
}
3727
3728/*
3729 * Decode MBCS string into unicode object. If 'final' is set, converts
3730 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3731 */
3732static int decode_mbcs(PyUnicodeObject **v,
3733 const char *s, /* MBCS string */
3734 int size, /* sizeof MBCS string */
3735 int final)
3736{
3737 Py_UNICODE *p;
3738 Py_ssize_t n = 0;
3739 int usize = 0;
3740
3741 assert(size >= 0);
3742
3743 /* Skip trailing lead-byte unless 'final' is set */
3744 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3745 --size;
3746
3747 /* First get the size of the result */
3748 if (size > 0) {
3749 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3750 if (usize == 0) {
3751 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3752 return -1;
3753 }
3754 }
3755
3756 if (*v == NULL) {
3757 /* Create unicode object */
3758 *v = _PyUnicode_New(usize);
3759 if (*v == NULL)
3760 return -1;
3761 }
3762 else {
3763 /* Extend unicode object */
3764 n = PyUnicode_GET_SIZE(*v);
3765 if (_PyUnicode_Resize(v, n + usize) < 0)
3766 return -1;
3767 }
3768
3769 /* Do the conversion */
3770 if (size > 0) {
3771 p = PyUnicode_AS_UNICODE(*v) + n;
3772 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3773 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3774 return -1;
3775 }
3776 }
3777
3778 return size;
3779}
3780
3781PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3782 Py_ssize_t size,
3783 const char *errors,
3784 Py_ssize_t *consumed)
3785{
3786 PyUnicodeObject *v = NULL;
3787 int done;
3788
3789 if (consumed)
3790 *consumed = 0;
3791
3792#ifdef NEED_RETRY
3793 retry:
3794 if (size > INT_MAX)
3795 done = decode_mbcs(&v, s, INT_MAX, 0);
3796 else
3797#endif
3798 done = decode_mbcs(&v, s, (int)size, !consumed);
3799
3800 if (done < 0) {
3801 Py_XDECREF(v);
3802 return NULL;
3803 }
3804
3805 if (consumed)
3806 *consumed += done;
3807
3808#ifdef NEED_RETRY
3809 if (size > INT_MAX) {
3810 s += done;
3811 size -= done;
3812 goto retry;
3813 }
3814#endif
3815
3816 return (PyObject *)v;
3817}
3818
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003819PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003820 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003821 const char *errors)
3822{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003823 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3824}
3825
3826/*
3827 * Convert unicode into string object (MBCS).
3828 * Returns 0 if succeed, -1 otherwise.
3829 */
3830static int encode_mbcs(PyObject **repr,
3831 const Py_UNICODE *p, /* unicode */
3832 int size) /* size of unicode */
3833{
3834 int mbcssize = 0;
3835 Py_ssize_t n = 0;
3836
3837 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838
3839 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003840 if (size > 0) {
3841 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3842 if (mbcssize == 0) {
3843 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3844 return -1;
3845 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003846 }
3847
Martin v. Löwisd8251432006-06-14 05:21:04 +00003848 if (*repr == NULL) {
3849 /* Create string object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003850 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003851 if (*repr == NULL)
3852 return -1;
3853 }
3854 else {
3855 /* Extend string object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003856 n = PyString_Size(*repr);
3857 if (_PyString_Resize(repr, n + mbcssize) < 0)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003858 return -1;
3859 }
3860
3861 /* Do the conversion */
3862 if (size > 0) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003863 char *s = PyString_AS_STRING(*repr) + n;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003864 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3865 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3866 return -1;
3867 }
3868 }
3869
3870 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003871}
3872
3873PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003874 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003875 const char *errors)
3876{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877 PyObject *repr = NULL;
3878 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003879
Martin v. Löwisd8251432006-06-14 05:21:04 +00003880#ifdef NEED_RETRY
3881 retry:
3882 if (size > INT_MAX)
3883 ret = encode_mbcs(&repr, p, INT_MAX);
3884 else
3885#endif
3886 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003887
Martin v. Löwisd8251432006-06-14 05:21:04 +00003888 if (ret < 0) {
3889 Py_XDECREF(repr);
3890 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003891 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003892
3893#ifdef NEED_RETRY
3894 if (size > INT_MAX) {
3895 p += INT_MAX;
3896 size -= INT_MAX;
3897 goto retry;
3898 }
3899#endif
3900
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003901 return repr;
3902}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003903
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003904PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3905{
3906 if (!PyUnicode_Check(unicode)) {
3907 PyErr_BadArgument();
3908 return NULL;
3909 }
3910 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3911 PyUnicode_GET_SIZE(unicode),
3912 NULL);
3913}
3914
Martin v. Löwisd8251432006-06-14 05:21:04 +00003915#undef NEED_RETRY
3916
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003917#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003918
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919/* --- Character Mapping Codec -------------------------------------------- */
3920
/* Decode size bytes at s through a character mapping.

   mapping == NULL falls back to Latin-1 decoding.  An exact unicode
   object is used as a lookup string indexed by byte value; bytes beyond
   its length, or mapped to 0xfffe, count as undefined.  Any other object
   is indexed with the byte value as an int and may yield an int in
   range(65536), a unicode string of any length, or None (undefined); a
   LookupError from the lookup is treated like None.  Undefined mappings
   are reported to the error handler selected by errors.

   Returns a new unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare code units allocated in v beyond one per remaining input byte */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output code unit per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a unicode string used as a table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* the handler receives &s, &v, &outpos and &p and may
                   update them, so s is not advanced here */
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* generic path: look every byte up through the object protocol */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                /* skip the shared DECREF/++s at the bottom of the loop */
                Py_DECREF(x);
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* grow by the shortfall plus four times the
                           replacement length */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the buffer may have moved: recompute the write
                           pointer from the saved offset */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim the result if fewer code units were produced than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4081
Martin v. Löwis3f767792006-06-04 19:36:28 +00004082/* Charmap encoding: the lookup table */
4083
4084struct encoding_map{
4085 PyObject_HEAD
4086 unsigned char level1[32];
4087 int count2, count3;
4088 unsigned char level23[1];
4089};
4090
4091static PyObject*
4092encoding_map_size(PyObject *obj, PyObject* args)
4093{
4094 struct encoding_map *map = (struct encoding_map*)obj;
4095 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4096 128*map->count3);
4097}
4098
4099static PyMethodDef encoding_map_methods[] = {
4100 {"size", encoding_map_size, METH_NOARGS,
4101 PyDoc_STR("Return the size (in bytes) of this object") },
4102 { 0 }
4103};
4104
4105static void
4106encoding_map_dealloc(PyObject* o)
4107{
4108 PyObject_FREE(o);
4109}
4110
4111static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004112 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004113 "EncodingMap", /*tp_name*/
4114 sizeof(struct encoding_map), /*tp_basicsize*/
4115 0, /*tp_itemsize*/
4116 /* methods */
4117 encoding_map_dealloc, /*tp_dealloc*/
4118 0, /*tp_print*/
4119 0, /*tp_getattr*/
4120 0, /*tp_setattr*/
4121 0, /*tp_compare*/
4122 0, /*tp_repr*/
4123 0, /*tp_as_number*/
4124 0, /*tp_as_sequence*/
4125 0, /*tp_as_mapping*/
4126 0, /*tp_hash*/
4127 0, /*tp_call*/
4128 0, /*tp_str*/
4129 0, /*tp_getattro*/
4130 0, /*tp_setattro*/
4131 0, /*tp_as_buffer*/
4132 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4133 0, /*tp_doc*/
4134 0, /*tp_traverse*/
4135 0, /*tp_clear*/
4136 0, /*tp_richcompare*/
4137 0, /*tp_weaklistoffset*/
4138 0, /*tp_iter*/
4139 0, /*tp_iternext*/
4140 encoding_map_methods, /*tp_methods*/
4141 0, /*tp_members*/
4142 0, /*tp_getset*/
4143 0, /*tp_base*/
4144 0, /*tp_dict*/
4145 0, /*tp_descr_get*/
4146 0, /*tp_descr_set*/
4147 0, /*tp_dictoffset*/
4148 0, /*tp_init*/
4149 0, /*tp_alloc*/
4150 0, /*tp_new*/
4151 0, /*tp_free*/
4152 0, /*tp_is_gc*/
4153};
4154
4155PyObject*
4156PyUnicode_BuildEncodingMap(PyObject* string)
4157{
4158 Py_UNICODE *decode;
4159 PyObject *result;
4160 struct encoding_map *mresult;
4161 int i;
4162 int need_dict = 0;
4163 unsigned char level1[32];
4164 unsigned char level2[512];
4165 unsigned char *mlevel1, *mlevel2, *mlevel3;
4166 int count2 = 0, count3 = 0;
4167
4168 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4169 PyErr_BadArgument();
4170 return NULL;
4171 }
4172 decode = PyUnicode_AS_UNICODE(string);
4173 memset(level1, 0xFF, sizeof level1);
4174 memset(level2, 0xFF, sizeof level2);
4175
4176 /* If there isn't a one-to-one mapping of NULL to \0,
4177 or if there are non-BMP characters, we need to use
4178 a mapping dictionary. */
4179 if (decode[0] != 0)
4180 need_dict = 1;
4181 for (i = 1; i < 256; i++) {
4182 int l1, l2;
4183 if (decode[i] == 0
4184 #ifdef Py_UNICODE_WIDE
4185 || decode[i] > 0xFFFF
4186 #endif
4187 ) {
4188 need_dict = 1;
4189 break;
4190 }
4191 if (decode[i] == 0xFFFE)
4192 /* unmapped character */
4193 continue;
4194 l1 = decode[i] >> 11;
4195 l2 = decode[i] >> 7;
4196 if (level1[l1] == 0xFF)
4197 level1[l1] = count2++;
4198 if (level2[l2] == 0xFF)
4199 level2[l2] = count3++;
4200 }
4201
4202 if (count2 >= 0xFF || count3 >= 0xFF)
4203 need_dict = 1;
4204
4205 if (need_dict) {
4206 PyObject *result = PyDict_New();
4207 PyObject *key, *value;
4208 if (!result)
4209 return NULL;
4210 for (i = 0; i < 256; i++) {
4211 key = value = NULL;
4212 key = PyInt_FromLong(decode[i]);
4213 value = PyInt_FromLong(i);
4214 if (!key || !value)
4215 goto failed1;
4216 if (PyDict_SetItem(result, key, value) == -1)
4217 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004218 Py_DECREF(key);
4219 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004220 }
4221 return result;
4222 failed1:
4223 Py_XDECREF(key);
4224 Py_XDECREF(value);
4225 Py_DECREF(result);
4226 return NULL;
4227 }
4228
4229 /* Create a three-level trie */
4230 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4231 16*count2 + 128*count3 - 1);
4232 if (!result)
4233 return PyErr_NoMemory();
4234 PyObject_Init(result, &EncodingMapType);
4235 mresult = (struct encoding_map*)result;
4236 mresult->count2 = count2;
4237 mresult->count3 = count3;
4238 mlevel1 = mresult->level1;
4239 mlevel2 = mresult->level23;
4240 mlevel3 = mresult->level23 + 16*count2;
4241 memcpy(mlevel1, level1, 32);
4242 memset(mlevel2, 0xFF, 16*count2);
4243 memset(mlevel3, 0, 128*count3);
4244 count3 = 0;
4245 for (i = 1; i < 256; i++) {
4246 int o1, o2, o3, i2, i3;
4247 if (decode[i] == 0xFFFE)
4248 /* unmapped character */
4249 continue;
4250 o1 = decode[i]>>11;
4251 o2 = (decode[i]>>7) & 0xF;
4252 i2 = 16*mlevel1[o1] + o2;
4253 if (mlevel2[i2] == 0xFF)
4254 mlevel2[i2] = count3++;
4255 o3 = decode[i] & 0x7F;
4256 i3 = 128*mlevel2[i2] + o3;
4257 mlevel3[i3] = i;
4258 }
4259 return result;
4260}
4261
4262static int
4263encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4264{
4265 struct encoding_map *map = (struct encoding_map*)mapping;
4266 int l1 = c>>11;
4267 int l2 = (c>>7) & 0xF;
4268 int l3 = c & 0x7F;
4269 int i;
4270
4271#ifdef Py_UNICODE_WIDE
4272 if (c > 0xFFFF) {
4273 return -1;
4274 }
4275#endif
4276 if (c == 0)
4277 return 0;
4278 /* level 1*/
4279 i = map->level1[l1];
4280 if (i == 0xFF) {
4281 return -1;
4282 }
4283 /* level 2*/
4284 i = map->level23[16*i+l2];
4285 if (i == 0xFF) {
4286 return -1;
4287 }
4288 /* level 3 */
4289 i = map->level23[16*map->count2 + 128*i + l3];
4290 if (i == 0) {
4291 return -1;
4292 }
4293 return i;
4294}
4295
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296/* Lookup the character ch in the mapping. If the character
4297 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004298 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 PyObject *w = PyInt_FromLong((long)c);
4302 PyObject *x;
4303
4304 if (w == NULL)
4305 return NULL;
4306 x = PyObject_GetItem(mapping, w);
4307 Py_DECREF(w);
4308 if (x == NULL) {
4309 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4310 /* No mapping found means: mapping is undefined. */
4311 PyErr_Clear();
4312 x = Py_None;
4313 Py_INCREF(x);
4314 return x;
4315 } else
4316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004318 else if (x == Py_None)
4319 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 else if (PyInt_Check(x)) {
4321 long value = PyInt_AS_LONG(x);
4322 if (value < 0 || value > 255) {
4323 PyErr_SetString(PyExc_TypeError,
4324 "character mapping must be in range(256)");
4325 Py_DECREF(x);
4326 return NULL;
4327 }
4328 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004330 else if (PyString_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 /* wrong return value */
4334 PyErr_SetString(PyExc_TypeError,
4335 "character mapping must return integer, None or str");
4336 Py_DECREF(x);
4337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 }
4339}
4340
Martin v. Löwis3f767792006-06-04 19:36:28 +00004341static int
4342charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4343{
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004344 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004345 /* exponentially overallocate to minimize reallocations */
4346 if (requiredsize < 2*outsize)
4347 requiredsize = 2*outsize;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004348 if (_PyString_Resize(outobj, requiredsize)) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004349 return 0;
4350 }
4351 return 1;
4352}
4353
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357/* lookup the character, put the result in the output string and adjust
4358 various state variables. Reallocate the output string if not enough
4359 space is available. Return a new reference to the object that
4360 was put in the output buffer, or Py_None, if the mapping was undefined
4361 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004362 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004364charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004365 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004367 PyObject *rep;
4368 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004369 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370
Christian Heimese93237d2007-12-19 02:37:44 +00004371 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004372 int res = encoding_map_lookup(c, mapping);
4373 Py_ssize_t requiredsize = *outpos+1;
4374 if (res == -1)
4375 return enc_FAILED;
4376 if (outsize<requiredsize)
4377 if (!charmapencode_resize(outobj, outpos, requiredsize))
4378 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004379 outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004380 outstart[(*outpos)++] = (char)res;
4381 return enc_SUCCESS;
4382 }
4383
4384 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004386 return enc_EXCEPTION;
4387 else if (rep==Py_None) {
4388 Py_DECREF(rep);
4389 return enc_FAILED;
4390 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004392 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004393 if (outsize<requiredsize)
4394 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004396 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004398 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4400 }
4401 else {
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004402 const char *repchars = PyString_AS_STRING(rep);
4403 Py_ssize_t repsize = PyString_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004404 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004405 if (outsize<requiredsize)
4406 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004408 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004410 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 memcpy(outstart + *outpos, repchars, repsize);
4412 *outpos += repsize;
4413 }
4414 }
Georg Brandl9f167602006-06-04 21:46:16 +00004415 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004416 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417}
4418
4419/* handle an error in PyUnicode_EncodeCharmap
4420 Return 0 on success, -1 on error */
4421static
4422int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004423 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004425 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427{
4428 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004429 Py_ssize_t repsize;
4430 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 Py_UNICODE *uni2;
4432 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t collstartpos = *inpos;
4434 Py_ssize_t collendpos = *inpos+1;
4435 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 char *encoding = "charmap";
4437 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004438 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 /* find all unencodable characters */
4441 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004442 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004443 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004444 int res = encoding_map_lookup(p[collendpos], mapping);
4445 if (res != -1)
4446 break;
4447 ++collendpos;
4448 continue;
4449 }
4450
4451 rep = charmapencode_lookup(p[collendpos], mapping);
4452 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004454 else if (rep!=Py_None) {
4455 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 break;
4457 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004458 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 ++collendpos;
4460 }
4461 /* cache callback name lookup
4462 * (if not done yet, i.e. it's the first error) */
4463 if (*known_errorHandler==-1) {
4464 if ((errors==NULL) || (!strcmp(errors, "strict")))
4465 *known_errorHandler = 1;
4466 else if (!strcmp(errors, "replace"))
4467 *known_errorHandler = 2;
4468 else if (!strcmp(errors, "ignore"))
4469 *known_errorHandler = 3;
4470 else if (!strcmp(errors, "xmlcharrefreplace"))
4471 *known_errorHandler = 4;
4472 else
4473 *known_errorHandler = 0;
4474 }
4475 switch (*known_errorHandler) {
4476 case 1: /* strict */
4477 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4478 return -1;
4479 case 2: /* replace */
4480 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4481 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004482 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 return -1;
4484 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004485 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4487 return -1;
4488 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 }
4490 /* fall through */
4491 case 3: /* ignore */
4492 *inpos = collendpos;
4493 break;
4494 case 4: /* xmlcharrefreplace */
4495 /* generate replacement (temporarily (mis)uses p) */
4496 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4497 char buffer[2+29+1+1];
4498 char *cp;
4499 sprintf(buffer, "&#%d;", (int)p[collpos]);
4500 for (cp = buffer; *cp; ++cp) {
4501 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004502 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004504 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4506 return -1;
4507 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 }
4509 }
4510 *inpos = collendpos;
4511 break;
4512 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004513 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 encoding, reason, p, size, exceptionObject,
4515 collstartpos, collendpos, &newpos);
4516 if (repunicode == NULL)
4517 return -1;
4518 /* generate replacement */
4519 repsize = PyUnicode_GET_SIZE(repunicode);
4520 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4521 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004522 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 return -1;
4524 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004525 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4528 return -1;
4529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 }
4531 *inpos = newpos;
4532 Py_DECREF(repunicode);
4533 }
4534 return 0;
4535}
4536
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004538 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 PyObject *mapping,
4540 const char *errors)
4541{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 /* output object */
4543 PyObject *res = NULL;
4544 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004545 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004547 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 PyObject *errorHandler = NULL;
4549 PyObject *exc = NULL;
4550 /* the following variable is used for caching string comparisons
4551 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4552 * 3=ignore, 4=xmlcharrefreplace */
4553 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554
4555 /* Default to Latin-1 */
4556 if (mapping == NULL)
4557 return PyUnicode_EncodeLatin1(p, size, errors);
4558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 /* allocate enough for a simple encoding without
4560 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004561 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 if (res == NULL)
4563 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004564 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 while (inpos<size) {
4568 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004569 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4570 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004572 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 if (charmap_encoding_error(p, size, &inpos, mapping,
4574 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004575 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004576 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004577 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 else
4581 /* done with this character => adjust input position */
4582 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004586 if (respos<PyString_GET_SIZE(res)) {
4587 if (_PyString_Resize(&res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 goto onError;
4589 }
4590 Py_XDECREF(exc);
4591 Py_XDECREF(errorHandler);
4592 return res;
4593
4594 onError:
4595 Py_XDECREF(res);
4596 Py_XDECREF(exc);
4597 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 return NULL;
4599}
4600
4601PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4602 PyObject *mapping)
4603{
4604 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4605 PyErr_BadArgument();
4606 return NULL;
4607 }
4608 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4609 PyUnicode_GET_SIZE(unicode),
4610 mapping,
4611 NULL);
4612}
4613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614/* create or adjust a UnicodeTranslateError */
4615static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004616 const Py_UNICODE *unicode, Py_ssize_t size,
4617 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 if (*exceptionObject == NULL) {
4621 *exceptionObject = PyUnicodeTranslateError_Create(
4622 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 }
4624 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4626 goto onError;
4627 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4628 goto onError;
4629 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4630 goto onError;
4631 return;
4632 onError:
4633 Py_DECREF(*exceptionObject);
4634 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 }
4636}
4637
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638/* raises a UnicodeTranslateError */
4639static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004640 const Py_UNICODE *unicode, Py_ssize_t size,
4641 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 const char *reason)
4643{
4644 make_translate_exception(exceptionObject,
4645 unicode, size, startpos, endpos, reason);
4646 if (*exceptionObject != NULL)
4647 PyCodec_StrictErrors(*exceptionObject);
4648}
4649
4650/* error handling callback helper:
4651 build arguments, call the callback and check the arguments,
4652 put the result into newpos and return the replacement string, which
4653 has to be freed by the caller */
4654static PyObject *unicode_translate_call_errorhandler(const char *errors,
4655 PyObject **errorHandler,
4656 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004657 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4658 Py_ssize_t startpos, Py_ssize_t endpos,
4659 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004661 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662
Martin v. Löwis412fb672006-04-13 06:34:32 +00004663 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 PyObject *restuple;
4665 PyObject *resunicode;
4666
4667 if (*errorHandler == NULL) {
4668 *errorHandler = PyCodec_LookupError(errors);
4669 if (*errorHandler == NULL)
4670 return NULL;
4671 }
4672
4673 make_translate_exception(exceptionObject,
4674 unicode, size, startpos, endpos, reason);
4675 if (*exceptionObject == NULL)
4676 return NULL;
4677
4678 restuple = PyObject_CallFunctionObjArgs(
4679 *errorHandler, *exceptionObject, NULL);
4680 if (restuple == NULL)
4681 return NULL;
4682 if (!PyTuple_Check(restuple)) {
4683 PyErr_Format(PyExc_TypeError, &argparse[4]);
4684 Py_DECREF(restuple);
4685 return NULL;
4686 }
4687 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004688 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 Py_DECREF(restuple);
4690 return NULL;
4691 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004692 if (i_newpos<0)
4693 *newpos = size+i_newpos;
4694 else
4695 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004696 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004697 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004698 Py_DECREF(restuple);
4699 return NULL;
4700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 Py_INCREF(resunicode);
4702 Py_DECREF(restuple);
4703 return resunicode;
4704}
4705
4706/* Lookup the character ch in the mapping and put the result in result,
4707 which must be decrefed by the caller.
4708 Return 0 on success, -1 on error */
4709static
4710int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4711{
4712 PyObject *w = PyInt_FromLong((long)c);
4713 PyObject *x;
4714
4715 if (w == NULL)
4716 return -1;
4717 x = PyObject_GetItem(mapping, w);
4718 Py_DECREF(w);
4719 if (x == NULL) {
4720 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4721 /* No mapping found means: use 1:1 mapping. */
4722 PyErr_Clear();
4723 *result = NULL;
4724 return 0;
4725 } else
4726 return -1;
4727 }
4728 else if (x == Py_None) {
4729 *result = x;
4730 return 0;
4731 }
4732 else if (PyInt_Check(x)) {
4733 long value = PyInt_AS_LONG(x);
4734 long max = PyUnicode_GetMax();
4735 if (value < 0 || value > max) {
4736 PyErr_Format(PyExc_TypeError,
4737 "character mapping must be in range(0x%lx)", max+1);
4738 Py_DECREF(x);
4739 return -1;
4740 }
4741 *result = x;
4742 return 0;
4743 }
4744 else if (PyUnicode_Check(x)) {
4745 *result = x;
4746 return 0;
4747 }
4748 else {
4749 /* wrong return value */
4750 PyErr_SetString(PyExc_TypeError,
4751 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004752 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 return -1;
4754 }
4755}
4756/* ensure that *outobj is at least requiredsize characters long,
4757if not reallocate and adjust various state variables.
4758Return 0 on success, -1 on error */
4759static
Walter Dörwald4894c302003-10-24 14:25:28 +00004760int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004761 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004763 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004764 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004768 if (requiredsize < 2 * oldsize)
4769 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004770 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 return -1;
4772 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 }
4774 return 0;
4775}
4776/* lookup the character, put the result in the output string and adjust
4777 various state variables. Return a new reference to the object that
4778 was put in the output buffer in *result, or Py_None, if the mapping was
4779 undefined (in which case no character was written).
4780 The called must decref result.
4781 Return 0 on success, -1 on error. */
4782static
Walter Dörwald4894c302003-10-24 14:25:28 +00004783int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004784 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004785 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786{
Walter Dörwald4894c302003-10-24 14:25:28 +00004787 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788 return -1;
4789 if (*res==NULL) {
4790 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004791 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 }
4793 else if (*res==Py_None)
4794 ;
4795 else if (PyInt_Check(*res)) {
4796 /* no overflow check, because we know that the space is enough */
4797 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4798 }
4799 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004800 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 if (repsize==1) {
4802 /* no overflow check, because we know that the space is enough */
4803 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4804 }
4805 else if (repsize!=0) {
4806 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004807 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004808 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004809 repsize - 1;
4810 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 return -1;
4812 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4813 *outp += repsize;
4814 }
4815 }
4816 else
4817 return -1;
4818 return 0;
4819}
4820
4821PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004822 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 PyObject *mapping,
4824 const char *errors)
4825{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 /* output object */
4827 PyObject *res = NULL;
4828 /* pointers to the beginning and end+1 of input */
4829 const Py_UNICODE *startp = p;
4830 const Py_UNICODE *endp = p + size;
4831 /* pointer into the output */
4832 Py_UNICODE *str;
4833 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004834 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 char *reason = "character maps to <undefined>";
4836 PyObject *errorHandler = NULL;
4837 PyObject *exc = NULL;
4838 /* the following variable is used for caching string comparisons
4839 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4840 * 3=ignore, 4=xmlcharrefreplace */
4841 int known_errorHandler = -1;
4842
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 if (mapping == NULL) {
4844 PyErr_BadArgument();
4845 return NULL;
4846 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847
4848 /* allocate enough for a simple 1:1 translation without
4849 replacements, if we need more, we'll resize */
4850 res = PyUnicode_FromUnicode(NULL, size);
4851 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004852 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004854 return res;
4855 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 while (p<endp) {
4858 /* try to encode it */
4859 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004860 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 goto onError;
4863 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004864 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 if (x!=Py_None) /* it worked => adjust input pointer */
4866 ++p;
4867 else { /* untranslatable character */
4868 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004869 Py_ssize_t repsize;
4870 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 Py_UNICODE *uni2;
4872 /* startpos for collecting untranslatable chars */
4873 const Py_UNICODE *collstart = p;
4874 const Py_UNICODE *collend = p+1;
4875 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 /* find all untranslatable characters */
4878 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004879 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 goto onError;
4881 Py_XDECREF(x);
4882 if (x!=Py_None)
4883 break;
4884 ++collend;
4885 }
4886 /* cache callback name lookup
4887 * (if not done yet, i.e. it's the first error) */
4888 if (known_errorHandler==-1) {
4889 if ((errors==NULL) || (!strcmp(errors, "strict")))
4890 known_errorHandler = 1;
4891 else if (!strcmp(errors, "replace"))
4892 known_errorHandler = 2;
4893 else if (!strcmp(errors, "ignore"))
4894 known_errorHandler = 3;
4895 else if (!strcmp(errors, "xmlcharrefreplace"))
4896 known_errorHandler = 4;
4897 else
4898 known_errorHandler = 0;
4899 }
4900 switch (known_errorHandler) {
4901 case 1: /* strict */
4902 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4903 goto onError;
4904 case 2: /* replace */
4905 /* No need to check for space, this is a 1:1 replacement */
4906 for (coll = collstart; coll<collend; ++coll)
4907 *str++ = '?';
4908 /* fall through */
4909 case 3: /* ignore */
4910 p = collend;
4911 break;
4912 case 4: /* xmlcharrefreplace */
4913 /* generate replacement (temporarily (mis)uses p) */
4914 for (p = collstart; p < collend; ++p) {
4915 char buffer[2+29+1+1];
4916 char *cp;
4917 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004918 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4920 goto onError;
4921 for (cp = buffer; *cp; ++cp)
4922 *str++ = *cp;
4923 }
4924 p = collend;
4925 break;
4926 default:
4927 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4928 reason, startp, size, &exc,
4929 collstart-startp, collend-startp, &newpos);
4930 if (repunicode == NULL)
4931 goto onError;
4932 /* generate replacement */
4933 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004934 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004935 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4936 Py_DECREF(repunicode);
4937 goto onError;
4938 }
4939 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4940 *str++ = *uni2;
4941 p = startp + newpos;
4942 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 }
4944 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946 /* Resize if we allocated to much */
4947 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004948 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004949 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004950 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951 }
4952 Py_XDECREF(exc);
4953 Py_XDECREF(errorHandler);
4954 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956 onError:
4957 Py_XDECREF(res);
4958 Py_XDECREF(exc);
4959 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 return NULL;
4961}
4962
4963PyObject *PyUnicode_Translate(PyObject *str,
4964 PyObject *mapping,
4965 const char *errors)
4966{
4967 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004968
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 str = PyUnicode_FromObject(str);
4970 if (str == NULL)
4971 goto onError;
4972 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4973 PyUnicode_GET_SIZE(str),
4974 mapping,
4975 errors);
4976 Py_DECREF(str);
4977 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004978
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 onError:
4980 Py_XDECREF(str);
4981 return NULL;
4982}
Tim Petersced69f82003-09-16 20:30:58 +00004983
Guido van Rossum9e896b32000-04-05 20:11:21 +00004984/* --- Decimal Encoder ---------------------------------------------------- */
4985
4986int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004987 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004988 char *output,
4989 const char *errors)
4990{
4991 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 PyObject *errorHandler = NULL;
4993 PyObject *exc = NULL;
4994 const char *encoding = "decimal";
4995 const char *reason = "invalid decimal Unicode string";
4996 /* the following variable is used for caching string comparisons
4997 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4998 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004999
5000 if (output == NULL) {
5001 PyErr_BadArgument();
5002 return -1;
5003 }
5004
5005 p = s;
5006 end = s + length;
5007 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005009 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005011 Py_ssize_t repsize;
5012 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 Py_UNICODE *uni2;
5014 Py_UNICODE *collstart;
5015 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005016
Guido van Rossum9e896b32000-04-05 20:11:21 +00005017 if (Py_UNICODE_ISSPACE(ch)) {
5018 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005020 continue;
5021 }
5022 decimal = Py_UNICODE_TODECIMAL(ch);
5023 if (decimal >= 0) {
5024 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005026 continue;
5027 }
Guido van Rossumba477042000-04-06 18:18:10 +00005028 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005029 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005030 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005031 continue;
5032 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 /* All other characters are considered unencodable */
5034 collstart = p;
5035 collend = p+1;
5036 while (collend < end) {
5037 if ((0 < *collend && *collend < 256) ||
5038 !Py_UNICODE_ISSPACE(*collend) ||
5039 Py_UNICODE_TODECIMAL(*collend))
5040 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005041 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042 /* cache callback name lookup
5043 * (if not done yet, i.e. it's the first error) */
5044 if (known_errorHandler==-1) {
5045 if ((errors==NULL) || (!strcmp(errors, "strict")))
5046 known_errorHandler = 1;
5047 else if (!strcmp(errors, "replace"))
5048 known_errorHandler = 2;
5049 else if (!strcmp(errors, "ignore"))
5050 known_errorHandler = 3;
5051 else if (!strcmp(errors, "xmlcharrefreplace"))
5052 known_errorHandler = 4;
5053 else
5054 known_errorHandler = 0;
5055 }
5056 switch (known_errorHandler) {
5057 case 1: /* strict */
5058 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5059 goto onError;
5060 case 2: /* replace */
5061 for (p = collstart; p < collend; ++p)
5062 *output++ = '?';
5063 /* fall through */
5064 case 3: /* ignore */
5065 p = collend;
5066 break;
5067 case 4: /* xmlcharrefreplace */
5068 /* generate replacement (temporarily (mis)uses p) */
5069 for (p = collstart; p < collend; ++p)
5070 output += sprintf(output, "&#%d;", (int)*p);
5071 p = collend;
5072 break;
5073 default:
5074 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5075 encoding, reason, s, length, &exc,
5076 collstart-s, collend-s, &newpos);
5077 if (repunicode == NULL)
5078 goto onError;
5079 /* generate replacement */
5080 repsize = PyUnicode_GET_SIZE(repunicode);
5081 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5082 Py_UNICODE ch = *uni2;
5083 if (Py_UNICODE_ISSPACE(ch))
5084 *output++ = ' ';
5085 else {
5086 decimal = Py_UNICODE_TODECIMAL(ch);
5087 if (decimal >= 0)
5088 *output++ = '0' + decimal;
5089 else if (0 < ch && ch < 256)
5090 *output++ = (char)ch;
5091 else {
5092 Py_DECREF(repunicode);
5093 raise_encode_exception(&exc, encoding,
5094 s, length, collstart-s, collend-s, reason);
5095 goto onError;
5096 }
5097 }
5098 }
5099 p = s + newpos;
5100 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005101 }
5102 }
5103 /* 0-terminate the output string */
5104 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105 Py_XDECREF(exc);
5106 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005107 return 0;
5108
5109 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110 Py_XDECREF(exc);
5111 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005112 return -1;
5113}
5114
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115/* --- Helpers ------------------------------------------------------------ */
5116
Eric Smitha9f7d622008-02-17 19:46:49 +00005117#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005118
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005119#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005120
Fredrik Lundha50d2012006-05-26 17:04:58 +00005121#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005122
5123#include "stringlib/count.h"
5124#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005125#include "stringlib/partition.h"
5126
/* Helper macro to fix up start/end slice values against (obj)->length.

   Operates on the variables `start` and `end` of the calling function:
   a negative value has the length added once and is then clamped to 0;
   an `end` beyond the length is clamped to the length.  `start` is not
   clamped to the length, so callers still have to cope with start > end.

   Wrapped in do { } while (0) so that the expansion is one statement and
   can be used safely as the body of an unbraced if/else.  Invoke it with
   a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5139
Martin v. Löwis18e16552006-02-15 17:27:45 +00005140Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005141 PyObject *substr,
5142 Py_ssize_t start,
5143 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005145 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005146 PyUnicodeObject* str_obj;
5147 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005148
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005149 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5150 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005152 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5153 if (!sub_obj) {
5154 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 return -1;
5156 }
Tim Petersced69f82003-09-16 20:30:58 +00005157
Fredrik Lundhc8162812006-05-26 19:33:03 +00005158 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005159
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005160 result = stringlib_count(
5161 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5162 );
5163
5164 Py_DECREF(sub_obj);
5165 Py_DECREF(str_obj);
5166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 return result;
5168}
5169
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005171 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005172 Py_ssize_t start,
5173 Py_ssize_t end,
5174 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005176 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005177
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005178 str = PyUnicode_FromObject(str);
5179 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005180 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005181 sub = PyUnicode_FromObject(sub);
5182 if (!sub) {
5183 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005184 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 }
Tim Petersced69f82003-09-16 20:30:58 +00005186
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005187 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005188 result = stringlib_find_slice(
5189 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5190 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5191 start, end
5192 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005193 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005194 result = stringlib_rfind_slice(
5195 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5196 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5197 start, end
5198 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005199
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005200 Py_DECREF(str);
5201 Py_DECREF(sub);
5202
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 return result;
5204}
5205
Tim Petersced69f82003-09-16 20:30:58 +00005206static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207int tailmatch(PyUnicodeObject *self,
5208 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005209 Py_ssize_t start,
5210 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 int direction)
5212{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 if (substring->length == 0)
5214 return 1;
5215
Fredrik Lundhc8162812006-05-26 19:33:03 +00005216 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217
5218 end -= substring->length;
5219 if (end < start)
5220 return 0;
5221
5222 if (direction > 0) {
5223 if (Py_UNICODE_MATCH(self, end, substring))
5224 return 1;
5225 } else {
5226 if (Py_UNICODE_MATCH(self, start, substring))
5227 return 1;
5228 }
5229
5230 return 0;
5231}
5232
Martin v. Löwis18e16552006-02-15 17:27:45 +00005233Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t start,
5236 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 int direction)
5238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005240
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 str = PyUnicode_FromObject(str);
5242 if (str == NULL)
5243 return -1;
5244 substr = PyUnicode_FromObject(substr);
5245 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005246 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 return -1;
5248 }
Tim Petersced69f82003-09-16 20:30:58 +00005249
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 result = tailmatch((PyUnicodeObject *)str,
5251 (PyUnicodeObject *)substr,
5252 start, end, direction);
5253 Py_DECREF(str);
5254 Py_DECREF(substr);
5255 return result;
5256}
5257
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258/* Apply fixfct filter to the Unicode object self and return a
5259 reference to the modified object */
5260
Tim Petersced69f82003-09-16 20:30:58 +00005261static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262PyObject *fixup(PyUnicodeObject *self,
5263 int (*fixfct)(PyUnicodeObject *s))
5264{
5265
5266 PyUnicodeObject *u;
5267
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005268 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 if (u == NULL)
5270 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005271
5272 Py_UNICODE_COPY(u->str, self->str, self->length);
5273
Tim Peters7a29bd52001-09-12 03:03:31 +00005274 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 /* fixfct should return TRUE if it modified the buffer. If
5276 FALSE, return a reference to the original buffer instead
5277 (to save space, not time) */
5278 Py_INCREF(self);
5279 Py_DECREF(u);
5280 return (PyObject*) self;
5281 }
5282 return (PyObject*) u;
5283}
5284
Tim Petersced69f82003-09-16 20:30:58 +00005285static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286int fixupper(PyUnicodeObject *self)
5287{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005288 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 Py_UNICODE *s = self->str;
5290 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 while (len-- > 0) {
5293 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005294
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 ch = Py_UNICODE_TOUPPER(*s);
5296 if (ch != *s) {
5297 status = 1;
5298 *s = ch;
5299 }
5300 s++;
5301 }
5302
5303 return status;
5304}
5305
Tim Petersced69f82003-09-16 20:30:58 +00005306static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307int fixlower(PyUnicodeObject *self)
5308{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005309 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 Py_UNICODE *s = self->str;
5311 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 while (len-- > 0) {
5314 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 ch = Py_UNICODE_TOLOWER(*s);
5317 if (ch != *s) {
5318 status = 1;
5319 *s = ch;
5320 }
5321 s++;
5322 }
5323
5324 return status;
5325}
5326
Tim Petersced69f82003-09-16 20:30:58 +00005327static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328int fixswapcase(PyUnicodeObject *self)
5329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 Py_UNICODE *s = self->str;
5332 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005333
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 while (len-- > 0) {
5335 if (Py_UNICODE_ISUPPER(*s)) {
5336 *s = Py_UNICODE_TOLOWER(*s);
5337 status = 1;
5338 } else if (Py_UNICODE_ISLOWER(*s)) {
5339 *s = Py_UNICODE_TOUPPER(*s);
5340 status = 1;
5341 }
5342 s++;
5343 }
5344
5345 return status;
5346}
5347
Tim Petersced69f82003-09-16 20:30:58 +00005348static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349int fixcapitalize(PyUnicodeObject *self)
5350{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005351 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005352 Py_UNICODE *s = self->str;
5353 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005354
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005355 if (len == 0)
5356 return 0;
5357 if (Py_UNICODE_ISLOWER(*s)) {
5358 *s = Py_UNICODE_TOUPPER(*s);
5359 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005361 s++;
5362 while (--len > 0) {
5363 if (Py_UNICODE_ISUPPER(*s)) {
5364 *s = Py_UNICODE_TOLOWER(*s);
5365 status = 1;
5366 }
5367 s++;
5368 }
5369 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370}
5371
5372static
5373int fixtitle(PyUnicodeObject *self)
5374{
5375 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5376 register Py_UNICODE *e;
5377 int previous_is_cased;
5378
5379 /* Shortcut for single character strings */
5380 if (PyUnicode_GET_SIZE(self) == 1) {
5381 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5382 if (*p != ch) {
5383 *p = ch;
5384 return 1;
5385 }
5386 else
5387 return 0;
5388 }
Tim Petersced69f82003-09-16 20:30:58 +00005389
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 e = p + PyUnicode_GET_SIZE(self);
5391 previous_is_cased = 0;
5392 for (; p < e; p++) {
5393 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 if (previous_is_cased)
5396 *p = Py_UNICODE_TOLOWER(ch);
5397 else
5398 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005399
5400 if (Py_UNICODE_ISLOWER(ch) ||
5401 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 Py_UNICODE_ISTITLE(ch))
5403 previous_is_cased = 1;
5404 else
5405 previous_is_cased = 0;
5406 }
5407 return 1;
5408}
5409
Tim Peters8ce9f162004-08-27 01:49:32 +00005410PyObject *
5411PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412{
Tim Peters8ce9f162004-08-27 01:49:32 +00005413 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005414 const Py_UNICODE blank = ' ';
5415 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005416 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005417 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005418 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5419 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005420 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5421 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005422 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005423 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005424 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425
Tim Peters05eba1f2004-08-27 21:32:02 +00005426 fseq = PySequence_Fast(seq, "");
5427 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005428 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005429 }
5430
Tim Peters91879ab2004-08-27 22:35:44 +00005431 /* Grrrr. A codec may be invoked to convert str objects to
5432 * Unicode, and so it's possible to call back into Python code
5433 * during PyUnicode_FromObject(), and so it's possible for a sick
5434 * codec to change the size of fseq (if seq is a list). Therefore
5435 * we have to keep refetching the size -- can't assume seqlen
5436 * is invariant.
5437 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005438 seqlen = PySequence_Fast_GET_SIZE(fseq);
5439 /* If empty sequence, return u"". */
5440 if (seqlen == 0) {
5441 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5442 goto Done;
5443 }
5444 /* If singleton sequence with an exact Unicode, return that. */
5445 if (seqlen == 1) {
5446 item = PySequence_Fast_GET_ITEM(fseq, 0);
5447 if (PyUnicode_CheckExact(item)) {
5448 Py_INCREF(item);
5449 res = (PyUnicodeObject *)item;
5450 goto Done;
5451 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005452 }
5453
Tim Peters05eba1f2004-08-27 21:32:02 +00005454 /* At least two items to join, or one that isn't exact Unicode. */
5455 if (seqlen > 1) {
5456 /* Set up sep and seplen -- they're needed. */
5457 if (separator == NULL) {
5458 sep = &blank;
5459 seplen = 1;
5460 }
5461 else {
5462 internal_separator = PyUnicode_FromObject(separator);
5463 if (internal_separator == NULL)
5464 goto onError;
5465 sep = PyUnicode_AS_UNICODE(internal_separator);
5466 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005467 /* In case PyUnicode_FromObject() mutated seq. */
5468 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005469 }
5470 }
5471
5472 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005473 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005474 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005475 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005476 res_p = PyUnicode_AS_UNICODE(res);
5477 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005478
Tim Peters05eba1f2004-08-27 21:32:02 +00005479 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005480 Py_ssize_t itemlen;
5481 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005482
5483 item = PySequence_Fast_GET_ITEM(fseq, i);
5484 /* Convert item to Unicode. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00005485 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005486 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005487 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005488 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005489 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005490 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005491 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005492 item = PyUnicode_FromObject(item);
5493 if (item == NULL)
5494 goto onError;
5495 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005496
Tim Peters91879ab2004-08-27 22:35:44 +00005497 /* In case PyUnicode_FromObject() mutated seq. */
5498 seqlen = PySequence_Fast_GET_SIZE(fseq);
5499
Tim Peters8ce9f162004-08-27 01:49:32 +00005500 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005502 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005503 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005504 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 if (i < seqlen - 1) {
5506 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005507 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 goto Overflow;
5509 }
5510 if (new_res_used > res_alloc) {
5511 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005512 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005513 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005514 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005515 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005517 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005518 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005520 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005521 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005523
5524 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005525 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005526 res_p += itemlen;
5527 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005528 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005529 res_p += seplen;
5530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005532 res_used = new_res_used;
5533 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005534
Tim Peters05eba1f2004-08-27 21:32:02 +00005535 /* Shrink res to match the used area; this probably can't fail,
5536 * but it's cheap to check.
5537 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005538 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005539 goto onError;
5540
5541 Done:
5542 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005543 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 return (PyObject *)res;
5545
Tim Peters8ce9f162004-08-27 01:49:32 +00005546 Overflow:
5547 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005548 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005549 Py_DECREF(item);
5550 /* fall through */
5551
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005553 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005554 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005555 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 return NULL;
5557}
5558
Tim Petersced69f82003-09-16 20:30:58 +00005559static
5560PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t left,
5562 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 Py_UNICODE fill)
5564{
5565 PyUnicodeObject *u;
5566
5567 if (left < 0)
5568 left = 0;
5569 if (right < 0)
5570 right = 0;
5571
Tim Peters7a29bd52001-09-12 03:03:31 +00005572 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 Py_INCREF(self);
5574 return self;
5575 }
5576
5577 u = _PyUnicode_New(left + self->length + right);
5578 if (u) {
5579 if (left)
5580 Py_UNICODE_FILL(u->str, fill, left);
5581 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5582 if (right)
5583 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5584 }
5585
5586 return u;
5587}
5588
/* Append the slice data[left:right] as a new Unicode object to `list`.

   Relies on the calling function providing a `PyObject *str` scratch
   variable, the `list` being filled, and an `onError` label that is
   jumped to when creating or appending the slice fails.

   Wrapped in do { } while (0) so that the expansion is one statement
   (no dangling else); invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* our own reference to str is no longer needed */              \
        Py_DECREF(str);                                                 \
    } while (0)
5599
5600static
5601PyObject *split_whitespace(PyUnicodeObject *self,
5602 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005603 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005605 register Py_ssize_t i;
5606 register Py_ssize_t j;
5607 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005609 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
5611 for (i = j = 0; i < len; ) {
5612 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005613 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 i++;
5615 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005616 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 i++;
5618 if (j < i) {
5619 if (maxcount-- <= 0)
5620 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005621 SPLIT_APPEND(buf, j, i);
5622 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 i++;
5624 j = i;
5625 }
5626 }
5627 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005628 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 }
5630 return list;
5631
5632 onError:
5633 Py_DECREF(list);
5634 return NULL;
5635}
5636
5637PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005638 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 register Py_ssize_t i;
5641 register Py_ssize_t j;
5642 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 PyObject *list;
5644 PyObject *str;
5645 Py_UNICODE *data;
5646
5647 string = PyUnicode_FromObject(string);
5648 if (string == NULL)
5649 return NULL;
5650 data = PyUnicode_AS_UNICODE(string);
5651 len = PyUnicode_GET_SIZE(string);
5652
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 list = PyList_New(0);
5654 if (!list)
5655 goto onError;
5656
5657 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005658 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005659
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005661 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663
5664 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005665 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 if (i < len) {
5667 if (data[i] == '\r' && i + 1 < len &&
5668 data[i+1] == '\n')
5669 i += 2;
5670 else
5671 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005672 if (keepends)
5673 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 }
Guido van Rossum86662912000-04-11 15:38:46 +00005675 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 j = i;
5677 }
5678 if (j < len) {
5679 SPLIT_APPEND(data, j, len);
5680 }
5681
5682 Py_DECREF(string);
5683 return list;
5684
5685 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005686 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 Py_DECREF(string);
5688 return NULL;
5689}
5690
Tim Petersced69f82003-09-16 20:30:58 +00005691static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692PyObject *split_char(PyUnicodeObject *self,
5693 PyObject *list,
5694 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005695 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005697 register Py_ssize_t i;
5698 register Py_ssize_t j;
5699 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005701 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702
5703 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005704 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 if (maxcount-- <= 0)
5706 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005707 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 i = j = i + 1;
5709 } else
5710 i++;
5711 }
5712 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005713 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 }
5715 return list;
5716
5717 onError:
5718 Py_DECREF(list);
5719 return NULL;
5720}
5721
Tim Petersced69f82003-09-16 20:30:58 +00005722static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723PyObject *split_substring(PyUnicodeObject *self,
5724 PyObject *list,
5725 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005726 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005728 register Py_ssize_t i;
5729 register Py_ssize_t j;
5730 Py_ssize_t len = self->length;
5731 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 PyObject *str;
5733
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005734 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 if (Py_UNICODE_MATCH(self, i, substring)) {
5736 if (maxcount-- <= 0)
5737 break;
5738 SPLIT_APPEND(self->str, j, i);
5739 i = j = i + sublen;
5740 } else
5741 i++;
5742 }
5743 if (j <= len) {
5744 SPLIT_APPEND(self->str, j, len);
5745 }
5746 return list;
5747
5748 onError:
5749 Py_DECREF(list);
5750 return NULL;
5751}
5752
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005753static
5754PyObject *rsplit_whitespace(PyUnicodeObject *self,
5755 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005756 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005757{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005758 register Py_ssize_t i;
5759 register Py_ssize_t j;
5760 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005761 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005762 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005763
5764 for (i = j = len - 1; i >= 0; ) {
5765 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005766 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767 i--;
5768 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005769 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005770 i--;
5771 if (j > i) {
5772 if (maxcount-- <= 0)
5773 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005774 SPLIT_APPEND(buf, i + 1, j + 1);
5775 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005776 i--;
5777 j = i;
5778 }
5779 }
5780 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005781 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005783 if (PyList_Reverse(list) < 0)
5784 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785 return list;
5786
5787 onError:
5788 Py_DECREF(list);
5789 return NULL;
5790}
5791
5792static
5793PyObject *rsplit_char(PyUnicodeObject *self,
5794 PyObject *list,
5795 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005796 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 register Py_ssize_t i;
5799 register Py_ssize_t j;
5800 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005801 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005802 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005803
5804 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005805 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005806 if (maxcount-- <= 0)
5807 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005808 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005809 j = i = i - 1;
5810 } else
5811 i--;
5812 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005813 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005814 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005815 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005816 if (PyList_Reverse(list) < 0)
5817 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818 return list;
5819
5820 onError:
5821 Py_DECREF(list);
5822 return NULL;
5823}
5824
5825static
5826PyObject *rsplit_substring(PyUnicodeObject *self,
5827 PyObject *list,
5828 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005829 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005830{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 register Py_ssize_t i;
5832 register Py_ssize_t j;
5833 Py_ssize_t len = self->length;
5834 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005835 PyObject *str;
5836
5837 for (i = len - sublen, j = len; i >= 0; ) {
5838 if (Py_UNICODE_MATCH(self, i, substring)) {
5839 if (maxcount-- <= 0)
5840 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005841 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005842 j = i;
5843 i -= sublen;
5844 } else
5845 i--;
5846 }
5847 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005848 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005849 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005850 if (PyList_Reverse(list) < 0)
5851 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852 return list;
5853
5854 onError:
5855 Py_DECREF(list);
5856 return NULL;
5857}
5858
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859#undef SPLIT_APPEND
5860
5861static
5862PyObject *split(PyUnicodeObject *self,
5863 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
5866 PyObject *list;
5867
5868 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005869 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
5871 list = PyList_New(0);
5872 if (!list)
5873 return NULL;
5874
5875 if (substring == NULL)
5876 return split_whitespace(self,list,maxcount);
5877
5878 else if (substring->length == 1)
5879 return split_char(self,list,substring->str[0],maxcount);
5880
5881 else if (substring->length == 0) {
5882 Py_DECREF(list);
5883 PyErr_SetString(PyExc_ValueError, "empty separator");
5884 return NULL;
5885 }
5886 else
5887 return split_substring(self,list,substring,maxcount);
5888}
5889
Tim Petersced69f82003-09-16 20:30:58 +00005890static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005891PyObject *rsplit(PyUnicodeObject *self,
5892 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005893 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005894{
5895 PyObject *list;
5896
5897 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005898 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005899
5900 list = PyList_New(0);
5901 if (!list)
5902 return NULL;
5903
5904 if (substring == NULL)
5905 return rsplit_whitespace(self,list,maxcount);
5906
5907 else if (substring->length == 1)
5908 return rsplit_char(self,list,substring->str[0],maxcount);
5909
5910 else if (substring->length == 0) {
5911 Py_DECREF(list);
5912 PyErr_SetString(PyExc_ValueError, "empty separator");
5913 return NULL;
5914 }
5915 else
5916 return rsplit_substring(self,list,substring,maxcount);
5917}
5918
5919static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920PyObject *replace(PyUnicodeObject *self,
5921 PyUnicodeObject *str1,
5922 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
5925 PyUnicodeObject *u;
5926
5927 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005928 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929
Fredrik Lundh347ee272006-05-24 16:35:18 +00005930 if (str1->length == str2->length) {
5931 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005932 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005933 if (str1->length == 1) {
5934 /* replace characters */
5935 Py_UNICODE u1, u2;
5936 if (!findchar(self->str, self->length, str1->str[0]))
5937 goto nothing;
5938 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5939 if (!u)
5940 return NULL;
5941 Py_UNICODE_COPY(u->str, self->str, self->length);
5942 u1 = str1->str[0];
5943 u2 = str2->str[0];
5944 for (i = 0; i < u->length; i++)
5945 if (u->str[i] == u1) {
5946 if (--maxcount < 0)
5947 break;
5948 u->str[i] = u2;
5949 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005951 i = fastsearch(
5952 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005954 if (i < 0)
5955 goto nothing;
5956 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5957 if (!u)
5958 return NULL;
5959 Py_UNICODE_COPY(u->str, self->str, self->length);
5960 while (i <= self->length - str1->length)
5961 if (Py_UNICODE_MATCH(self, i, str1)) {
5962 if (--maxcount < 0)
5963 break;
5964 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5965 i += str1->length;
5966 } else
5967 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005970
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005971 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005972 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 Py_UNICODE *p;
5974
5975 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005976 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 if (n > maxcount)
5978 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005979 if (n == 0)
5980 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005981 /* new_size = self->length + n * (str2->length - str1->length)); */
5982 delta = (str2->length - str1->length);
5983 if (delta == 0) {
5984 new_size = self->length;
5985 } else {
5986 product = n * (str2->length - str1->length);
5987 if ((product / (str2->length - str1->length)) != n) {
5988 PyErr_SetString(PyExc_OverflowError,
5989 "replace string is too long");
5990 return NULL;
5991 }
5992 new_size = self->length + product;
5993 if (new_size < 0) {
5994 PyErr_SetString(PyExc_OverflowError,
5995 "replace string is too long");
5996 return NULL;
5997 }
5998 }
5999 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006000 if (!u)
6001 return NULL;
6002 i = 0;
6003 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006004 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006005 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006006 while (n-- > 0) {
6007 /* look for next match */
6008 j = i;
6009 while (j <= e) {
6010 if (Py_UNICODE_MATCH(self, j, str1))
6011 break;
6012 j++;
6013 }
6014 if (j > i) {
6015 if (j > e)
6016 break;
6017 /* copy unchanged part [i:j] */
6018 Py_UNICODE_COPY(p, self->str+i, j-i);
6019 p += j - i;
6020 }
6021 /* copy substitution string */
6022 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006023 Py_UNICODE_COPY(p, str2->str, str2->length);
6024 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006025 }
6026 i = j + str1->length;
6027 }
6028 if (i < self->length)
6029 /* copy tail [i:] */
6030 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006031 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006032 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006033 while (n > 0) {
6034 Py_UNICODE_COPY(p, str2->str, str2->length);
6035 p += str2->length;
6036 if (--n <= 0)
6037 break;
6038 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006040 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 }
6042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006044
6045nothing:
6046 /* nothing to replace; return original string (when possible) */
6047 if (PyUnicode_CheckExact(self)) {
6048 Py_INCREF(self);
6049 return (PyObject *) self;
6050 }
6051 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052}
6053
6054/* --- Unicode Object Methods --------------------------------------------- */
6055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006056PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057"S.title() -> unicode\n\
6058\n\
6059Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006060characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
6062static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006063unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 return fixup(self, fixtitle);
6066}
6067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069"S.capitalize() -> unicode\n\
6070\n\
6071Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006072have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
6074static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006075unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 return fixup(self, fixcapitalize);
6078}
6079
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code.  Splits self on whitespace, replaces every word in the
   list by its fixcapitalize'd version and joins the words again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6117
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006118/* Argument converter. Coerces to a single unicode character */
6119
6120static int
6121convert_uc(PyObject *obj, void *addr)
6122{
6123 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6124 PyObject *uniobj;
6125 Py_UNICODE *unistr;
6126
6127 uniobj = PyUnicode_FromObject(obj);
6128 if (uniobj == NULL) {
6129 PyErr_SetString(PyExc_TypeError,
6130 "The fill character cannot be converted to Unicode");
6131 return 0;
6132 }
6133 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6134 PyErr_SetString(PyExc_TypeError,
6135 "The fill character must be exactly one character long");
6136 Py_DECREF(uniobj);
6137 return 0;
6138 }
6139 unistr = PyUnicode_AS_UNICODE(uniobj);
6140 *fillcharloc = unistr[0];
6141 Py_DECREF(uniobj);
6142 return 1;
6143}
6144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006145PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006146"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006148Return S centered in a Unicode string of length width. Padding is\n\
6149done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
6151static PyObject *
6152unicode_center(PyUnicodeObject *self, PyObject *args)
6153{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006154 Py_ssize_t marg, left;
6155 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006156 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Thomas Woutersde017742006-02-16 19:34:37 +00006158 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 return NULL;
6160
Tim Peters7a29bd52001-09-12 03:03:31 +00006161 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 Py_INCREF(self);
6163 return (PyObject*) self;
6164 }
6165
6166 marg = width - self->length;
6167 left = marg / 2 + (marg & width & 1);
6168
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006169 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170}
6171
Marc-André Lemburge5034372000-08-08 08:04:29 +00006172#if 0
6173
6174/* This code should go into some future Unicode collation support
6175 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006176 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006177
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006178/* speedy UTF-16 code point order comparison */
6179/* gleaned from: */
6180/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6181
/* Per-block adjustment added to a code unit by the (disabled) UTF-16
   code point order comparison below; indexed by (code unit >> 11). */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 26 */ 0, 0, 0,
    /* 27       */ 0x2000,
    /* 28 .. 31 */ -0x800, -0x800, -0x800, -0x800
};
6189
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190static int
6191unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6192{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006193 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006194
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 Py_UNICODE *s1 = str1->str;
6196 Py_UNICODE *s2 = str2->str;
6197
6198 len1 = str1->length;
6199 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006200
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006202 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006203
6204 c1 = *s1++;
6205 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006206
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006207 if (c1 > (1<<11) * 26)
6208 c1 += utf16Fixup[c1>>11];
6209 if (c2 > (1<<11) * 26)
6210 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006211 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006212
6213 if (c1 != c2)
6214 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006215
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006216 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 }
6218
6219 return (len1 < len2) ? -1 : (len1 != len2);
6220}
6221
Marc-André Lemburge5034372000-08-08 08:04:29 +00006222#else
6223
6224static int
6225unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6226{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006227 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006228
6229 Py_UNICODE *s1 = str1->str;
6230 Py_UNICODE *s2 = str2->str;
6231
6232 len1 = str1->length;
6233 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006234
Marc-André Lemburge5034372000-08-08 08:04:29 +00006235 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006236 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006237
Fredrik Lundh45714e92001-06-26 16:39:36 +00006238 c1 = *s1++;
6239 c2 = *s2++;
6240
6241 if (c1 != c2)
6242 return (c1 < c2) ? -1 : 1;
6243
Marc-André Lemburge5034372000-08-08 08:04:29 +00006244 len1--; len2--;
6245 }
6246
6247 return (len1 < len2) ? -1 : (len1 != len2);
6248}
6249
6250#endif
6251
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252int PyUnicode_Compare(PyObject *left,
6253 PyObject *right)
6254{
6255 PyUnicodeObject *u = NULL, *v = NULL;
6256 int result;
6257
6258 /* Coerce the two arguments */
6259 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6260 if (u == NULL)
6261 goto onError;
6262 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6263 if (v == NULL)
6264 goto onError;
6265
Thomas Wouters7e474022000-07-16 12:04:32 +00006266 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 if (v == u) {
6268 Py_DECREF(u);
6269 Py_DECREF(v);
6270 return 0;
6271 }
6272
6273 result = unicode_compare(u, v);
6274
6275 Py_DECREF(u);
6276 Py_DECREF(v);
6277 return result;
6278
6279onError:
6280 Py_XDECREF(u);
6281 Py_XDECREF(v);
6282 return -1;
6283}
6284
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006285PyObject *PyUnicode_RichCompare(PyObject *left,
6286 PyObject *right,
6287 int op)
6288{
6289 int result;
6290
6291 result = PyUnicode_Compare(left, right);
6292 if (result == -1 && PyErr_Occurred())
6293 goto onError;
6294
6295 /* Convert the return value to a Boolean */
6296 switch (op) {
6297 case Py_EQ:
6298 result = (result == 0);
6299 break;
6300 case Py_NE:
6301 result = (result != 0);
6302 break;
6303 case Py_LE:
6304 result = (result <= 0);
6305 break;
6306 case Py_GE:
6307 result = (result >= 0);
6308 break;
6309 case Py_LT:
6310 result = (result == -1);
6311 break;
6312 case Py_GT:
6313 result = (result == 1);
6314 break;
6315 }
6316 return PyBool_FromLong(result);
6317
6318 onError:
6319
6320 /* Standard case
6321
6322 Type errors mean that PyUnicode_FromObject() could not convert
6323 one of the arguments (usually the right hand side) to Unicode,
6324 ie. we can't handle the comparison request. However, it is
6325 possible that the other object knows a comparison method, which
6326 is why we return Py_NotImplemented to give the other object a
6327 chance.
6328
6329 */
6330 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6331 PyErr_Clear();
6332 Py_INCREF(Py_NotImplemented);
6333 return Py_NotImplemented;
6334 }
6335 if (op != Py_EQ && op != Py_NE)
6336 return NULL;
6337
6338 /* Equality comparison.
6339
6340 This is a special case: we silence any PyExc_UnicodeDecodeError
6341 and instead turn it into a PyErr_UnicodeWarning.
6342
6343 */
6344 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6345 return NULL;
6346 PyErr_Clear();
6347 if (PyErr_Warn(PyExc_UnicodeWarning,
6348 (op == Py_EQ) ?
6349 "Unicode equal comparison "
6350 "failed to convert both arguments to Unicode - "
6351 "interpreting them as being unequal" :
6352 "Unicode unequal comparison "
6353 "failed to convert both arguments to Unicode - "
6354 "interpreting them as being unequal"
6355 ) < 0)
6356 return NULL;
6357 result = (op == Py_NE);
6358 return PyBool_FromLong(result);
6359}
6360
Guido van Rossum403d68b2000-03-13 15:55:09 +00006361int PyUnicode_Contains(PyObject *container,
6362 PyObject *element)
6363{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006364 PyObject *str, *sub;
6365 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006366
6367 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006368 sub = PyUnicode_FromObject(element);
6369 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006370 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006371 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006372 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006373 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006374
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006375 str = PyUnicode_FromObject(container);
6376 if (!str) {
6377 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006378 return -1;
6379 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006380
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006381 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006382
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006383 Py_DECREF(str);
6384 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006385
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006386 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006387}
6388
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389/* Concat to string or Unicode object giving a new Unicode object. */
6390
6391PyObject *PyUnicode_Concat(PyObject *left,
6392 PyObject *right)
6393{
6394 PyUnicodeObject *u = NULL, *v = NULL, *w;
6395
6396 /* Coerce the two arguments */
6397 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6398 if (u == NULL)
6399 goto onError;
6400 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6401 if (v == NULL)
6402 goto onError;
6403
6404 /* Shortcuts */
6405 if (v == unicode_empty) {
6406 Py_DECREF(v);
6407 return (PyObject *)u;
6408 }
6409 if (u == unicode_empty) {
6410 Py_DECREF(u);
6411 return (PyObject *)v;
6412 }
6413
6414 /* Concat the two Unicode strings */
6415 w = _PyUnicode_New(u->length + v->length);
6416 if (w == NULL)
6417 goto onError;
6418 Py_UNICODE_COPY(w->str, u->str, u->length);
6419 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6420
6421 Py_DECREF(u);
6422 Py_DECREF(v);
6423 return (PyObject *)w;
6424
6425onError:
6426 Py_XDECREF(u);
6427 Py_XDECREF(v);
6428 return NULL;
6429}
6430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006431PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432"S.count(sub[, start[, end]]) -> int\n\
6433\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006434Return the number of non-overlapping occurrences of substring sub in\n\
6435Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006436interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
6438static PyObject *
6439unicode_count(PyUnicodeObject *self, PyObject *args)
6440{
6441 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006442 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006443 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 PyObject *result;
6445
Guido van Rossumb8872e62000-05-09 14:14:27 +00006446 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6447 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 return NULL;
6449
6450 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006451 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 if (substring == NULL)
6453 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006454
Fredrik Lundhc8162812006-05-26 19:33:03 +00006455 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006457 result = PyInt_FromSsize_t(
6458 stringlib_count(self->str + start, end - start,
6459 substring->str, substring->length)
6460 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
6462 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006463
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 return result;
6465}
6466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006467PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006468"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006470Encodes S using the codec registered for encoding. encoding defaults\n\
6471to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006472handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006473a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6474'xmlcharrefreplace' as well as any other name registered with\n\
6475codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476
6477static PyObject *
6478unicode_encode(PyUnicodeObject *self, PyObject *args)
6479{
6480 char *encoding = NULL;
6481 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006482 PyObject *v;
6483
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6485 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006486 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006487 if (v == NULL)
6488 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006489 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006490 PyErr_Format(PyExc_TypeError,
6491 "encoder did not return a string/unicode object "
6492 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006493 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006494 Py_DECREF(v);
6495 return NULL;
6496 }
6497 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006498
6499 onError:
6500 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006501}
6502
6503PyDoc_STRVAR(decode__doc__,
6504"S.decode([encoding[,errors]]) -> string or unicode\n\
6505\n\
6506Decodes S using the codec registered for encoding. encoding defaults\n\
6507to the default encoding. errors may be given to set a different error\n\
6508handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6509a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6510as well as any other name registerd with codecs.register_error that is\n\
6511able to handle UnicodeDecodeErrors.");
6512
6513static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006514unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006515{
6516 char *encoding = NULL;
6517 char *errors = NULL;
6518 PyObject *v;
6519
6520 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6521 return NULL;
6522 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006523 if (v == NULL)
6524 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006525 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006526 PyErr_Format(PyExc_TypeError,
6527 "decoder did not return a string/unicode object "
6528 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006529 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006530 Py_DECREF(v);
6531 return NULL;
6532 }
6533 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006534
6535 onError:
6536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537}
6538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006539PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540"S.expandtabs([tabsize]) -> unicode\n\
6541\n\
6542Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006543If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
6545static PyObject*
6546unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6547{
6548 Py_UNICODE *e;
6549 Py_UNICODE *p;
6550 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006551 Py_UNICODE *qe;
6552 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 PyUnicodeObject *u;
6554 int tabsize = 8;
6555
6556 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6557 return NULL;
6558
Thomas Wouters7e474022000-07-16 12:04:32 +00006559 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006560 i = 0; /* chars up to and including most recent \n or \r */
6561 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6562 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 for (p = self->str; p < e; p++)
6564 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006565 if (tabsize > 0) {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006566 incr = tabsize - (j % tabsize); /* cannot overflow */
6567 if (j > PY_SSIZE_T_MAX - incr)
6568 goto overflow1;
6569 j += incr;
6570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 }
6572 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006573 if (j > PY_SSIZE_T_MAX - 1)
6574 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 j++;
6576 if (*p == '\n' || *p == '\r') {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006577 if (i > PY_SSIZE_T_MAX - j)
6578 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006580 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 }
6582 }
6583
Guido van Rossum5bdff602008-03-11 21:18:06 +00006584 if (i > PY_SSIZE_T_MAX - j)
6585 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006586
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 /* Second pass: create output string and fill it */
6588 u = _PyUnicode_New(i + j);
6589 if (!u)
6590 return NULL;
6591
Guido van Rossum5bdff602008-03-11 21:18:06 +00006592 j = 0; /* same as in first pass */
6593 q = u->str; /* next output char */
6594 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
6596 for (p = self->str; p < e; p++)
6597 if (*p == '\t') {
6598 if (tabsize > 0) {
6599 i = tabsize - (j % tabsize);
6600 j += i;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006601 while (i--) {
6602 if (q >= qe)
6603 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 }
6607 }
6608 else {
Guido van Rossum5bdff602008-03-11 21:18:06 +00006609 if (q >= qe)
6610 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006612 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 if (*p == '\n' || *p == '\r')
6614 j = 0;
6615 }
6616
6617 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006618
6619 overflow2:
6620 Py_DECREF(u);
6621 overflow1:
6622 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624}
6625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006626PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627"S.find(sub [,start [,end]]) -> int\n\
6628\n\
6629Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006630such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631arguments start and end are interpreted as in slice notation.\n\
6632\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006633Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634
6635static PyObject *
6636unicode_find(PyUnicodeObject *self, PyObject *args)
6637{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006638 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006639 Py_ssize_t start;
6640 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006641 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
Facundo Batista57d56692007-11-16 18:04:14 +00006643 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006646 result = stringlib_find_slice(
6647 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6648 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6649 start, end
6650 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651
6652 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006653
6654 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655}
6656
6657static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006658unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659{
6660 if (index < 0 || index >= self->length) {
6661 PyErr_SetString(PyExc_IndexError, "string index out of range");
6662 return NULL;
6663 }
6664
6665 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6666}
6667
6668static long
6669unicode_hash(PyUnicodeObject *self)
6670{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006671 /* Since Unicode objects compare equal to their ASCII string
6672 counterparts, they should use the individual character values
6673 as basis for their hash value. This is needed to assure that
6674 strings and Unicode objects behave in the same way as
6675 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676
Martin v. Löwis18e16552006-02-15 17:27:45 +00006677 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006678 register Py_UNICODE *p;
6679 register long x;
6680
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 if (self->hash != -1)
6682 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006683 len = PyUnicode_GET_SIZE(self);
6684 p = PyUnicode_AS_UNICODE(self);
6685 x = *p << 7;
6686 while (--len >= 0)
6687 x = (1000003*x) ^ *p++;
6688 x ^= PyUnicode_GET_SIZE(self);
6689 if (x == -1)
6690 x = -2;
6691 self->hash = x;
6692 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693}
6694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006695PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696"S.index(sub [,start [,end]]) -> int\n\
6697\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006698Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
6700static PyObject *
6701unicode_index(PyUnicodeObject *self, PyObject *args)
6702{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006703 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006704 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006705 Py_ssize_t start;
6706 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707
Facundo Batista57d56692007-11-16 18:04:14 +00006708 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006711 result = stringlib_find_slice(
6712 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6713 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6714 start, end
6715 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716
6717 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 if (result < 0) {
6720 PyErr_SetString(PyExc_ValueError, "substring not found");
6721 return NULL;
6722 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006723
Martin v. Löwis18e16552006-02-15 17:27:45 +00006724 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725}
6726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006727PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006728"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006730Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006731at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
6733static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006734unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735{
6736 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6737 register const Py_UNICODE *e;
6738 int cased;
6739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 /* Shortcut for single character strings */
6741 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006742 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006744 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006745 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006746 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006747
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 e = p + PyUnicode_GET_SIZE(self);
6749 cased = 0;
6750 for (; p < e; p++) {
6751 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006752
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006754 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 else if (!cased && Py_UNICODE_ISLOWER(ch))
6756 cased = 1;
6757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006758 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759}
6760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006761PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006762"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006764Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006765at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766
6767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006768unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769{
6770 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6771 register const Py_UNICODE *e;
6772 int cased;
6773
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 /* Shortcut for single character strings */
6775 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006776 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006778 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006779 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006780 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006781
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 e = p + PyUnicode_GET_SIZE(self);
6783 cased = 0;
6784 for (; p < e; p++) {
6785 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006786
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006788 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 else if (!cased && Py_UNICODE_ISUPPER(ch))
6790 cased = 1;
6791 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006792 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793}
6794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006795PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006798Return True if S is a titlecased string and there is at least one\n\
6799character in S, i.e. upper- and titlecase characters may only\n\
6800follow uncased characters and lowercase characters only cased ones.\n\
6801Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
6803static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006804unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805{
6806 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6807 register const Py_UNICODE *e;
6808 int cased, previous_is_cased;
6809
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 /* Shortcut for single character strings */
6811 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6813 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006815 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006816 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006817 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006818
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 e = p + PyUnicode_GET_SIZE(self);
6820 cased = 0;
6821 previous_is_cased = 0;
6822 for (; p < e; p++) {
6823 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006824
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6826 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006827 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 previous_is_cased = 1;
6829 cased = 1;
6830 }
6831 else if (Py_UNICODE_ISLOWER(ch)) {
6832 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006833 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 previous_is_cased = 1;
6835 cased = 1;
6836 }
6837 else
6838 previous_is_cased = 0;
6839 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841}
6842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006844"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006846Return True if all characters in S are whitespace\n\
6847and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
6849static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006850unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851{
6852 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6853 register const Py_UNICODE *e;
6854
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 /* Shortcut for single character strings */
6856 if (PyUnicode_GET_SIZE(self) == 1 &&
6857 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006860 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006861 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006863
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 e = p + PyUnicode_GET_SIZE(self);
6865 for (; p < e; p++) {
6866 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006867 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870}
6871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006872PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006873"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006874\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006875Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006876and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006877
6878static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006879unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006880{
6881 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6882 register const Py_UNICODE *e;
6883
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006884 /* Shortcut for single character strings */
6885 if (PyUnicode_GET_SIZE(self) == 1 &&
6886 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888
6889 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006890 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006892
6893 e = p + PyUnicode_GET_SIZE(self);
6894 for (; p < e; p++) {
6895 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006896 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006897 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006899}
6900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006901PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006902"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006903\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006904Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006905and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006906
6907static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006908unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006909{
6910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6911 register const Py_UNICODE *e;
6912
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006913 /* Shortcut for single character strings */
6914 if (PyUnicode_GET_SIZE(self) == 1 &&
6915 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006916 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006917
6918 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006919 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006921
6922 e = p + PyUnicode_GET_SIZE(self);
6923 for (; p < e; p++) {
6924 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006926 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006928}
6929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006930PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006931"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006934False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
6936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006937unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938{
6939 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6940 register const Py_UNICODE *e;
6941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 /* Shortcut for single character strings */
6943 if (PyUnicode_GET_SIZE(self) == 1 &&
6944 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006945 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006947 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006948 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 e = p + PyUnicode_GET_SIZE(self);
6952 for (; p < e; p++) {
6953 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006954 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957}
6958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006959PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006960"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006962Return True if all characters in S are digits\n\
6963and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964
6965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006966unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967{
6968 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6969 register const Py_UNICODE *e;
6970
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 /* Shortcut for single character strings */
6972 if (PyUnicode_GET_SIZE(self) == 1 &&
6973 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006974 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006976 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006977 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006979
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 e = p + PyUnicode_GET_SIZE(self);
6981 for (; p < e; p++) {
6982 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006983 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006985 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986}
6987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006988PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006989"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006991Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006992False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993
6994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006995unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996{
6997 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6998 register const Py_UNICODE *e;
6999
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 /* Shortcut for single character strings */
7001 if (PyUnicode_GET_SIZE(self) == 1 &&
7002 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007003 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007005 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007006 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007007 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007008
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 e = p + PyUnicode_GET_SIZE(self);
7010 for (; p < e; p++) {
7011 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007012 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007014 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015}
7016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018"S.join(sequence) -> unicode\n\
7019\n\
7020Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007021sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022
7023static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007024unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007026 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027}
7028
Martin v. Löwis18e16552006-02-15 17:27:45 +00007029static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030unicode_length(PyUnicodeObject *self)
7031{
7032 return self->length;
7033}
7034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007035PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007036"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037\n\
7038Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007039done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040
7041static PyObject *
7042unicode_ljust(PyUnicodeObject *self, PyObject *args)
7043{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007044 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007045 Py_UNICODE fillchar = ' ';
7046
Martin v. Löwis412fb672006-04-13 06:34:32 +00007047 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 return NULL;
7049
Tim Peters7a29bd52001-09-12 03:03:31 +00007050 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 Py_INCREF(self);
7052 return (PyObject*) self;
7053 }
7054
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007055 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056}
7057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059"S.lower() -> unicode\n\
7060\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007061Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062
7063static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007064unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 return fixup(self, fixlower);
7067}
7068
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7077
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007078/* externally visible for str.strip(unicode) */
7079PyObject *
7080_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7081{
7082 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007083 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007084 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007085 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7086 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007087
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007088 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7089
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007090 i = 0;
7091 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007092 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7093 i++;
7094 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007095 }
7096
7097 j = len;
7098 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007099 do {
7100 j--;
7101 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7102 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007103 }
7104
7105 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007106 Py_INCREF(self);
7107 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108 }
7109 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007110 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111}
7112
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113
7114static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007115do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007118 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007119
7120 i = 0;
7121 if (striptype != RIGHTSTRIP) {
7122 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7123 i++;
7124 }
7125 }
7126
7127 j = len;
7128 if (striptype != LEFTSTRIP) {
7129 do {
7130 j--;
7131 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7132 j++;
7133 }
7134
7135 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7136 Py_INCREF(self);
7137 return (PyObject*)self;
7138 }
7139 else
7140 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141}
7142
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143
7144static PyObject *
7145do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7146{
7147 PyObject *sep = NULL;
7148
7149 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7150 return NULL;
7151
7152 if (sep != NULL && sep != Py_None) {
7153 if (PyUnicode_Check(sep))
7154 return _PyUnicode_XStrip(self, striptype, sep);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007155 else if (PyString_Check(sep)) {
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007156 PyObject *res;
7157 sep = PyUnicode_FromObject(sep);
7158 if (sep==NULL)
7159 return NULL;
7160 res = _PyUnicode_XStrip(self, striptype, sep);
7161 Py_DECREF(sep);
7162 return res;
7163 }
7164 else {
7165 PyErr_Format(PyExc_TypeError,
7166 "%s arg must be None, unicode or str",
7167 STRIPNAME(striptype));
7168 return NULL;
7169 }
7170 }
7171
7172 return do_strip(self, striptype);
7173}
7174
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007177"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007178\n\
7179Return a copy of the string S with leading and trailing\n\
7180whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007181If chars is given and not None, remove characters in chars instead.\n\
7182If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007183
7184static PyObject *
7185unicode_strip(PyUnicodeObject *self, PyObject *args)
7186{
7187 if (PyTuple_GET_SIZE(args) == 0)
7188 return do_strip(self, BOTHSTRIP); /* Common case */
7189 else
7190 return do_argstrip(self, BOTHSTRIP, args);
7191}
7192
7193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007194PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007195"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007196\n\
7197Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007198If chars is given and not None, remove characters in chars instead.\n\
7199If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007200
7201static PyObject *
7202unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7203{
7204 if (PyTuple_GET_SIZE(args) == 0)
7205 return do_strip(self, LEFTSTRIP); /* Common case */
7206 else
7207 return do_argstrip(self, LEFTSTRIP, args);
7208}
7209
7210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007211PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007212"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007213\n\
7214Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007215If chars is given and not None, remove characters in chars instead.\n\
7216If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007217
7218static PyObject *
7219unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7220{
7221 if (PyTuple_GET_SIZE(args) == 0)
7222 return do_strip(self, RIGHTSTRIP); /* Common case */
7223 else
7224 return do_argstrip(self, RIGHTSTRIP, args);
7225}
7226
7227
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007229unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230{
7231 PyUnicodeObject *u;
7232 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007233 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007234 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235
7236 if (len < 0)
7237 len = 0;
7238
Tim Peters7a29bd52001-09-12 03:03:31 +00007239 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 /* no repeat, return original string */
7241 Py_INCREF(str);
7242 return (PyObject*) str;
7243 }
Tim Peters8f422462000-09-09 06:13:41 +00007244
7245 /* ensure # of chars needed doesn't overflow int and # of bytes
7246 * needed doesn't overflow size_t
7247 */
7248 nchars = len * str->length;
7249 if (len && nchars / len != str->length) {
7250 PyErr_SetString(PyExc_OverflowError,
7251 "repeated string is too long");
7252 return NULL;
7253 }
7254 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7255 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7256 PyErr_SetString(PyExc_OverflowError,
7257 "repeated string is too long");
7258 return NULL;
7259 }
7260 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 if (!u)
7262 return NULL;
7263
7264 p = u->str;
7265
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007266 if (str->length == 1 && len > 0) {
7267 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007268 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007269 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007270 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007271 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007272 done = str->length;
7273 }
7274 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007275 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007276 Py_UNICODE_COPY(p+done, p, n);
7277 done += n;
7278 }
7279 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280
7281 return (PyObject*) u;
7282}
7283
7284PyObject *PyUnicode_Replace(PyObject *obj,
7285 PyObject *subobj,
7286 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007287 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288{
7289 PyObject *self;
7290 PyObject *str1;
7291 PyObject *str2;
7292 PyObject *result;
7293
7294 self = PyUnicode_FromObject(obj);
7295 if (self == NULL)
7296 return NULL;
7297 str1 = PyUnicode_FromObject(subobj);
7298 if (str1 == NULL) {
7299 Py_DECREF(self);
7300 return NULL;
7301 }
7302 str2 = PyUnicode_FromObject(replobj);
7303 if (str2 == NULL) {
7304 Py_DECREF(self);
7305 Py_DECREF(str1);
7306 return NULL;
7307 }
Tim Petersced69f82003-09-16 20:30:58 +00007308 result = replace((PyUnicodeObject *)self,
7309 (PyUnicodeObject *)str1,
7310 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 maxcount);
7312 Py_DECREF(self);
7313 Py_DECREF(str1);
7314 Py_DECREF(str2);
7315 return result;
7316}
7317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007318PyDoc_STRVAR(replace__doc__,
Georg Brandl30fadc12008-05-30 07:54:16 +00007319"S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320\n\
7321Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007322old replaced by new. If the optional argument count is\n\
7323given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324
7325static PyObject*
7326unicode_replace(PyUnicodeObject *self, PyObject *args)
7327{
7328 PyUnicodeObject *str1;
7329 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007330 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 PyObject *result;
7332
Martin v. Löwis18e16552006-02-15 17:27:45 +00007333 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 return NULL;
7335 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7336 if (str1 == NULL)
7337 return NULL;
7338 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007339 if (str2 == NULL) {
7340 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007342 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343
7344 result = replace(self, str1, str2, maxcount);
7345
7346 Py_DECREF(str1);
7347 Py_DECREF(str2);
7348 return result;
7349}
7350
7351static
7352PyObject *unicode_repr(PyObject *unicode)
7353{
7354 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7355 PyUnicode_GET_SIZE(unicode),
7356 1);
7357}
7358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007359PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360"S.rfind(sub [,start [,end]]) -> int\n\
7361\n\
7362Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007363such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364arguments start and end are interpreted as in slice notation.\n\
7365\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007366Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368static PyObject *
7369unicode_rfind(PyUnicodeObject *self, PyObject *args)
7370{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007371 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007372 Py_ssize_t start;
7373 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007374 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
Facundo Batista57d56692007-11-16 18:04:14 +00007376 if (!_ParseTupleFinds(args, &substring, &start, &end))
7377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007379 result = stringlib_rfind_slice(
7380 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7381 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7382 start, end
7383 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384
7385 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007386
7387 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388}
7389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007390PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391"S.rindex(sub [,start [,end]]) -> int\n\
7392\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
7395static PyObject *
7396unicode_rindex(PyUnicodeObject *self, PyObject *args)
7397{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007398 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007399 Py_ssize_t start;
7400 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007401 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
Facundo Batista57d56692007-11-16 18:04:14 +00007403 if (!_ParseTupleFinds(args, &substring, &start, &end))
7404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007406 result = stringlib_rfind_slice(
7407 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7408 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7409 start, end
7410 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
7412 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007413
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 if (result < 0) {
7415 PyErr_SetString(PyExc_ValueError, "substring not found");
7416 return NULL;
7417 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007418 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419}
7420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007421PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007422"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423\n\
7424Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007425done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
7427static PyObject *
7428unicode_rjust(PyUnicodeObject *self, PyObject *args)
7429{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007430 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007431 Py_UNICODE fillchar = ' ';
7432
Martin v. Löwis412fb672006-04-13 06:34:32 +00007433 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 return NULL;
7435
Tim Peters7a29bd52001-09-12 03:03:31 +00007436 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 Py_INCREF(self);
7438 return (PyObject*) self;
7439 }
7440
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007441 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442}
7443
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007445unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446{
7447 /* standard clamping */
7448 if (start < 0)
7449 start = 0;
7450 if (end < 0)
7451 end = 0;
7452 if (end > self->length)
7453 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007454 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 /* full slice, return original string */
7456 Py_INCREF(self);
7457 return (PyObject*) self;
7458 }
7459 if (start > end)
7460 start = end;
7461 /* copy slice */
7462 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7463 end - start);
7464}
7465
7466PyObject *PyUnicode_Split(PyObject *s,
7467 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007468 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469{
7470 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007471
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 s = PyUnicode_FromObject(s);
7473 if (s == NULL)
7474 return NULL;
7475 if (sep != NULL) {
7476 sep = PyUnicode_FromObject(sep);
7477 if (sep == NULL) {
7478 Py_DECREF(s);
7479 return NULL;
7480 }
7481 }
7482
7483 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7484
7485 Py_DECREF(s);
7486 Py_XDECREF(sep);
7487 return result;
7488}
7489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007490PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491"S.split([sep [,maxsplit]]) -> list of strings\n\
7492\n\
7493Return a list of the words in S, using sep as the\n\
7494delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007495splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007496whitespace string is a separator and empty strings are\n\
7497removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498
7499static PyObject*
7500unicode_split(PyUnicodeObject *self, PyObject *args)
7501{
7502 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007503 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 return NULL;
7507
7508 if (substring == Py_None)
7509 return split(self, NULL, maxcount);
7510 else if (PyUnicode_Check(substring))
7511 return split(self, (PyUnicodeObject *)substring, maxcount);
7512 else
7513 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7514}
7515
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007516PyObject *
7517PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7518{
7519 PyObject* str_obj;
7520 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007521 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007522
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007523 str_obj = PyUnicode_FromObject(str_in);
7524 if (!str_obj)
7525 return NULL;
7526 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007527 if (!sep_obj) {
7528 Py_DECREF(str_obj);
7529 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007530 }
7531
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007532 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007533 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7534 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7535 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007536
Fredrik Lundhb9479482006-05-26 17:22:38 +00007537 Py_DECREF(sep_obj);
7538 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007539
7540 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007541}
7542
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007543
7544PyObject *
7545PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7546{
7547 PyObject* str_obj;
7548 PyObject* sep_obj;
7549 PyObject* out;
7550
7551 str_obj = PyUnicode_FromObject(str_in);
7552 if (!str_obj)
7553 return NULL;
7554 sep_obj = PyUnicode_FromObject(sep_in);
7555 if (!sep_obj) {
7556 Py_DECREF(str_obj);
7557 return NULL;
7558 }
7559
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007560 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007561 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7562 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7563 );
7564
7565 Py_DECREF(sep_obj);
7566 Py_DECREF(str_obj);
7567
7568 return out;
7569}
7570
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007571PyDoc_STRVAR(partition__doc__,
7572"S.partition(sep) -> (head, sep, tail)\n\
7573\n\
7574Searches for the separator sep in S, and returns the part before it,\n\
7575the separator itself, and the part after it. If the separator is not\n\
7576found, returns S and two empty strings.");
7577
7578static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007579unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007580{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007581 return PyUnicode_Partition((PyObject *)self, separator);
7582}
7583
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007584PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007585"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007586\n\
7587Searches for the separator sep in S, starting at the end of S, and returns\n\
7588the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007589separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007590
7591static PyObject*
7592unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7593{
7594 return PyUnicode_RPartition((PyObject *)self, separator);
7595}
7596
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007597PyObject *PyUnicode_RSplit(PyObject *s,
7598 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007599 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007600{
7601 PyObject *result;
7602
7603 s = PyUnicode_FromObject(s);
7604 if (s == NULL)
7605 return NULL;
7606 if (sep != NULL) {
7607 sep = PyUnicode_FromObject(sep);
7608 if (sep == NULL) {
7609 Py_DECREF(s);
7610 return NULL;
7611 }
7612 }
7613
7614 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7615
7616 Py_DECREF(s);
7617 Py_XDECREF(sep);
7618 return result;
7619}
7620
7621PyDoc_STRVAR(rsplit__doc__,
7622"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7623\n\
7624Return a list of the words in S, using sep as the\n\
7625delimiter string, starting at the end of the string and\n\
7626working to the front. If maxsplit is given, at most maxsplit\n\
7627splits are done. If sep is not specified, any whitespace string\n\
7628is a separator.");
7629
7630static PyObject*
7631unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7632{
7633 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007634 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007635
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007637 return NULL;
7638
7639 if (substring == Py_None)
7640 return rsplit(self, NULL, maxcount);
7641 else if (PyUnicode_Check(substring))
7642 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7643 else
7644 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7645}
7646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007647PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007648"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649\n\
7650Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007651Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007652is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653
7654static PyObject*
7655unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7656{
Guido van Rossum86662912000-04-11 15:38:46 +00007657 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658
Guido van Rossum86662912000-04-11 15:38:46 +00007659 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660 return NULL;
7661
Guido van Rossum86662912000-04-11 15:38:46 +00007662 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663}
7664
7665static
7666PyObject *unicode_str(PyUnicodeObject *self)
7667{
Fred Drakee4315f52000-05-09 19:53:39 +00007668 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669}
7670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007671PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672"S.swapcase() -> unicode\n\
7673\n\
7674Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007675and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676
7677static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007678unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680 return fixup(self, fixswapcase);
7681}
7682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007683PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684"S.translate(table) -> unicode\n\
7685\n\
7686Return a copy of the string S, where all characters have been mapped\n\
7687through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007688Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7689Unmapped characters are left untouched. Characters mapped to None\n\
7690are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691
7692static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007693unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694{
Tim Petersced69f82003-09-16 20:30:58 +00007695 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007697 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 "ignore");
7699}
7700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007701PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702"S.upper() -> unicode\n\
7703\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705
7706static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007707unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 return fixup(self, fixupper);
7710}
7711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007712PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713"S.zfill(width) -> unicode\n\
7714\n\
7715Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007716of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717
7718static PyObject *
7719unicode_zfill(PyUnicodeObject *self, PyObject *args)
7720{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007721 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 PyUnicodeObject *u;
7723
Martin v. Löwis18e16552006-02-15 17:27:45 +00007724 Py_ssize_t width;
7725 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 return NULL;
7727
7728 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007729 if (PyUnicode_CheckExact(self)) {
7730 Py_INCREF(self);
7731 return (PyObject*) self;
7732 }
7733 else
7734 return PyUnicode_FromUnicode(
7735 PyUnicode_AS_UNICODE(self),
7736 PyUnicode_GET_SIZE(self)
7737 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 }
7739
7740 fill = width - self->length;
7741
7742 u = pad(self, fill, 0, '0');
7743
Walter Dörwald068325e2002-04-15 13:36:47 +00007744 if (u == NULL)
7745 return NULL;
7746
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747 if (u->str[fill] == '+' || u->str[fill] == '-') {
7748 /* move sign to beginning of string */
7749 u->str[0] = u->str[fill];
7750 u->str[fill] = '0';
7751 }
7752
7753 return (PyObject*) u;
7754}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
#if 0
/* Compiled out by the enclosing #if 0.  Returns the value of numfree as a
 * Python int.
 */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007764PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007765"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007767Return True if S starts with the specified prefix, False otherwise.\n\
7768With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007769With optional end, stop comparing S at that position.\n\
7770prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
7772static PyObject *
7773unicode_startswith(PyUnicodeObject *self,
7774 PyObject *args)
7775{
Georg Brandl24250812006-06-09 18:45:48 +00007776 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007778 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007779 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007780 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781
Georg Brandl24250812006-06-09 18:45:48 +00007782 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007783 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007785 if (PyTuple_Check(subobj)) {
7786 Py_ssize_t i;
7787 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7788 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7789 PyTuple_GET_ITEM(subobj, i));
7790 if (substring == NULL)
7791 return NULL;
7792 result = tailmatch(self, substring, start, end, -1);
7793 Py_DECREF(substring);
7794 if (result) {
7795 Py_RETURN_TRUE;
7796 }
7797 }
7798 /* nothing matched */
7799 Py_RETURN_FALSE;
7800 }
7801 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007803 return NULL;
7804 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007806 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807}
7808
7809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007810PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007811"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007813Return True if S ends with the specified suffix, False otherwise.\n\
7814With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007815With optional end, stop comparing S at that position.\n\
7816suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817
7818static PyObject *
7819unicode_endswith(PyUnicodeObject *self,
7820 PyObject *args)
7821{
Georg Brandl24250812006-06-09 18:45:48 +00007822 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007824 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007825 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007826 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827
Georg Brandl24250812006-06-09 18:45:48 +00007828 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7829 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007831 if (PyTuple_Check(subobj)) {
7832 Py_ssize_t i;
7833 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7834 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7835 PyTuple_GET_ITEM(subobj, i));
7836 if (substring == NULL)
7837 return NULL;
7838 result = tailmatch(self, substring, start, end, +1);
7839 Py_DECREF(substring);
7840 if (result) {
7841 Py_RETURN_TRUE;
7842 }
7843 }
7844 Py_RETURN_FALSE;
7845 }
7846 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849
Georg Brandl24250812006-06-09 18:45:48 +00007850 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007852 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853}
7854
7855
Eric Smitha9f7d622008-02-17 19:46:49 +00007856/* Implements do_string_format, which is unicode because of stringlib */
7857#include "stringlib/string_format.h"
7858
7859PyDoc_STRVAR(format__doc__,
7860"S.format(*args, **kwargs) -> unicode\n\
7861\n\
7862");
7863
Eric Smithdc13b792008-05-30 18:10:04 +00007864static PyObject *
7865unicode__format__(PyObject *self, PyObject *args)
7866{
7867 PyObject *format_spec;
7868 PyObject *result = NULL;
7869 PyObject *tmp = NULL;
7870
7871 /* If 2.x, convert format_spec to the same type as value */
7872 /* This is to allow things like u''.format('') */
7873 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7874 goto done;
7875 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7876 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7877 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7878 goto done;
7879 }
7880 tmp = PyObject_Unicode(format_spec);
7881 if (tmp == NULL)
7882 goto done;
7883 format_spec = tmp;
7884
7885 result = _PyUnicode_FormatAdvanced(self,
7886 PyUnicode_AS_UNICODE(format_spec),
7887 PyUnicode_GET_SIZE(format_spec));
7888done:
7889 Py_XDECREF(tmp);
7890 return result;
7891}
7892
Eric Smitha9f7d622008-02-17 19:46:49 +00007893PyDoc_STRVAR(p_format__doc__,
7894"S.__format__(format_spec) -> unicode\n\
7895\n\
7896");
7897
Robert Schuppenies901c9972008-06-10 10:10:31 +00007898static PyObject *
7899unicode__sizeof__(PyUnicodeObject *v)
7900{
7901 PyObject *res = NULL, *defsize = NULL;
7902
7903 res = PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7904 sizeof(Py_UNICODE) * (v->length + 1));
7905 if (v->defenc) {
7906 defsize = PyObject_CallMethod(v->defenc, "__sizeof__", NULL);
7907 if (defsize == NULL) {
7908 Py_DECREF(res);
7909 return NULL;
7910 }
7911 res = PyNumber_Add(res, defsize);
7912 Py_DECREF(defsize);
7913 }
7914 return res;
7915}
7916
7917PyDoc_STRVAR(sizeof__doc__,
7918"S.__sizeof__() -> size of S in memory, in bytes\n\
7919\n\
7920");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007921
7922static PyObject *
7923unicode_getnewargs(PyUnicodeObject *v)
7924{
7925 return Py_BuildValue("(u#)", v->str, v->length);
7926}
7927
7928
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929static PyMethodDef unicode_methods[] = {
7930
7931 /* Order is according to common usage: often used methods should
7932 appear first, since lookup is done sequentially. */
7933
Georg Brandlecdc0a92006-03-30 12:19:07 +00007934 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007935 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7936 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007937 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007938 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7939 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7940 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7941 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7942 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7943 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7944 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007945 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007946 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7947 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7948 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007949 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007950 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007951/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7952 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7953 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7954 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007955 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007956 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007957 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007958 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007959 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7960 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7961 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7962 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7963 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7964 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7965 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7966 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7967 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7968 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7969 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7970 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7971 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7972 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007973 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007974 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7975 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7976 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7977 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007978 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007979#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007980 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981#endif
7982
7983#if 0
7984 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007985 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986#endif
7987
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007988 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 {NULL, NULL}
7990};
7991
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007992static PyObject *
7993unicode_mod(PyObject *v, PyObject *w)
7994{
7995 if (!PyUnicode_Check(v)) {
7996 Py_INCREF(Py_NotImplemented);
7997 return Py_NotImplemented;
7998 }
7999 return PyUnicode_Format(v, w);
8000}
8001
8002static PyNumberMethods unicode_as_number = {
8003 0, /*nb_add*/
8004 0, /*nb_subtract*/
8005 0, /*nb_multiply*/
8006 0, /*nb_divide*/
8007 unicode_mod, /*nb_remainder*/
8008};
8009
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008011 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00008012 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8014 (ssizeargfunc) unicode_getitem, /* sq_item */
8015 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 0, /* sq_ass_item */
8017 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00008018 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019};
8020
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008021static PyObject*
8022unicode_subscript(PyUnicodeObject* self, PyObject* item)
8023{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008024 if (PyIndex_Check(item)) {
8025 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008026 if (i == -1 && PyErr_Occurred())
8027 return NULL;
8028 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008029 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008030 return unicode_getitem(self, i);
8031 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008032 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008033 Py_UNICODE* source_buf;
8034 Py_UNICODE* result_buf;
8035 PyObject* result;
8036
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008037 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008038 &start, &stop, &step, &slicelength) < 0) {
8039 return NULL;
8040 }
8041
8042 if (slicelength <= 0) {
8043 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008044 } else if (start == 0 && step == 1 && slicelength == self->length &&
8045 PyUnicode_CheckExact(self)) {
8046 Py_INCREF(self);
8047 return (PyObject *)self;
8048 } else if (step == 1) {
8049 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008050 } else {
8051 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008052 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8053 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008054
8055 if (result_buf == NULL)
8056 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008057
8058 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8059 result_buf[i] = source_buf[cur];
8060 }
Tim Petersced69f82003-09-16 20:30:58 +00008061
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008062 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008063 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008064 return result;
8065 }
8066 } else {
8067 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8068 return NULL;
8069 }
8070}
8071
8072static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008073 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008074 (binaryfunc)unicode_subscript, /* mp_subscript */
8075 (objobjargproc)0, /* mp_ass_subscript */
8076};
8077
Martin v. Löwis18e16552006-02-15 17:27:45 +00008078static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008080 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 const void **ptr)
8082{
8083 if (index != 0) {
8084 PyErr_SetString(PyExc_SystemError,
8085 "accessing non-existent unicode segment");
8086 return -1;
8087 }
8088 *ptr = (void *) self->str;
8089 return PyUnicode_GET_DATA_SIZE(self);
8090}
8091
Martin v. Löwis18e16552006-02-15 17:27:45 +00008092static Py_ssize_t
8093unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 const void **ptr)
8095{
8096 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008097 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 return -1;
8099}
8100
8101static int
8102unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104{
8105 if (lenp)
8106 *lenp = PyUnicode_GET_DATA_SIZE(self);
8107 return 1;
8108}
8109
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008110static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 const void **ptr)
8114{
8115 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008116
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 if (index != 0) {
8118 PyErr_SetString(PyExc_SystemError,
8119 "accessing non-existent unicode segment");
8120 return -1;
8121 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008122 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 if (str == NULL)
8124 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008125 *ptr = (void *) PyString_AS_STRING(str);
8126 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127}
8128
8129/* Helpers for PyUnicode_Format() */
8130
8131static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008132getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008134 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 if (argidx < arglen) {
8136 (*p_argidx)++;
8137 if (arglen < 0)
8138 return args;
8139 else
8140 return PyTuple_GetItem(args, argidx);
8141 }
8142 PyErr_SetString(PyExc_TypeError,
8143 "not enough arguments for format string");
8144 return NULL;
8145}
8146
/* Bit flags recording which conversion flag characters were seen. */
#define F_LJUST (0x01)  /* '-' */
#define F_SIGN  (0x02)  /* '+' */
#define F_BLANK (0x04)  /* ' ' */
#define F_ALT   (0x08)  /* '#' */
#define F_ZERO  (0x10)  /* '0' */
8152
Martin v. Löwis18e16552006-02-15 17:27:45 +00008153static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008154strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 register Py_ssize_t i;
8157 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 for (i = len - 1; i >= 0; i--)
8159 buffer[i] = (Py_UNICODE) charbuffer[i];
8160
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 return len;
8162}
8163
Neal Norwitzfc76d632006-01-10 06:03:13 +00008164static int
8165doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8166{
Tim Peters15231542006-02-16 01:08:01 +00008167 Py_ssize_t result;
8168
Neal Norwitzfc76d632006-01-10 06:03:13 +00008169 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008170 result = strtounicode(buffer, (char *)buffer);
8171 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008172}
8173
8174static int
8175longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8176{
Tim Peters15231542006-02-16 01:08:01 +00008177 Py_ssize_t result;
8178
Neal Norwitzfc76d632006-01-10 06:03:13 +00008179 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008180 result = strtounicode(buffer, (char *)buffer);
8181 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008182}
8183
Guido van Rossum078151d2002-08-11 04:24:12 +00008184/* XXX To save some code duplication, formatfloat/long/int could have been
8185 shared with stringobject.c, converting from 8-bit to Unicode after the
8186 formatting is done. */
8187
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188static int
8189formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008190 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 int flags,
8192 int prec,
8193 int type,
8194 PyObject *v)
8195{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008196 /* fmt = '%#.' + `prec` + `type`
8197 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 char fmt[20];
8199 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008200
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 x = PyFloat_AsDouble(v);
8202 if (x == -1.0 && PyErr_Occurred())
8203 return -1;
8204 if (prec < 0)
8205 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8207 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008208 /* Worst case length calc to ensure no buffer overrun:
8209
8210 'g' formats:
8211 fmt = %#.<prec>g
8212 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8213 for any double rep.)
8214 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8215
8216 'f' formats:
8217 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8218 len = 1 + 50 + 1 + prec = 52 + prec
8219
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008220 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008221 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008222
8223 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008224 if (((type == 'g' || type == 'G') &&
8225 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008226 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008227 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008228 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008229 return -1;
8230 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008231 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8232 (flags&F_ALT) ? "#" : "",
8233 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008234 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235}
8236
Tim Peters38fd5b62000-09-21 05:43:11 +00008237static PyObject*
8238formatlong(PyObject *val, int flags, int prec, int type)
8239{
8240 char *buf;
8241 int i, len;
8242 PyObject *str; /* temporary string object. */
8243 PyUnicodeObject *result;
8244
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008245 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008246 if (!str)
8247 return NULL;
8248 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008249 if (!result) {
8250 Py_DECREF(str);
8251 return NULL;
8252 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008253 for (i = 0; i < len; i++)
8254 result->str[i] = buf[i];
8255 result->str[len] = 0;
8256 Py_DECREF(str);
8257 return (PyObject*)result;
8258}
8259
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260static int
8261formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008262 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263 int flags,
8264 int prec,
8265 int type,
8266 PyObject *v)
8267{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008268 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008269 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8270 * + 1 + 1
8271 * = 24
8272 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008273 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008274 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 long x;
8276
8277 x = PyInt_AsLong(v);
8278 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008279 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008280 if (x < 0 && type == 'u') {
8281 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008282 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008283 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8284 sign = "-";
8285 else
8286 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008288 prec = 1;
8289
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008290 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8291 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008292 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008293 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008294 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008295 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008296 return -1;
8297 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008298
8299 if ((flags & F_ALT) &&
8300 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008301 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008302 * of issues that cause pain:
8303 * - when 0 is being converted, the C standard leaves off
8304 * the '0x' or '0X', which is inconsistent with other
8305 * %#x/%#X conversions and inconsistent with Python's
8306 * hex() function
8307 * - there are platforms that violate the standard and
8308 * convert 0 with the '0x' or '0X'
8309 * (Metrowerks, Compaq Tru64)
8310 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008311 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008312 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008313 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008314 * We can achieve the desired consistency by inserting our
8315 * own '0x' or '0X' prefix, and substituting %x/%X in place
8316 * of %#x/%#X.
8317 *
8318 * Note that this is the same approach as used in
8319 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008320 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008321 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8322 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008323 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008324 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008325 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8326 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008327 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008328 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008329 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008330 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008331 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008332 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333}
8334
8335static int
8336formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008337 size_t buflen,
8338 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008340 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008341 if (PyUnicode_Check(v)) {
8342 if (PyUnicode_GET_SIZE(v) != 1)
8343 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008347 else if (PyString_Check(v)) {
8348 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008349 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008350 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352
8353 else {
8354 /* Integer input truncated to a character */
8355 long x;
8356 x = PyInt_AsLong(v);
8357 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008358 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008359#ifdef Py_UNICODE_WIDE
8360 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008361 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008362 "%c arg not in range(0x110000) "
8363 "(wide Python build)");
8364 return -1;
8365 }
8366#else
8367 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008368 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008369 "%c arg not in range(0x10000) "
8370 "(narrow Python build)");
8371 return -1;
8372 }
8373#endif
8374 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 }
8376 buf[1] = '\0';
8377 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008378
8379 onError:
8380 PyErr_SetString(PyExc_TypeError,
8381 "%c requires int or char");
8382 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383}
8384
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008385/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8386
8387 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8388 chars are formatted. XXX This is a magic number. Each formatting
8389 routine does bounds checking to ensure no overflow, but a better
8390 solution may be to malloc a buffer of appropriate size for each
8391 format. For now, the current solution is sufficient.
8392*/
8393#define FORMATBUFLEN (size_t)120
8394
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395PyObject *PyUnicode_Format(PyObject *format,
8396 PyObject *args)
8397{
8398 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008399 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 int args_owned = 0;
8401 PyUnicodeObject *result = NULL;
8402 PyObject *dict = NULL;
8403 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008404
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 if (format == NULL || args == NULL) {
8406 PyErr_BadInternalCall();
8407 return NULL;
8408 }
8409 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008410 if (uformat == NULL)
8411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 fmt = PyUnicode_AS_UNICODE(uformat);
8413 fmtcnt = PyUnicode_GET_SIZE(uformat);
8414
8415 reslen = rescnt = fmtcnt + 100;
8416 result = _PyUnicode_New(reslen);
8417 if (result == NULL)
8418 goto onError;
8419 res = PyUnicode_AS_UNICODE(result);
8420
8421 if (PyTuple_Check(args)) {
8422 arglen = PyTuple_Size(args);
8423 argidx = 0;
8424 }
8425 else {
8426 arglen = -1;
8427 argidx = -2;
8428 }
Christian Heimese93237d2007-12-19 02:37:44 +00008429 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008430 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 dict = args;
8432
8433 while (--fmtcnt >= 0) {
8434 if (*fmt != '%') {
8435 if (--rescnt < 0) {
8436 rescnt = fmtcnt + 100;
8437 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008438 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8441 --rescnt;
8442 }
8443 *res++ = *fmt++;
8444 }
8445 else {
8446 /* Got a format specifier */
8447 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008448 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 Py_UNICODE c = '\0';
8451 Py_UNICODE fill;
Facundo Batistac11cecf2008-02-24 03:17:21 +00008452 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 PyObject *v = NULL;
8454 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008455 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008457 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008458 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459
8460 fmt++;
8461 if (*fmt == '(') {
8462 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008463 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 PyObject *key;
8465 int pcount = 1;
8466
8467 if (dict == NULL) {
8468 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008469 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 goto onError;
8471 }
8472 ++fmt;
8473 --fmtcnt;
8474 keystart = fmt;
8475 /* Skip over balanced parentheses */
8476 while (pcount > 0 && --fmtcnt >= 0) {
8477 if (*fmt == ')')
8478 --pcount;
8479 else if (*fmt == '(')
8480 ++pcount;
8481 fmt++;
8482 }
8483 keylen = fmt - keystart - 1;
8484 if (fmtcnt < 0 || pcount > 0) {
8485 PyErr_SetString(PyExc_ValueError,
8486 "incomplete format key");
8487 goto onError;
8488 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008489#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008490 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491 then looked up since Python uses strings to hold
8492 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008493 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494 key = PyUnicode_EncodeUTF8(keystart,
8495 keylen,
8496 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008497#else
8498 key = PyUnicode_FromUnicode(keystart, keylen);
8499#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 if (key == NULL)
8501 goto onError;
8502 if (args_owned) {
8503 Py_DECREF(args);
8504 args_owned = 0;
8505 }
8506 args = PyObject_GetItem(dict, key);
8507 Py_DECREF(key);
8508 if (args == NULL) {
8509 goto onError;
8510 }
8511 args_owned = 1;
8512 arglen = -1;
8513 argidx = -2;
8514 }
8515 while (--fmtcnt >= 0) {
8516 switch (c = *fmt++) {
8517 case '-': flags |= F_LJUST; continue;
8518 case '+': flags |= F_SIGN; continue;
8519 case ' ': flags |= F_BLANK; continue;
8520 case '#': flags |= F_ALT; continue;
8521 case '0': flags |= F_ZERO; continue;
8522 }
8523 break;
8524 }
8525 if (c == '*') {
8526 v = getnextarg(args, arglen, &argidx);
8527 if (v == NULL)
8528 goto onError;
8529 if (!PyInt_Check(v)) {
8530 PyErr_SetString(PyExc_TypeError,
8531 "* wants int");
8532 goto onError;
8533 }
8534 width = PyInt_AsLong(v);
8535 if (width < 0) {
8536 flags |= F_LJUST;
8537 width = -width;
8538 }
8539 if (--fmtcnt >= 0)
8540 c = *fmt++;
8541 }
8542 else if (c >= '0' && c <= '9') {
8543 width = c - '0';
8544 while (--fmtcnt >= 0) {
8545 c = *fmt++;
8546 if (c < '0' || c > '9')
8547 break;
8548 if ((width*10) / 10 != width) {
8549 PyErr_SetString(PyExc_ValueError,
8550 "width too big");
8551 goto onError;
8552 }
8553 width = width*10 + (c - '0');
8554 }
8555 }
8556 if (c == '.') {
8557 prec = 0;
8558 if (--fmtcnt >= 0)
8559 c = *fmt++;
8560 if (c == '*') {
8561 v = getnextarg(args, arglen, &argidx);
8562 if (v == NULL)
8563 goto onError;
8564 if (!PyInt_Check(v)) {
8565 PyErr_SetString(PyExc_TypeError,
8566 "* wants int");
8567 goto onError;
8568 }
8569 prec = PyInt_AsLong(v);
8570 if (prec < 0)
8571 prec = 0;
8572 if (--fmtcnt >= 0)
8573 c = *fmt++;
8574 }
8575 else if (c >= '0' && c <= '9') {
8576 prec = c - '0';
8577 while (--fmtcnt >= 0) {
8578 c = Py_CHARMASK(*fmt++);
8579 if (c < '0' || c > '9')
8580 break;
8581 if ((prec*10) / 10 != prec) {
8582 PyErr_SetString(PyExc_ValueError,
8583 "prec too big");
8584 goto onError;
8585 }
8586 prec = prec*10 + (c - '0');
8587 }
8588 }
8589 } /* prec */
8590 if (fmtcnt >= 0) {
8591 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 if (--fmtcnt >= 0)
8593 c = *fmt++;
8594 }
8595 }
8596 if (fmtcnt < 0) {
8597 PyErr_SetString(PyExc_ValueError,
8598 "incomplete format");
8599 goto onError;
8600 }
8601 if (c != '%') {
8602 v = getnextarg(args, arglen, &argidx);
8603 if (v == NULL)
8604 goto onError;
8605 }
8606 sign = 0;
8607 fill = ' ';
8608 switch (c) {
8609
8610 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008611 pbuf = formatbuf;
8612 /* presume that buffer length is at least 1 */
8613 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 len = 1;
8615 break;
8616
8617 case 's':
8618 case 'r':
8619 if (PyUnicode_Check(v) && c == 's') {
8620 temp = v;
8621 Py_INCREF(temp);
8622 }
8623 else {
8624 PyObject *unicode;
8625 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008626 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 else
8628 temp = PyObject_Repr(v);
8629 if (temp == NULL)
8630 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008631 if (PyUnicode_Check(temp))
8632 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008633 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008634 /* convert to string to Unicode */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008635 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8636 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008637 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008639 Py_DECREF(temp);
8640 temp = unicode;
8641 if (temp == NULL)
8642 goto onError;
8643 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008644 else {
8645 Py_DECREF(temp);
8646 PyErr_SetString(PyExc_TypeError,
8647 "%s argument has non-string str()");
8648 goto onError;
8649 }
8650 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008651 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 len = PyUnicode_GET_SIZE(temp);
8653 if (prec >= 0 && len > prec)
8654 len = prec;
8655 break;
8656
8657 case 'i':
8658 case 'd':
8659 case 'u':
8660 case 'o':
8661 case 'x':
8662 case 'X':
8663 if (c == 'i')
8664 c = 'd';
Facundo Batistac11cecf2008-02-24 03:17:21 +00008665 isnumok = 0;
8666 if (PyNumber_Check(v)) {
8667 PyObject *iobj=NULL;
8668
8669 if (PyInt_Check(v) || (PyLong_Check(v))) {
8670 iobj = v;
8671 Py_INCREF(iobj);
8672 }
8673 else {
8674 iobj = PyNumber_Int(v);
8675 if (iobj==NULL) iobj = PyNumber_Long(v);
8676 }
8677 if (iobj!=NULL) {
8678 if (PyInt_Check(iobj)) {
8679 isnumok = 1;
8680 pbuf = formatbuf;
8681 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8682 flags, prec, c, iobj);
8683 Py_DECREF(iobj);
8684 if (len < 0)
8685 goto onError;
8686 sign = 1;
8687 }
8688 else if (PyLong_Check(iobj)) {
8689 isnumok = 1;
8690 temp = formatlong(iobj, flags, prec, c);
8691 Py_DECREF(iobj);
8692 if (!temp)
8693 goto onError;
8694 pbuf = PyUnicode_AS_UNICODE(temp);
8695 len = PyUnicode_GET_SIZE(temp);
8696 sign = 1;
8697 }
8698 else {
8699 Py_DECREF(iobj);
8700 }
8701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 }
Facundo Batistac11cecf2008-02-24 03:17:21 +00008703 if (!isnumok) {
8704 PyErr_Format(PyExc_TypeError,
8705 "%%%c format: a number is required, "
Martin v. Löwisd918e4e2008-04-07 03:08:28 +00008706 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00008707 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00008708 }
8709 if (flags & F_ZERO)
8710 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 break;
8712
8713 case 'e':
8714 case 'E':
8715 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008716 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 case 'g':
8718 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008719 if (c == 'F')
8720 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008721 pbuf = formatbuf;
8722 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8723 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 if (len < 0)
8725 goto onError;
8726 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008727 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 fill = '0';
8729 break;
8730
8731 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008732 pbuf = formatbuf;
8733 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 if (len < 0)
8735 goto onError;
8736 break;
8737
8738 default:
8739 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008740 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008741 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008742 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008743 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008744 (Py_ssize_t)(fmt - 1 -
8745 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 goto onError;
8747 }
8748 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008749 if (*pbuf == '-' || *pbuf == '+') {
8750 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 len--;
8752 }
8753 else if (flags & F_SIGN)
8754 sign = '+';
8755 else if (flags & F_BLANK)
8756 sign = ' ';
8757 else
8758 sign = 0;
8759 }
8760 if (width < len)
8761 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008762 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 reslen -= rescnt;
8764 rescnt = width + fmtcnt + 100;
8765 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008766 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008767 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008768 PyErr_NoMemory();
8769 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008770 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008771 if (_PyUnicode_Resize(&result, reslen) < 0) {
8772 Py_XDECREF(temp);
8773 goto onError;
8774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 res = PyUnicode_AS_UNICODE(result)
8776 + reslen - rescnt;
8777 }
8778 if (sign) {
8779 if (fill != ' ')
8780 *res++ = sign;
8781 rescnt--;
8782 if (width > len)
8783 width--;
8784 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008785 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8786 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008787 assert(pbuf[1] == c);
8788 if (fill != ' ') {
8789 *res++ = *pbuf++;
8790 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008791 }
Tim Petersfff53252001-04-12 18:38:48 +00008792 rescnt -= 2;
8793 width -= 2;
8794 if (width < 0)
8795 width = 0;
8796 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 if (width > len && !(flags & F_LJUST)) {
8799 do {
8800 --rescnt;
8801 *res++ = fill;
8802 } while (--width > len);
8803 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008804 if (fill == ' ') {
8805 if (sign)
8806 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008807 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008808 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008809 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008810 *res++ = *pbuf++;
8811 *res++ = *pbuf++;
8812 }
8813 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008814 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008815 res += len;
8816 rescnt -= len;
8817 while (--width >= len) {
8818 --rescnt;
8819 *res++ = ' ';
8820 }
8821 if (dict && (argidx < arglen) && c != '%') {
8822 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008823 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008824 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 goto onError;
8826 }
8827 Py_XDECREF(temp);
8828 } /* '%' */
8829 } /* until end */
8830 if (argidx < arglen && !dict) {
8831 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008832 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833 goto onError;
8834 }
8835
Thomas Woutersa96affe2006-03-12 00:29:36 +00008836 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8837 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838 if (args_owned) {
8839 Py_DECREF(args);
8840 }
8841 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 return (PyObject *)result;
8843
8844 onError:
8845 Py_XDECREF(result);
8846 Py_DECREF(uformat);
8847 if (args_owned) {
8848 Py_DECREF(args);
8849 }
8850 return NULL;
8851}
8852
/* Buffer-interface slot table for the unicode type; it is installed as
   tp_as_buffer in PyUnicode_Type.  Each entry casts one of the
   unicode_buffer_* helpers to the generic slot function type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8859
Jeremy Hylton938ace62002-07-17 16:30:39 +00008860static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008861unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8862
Tim Peters6d6c1a32001-08-02 04:15:00 +00008863static PyObject *
8864unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8865{
8866 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008867 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008868 char *encoding = NULL;
8869 char *errors = NULL;
8870
Guido van Rossume023fe02001-08-30 03:12:59 +00008871 if (type != &PyUnicode_Type)
8872 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008873 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8874 kwlist, &x, &encoding, &errors))
8875 return NULL;
8876 if (x == NULL)
8877 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008878 if (encoding == NULL && errors == NULL)
8879 return PyObject_Unicode(x);
8880 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008881 return PyUnicode_FromEncodedObject(x, encoding, errors);
8882}
8883
Guido van Rossume023fe02001-08-30 03:12:59 +00008884static PyObject *
8885unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8886{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008887 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008888 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008889
8890 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8891 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8892 if (tmp == NULL)
8893 return NULL;
8894 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008895 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008896 if (pnew == NULL) {
8897 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008898 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008899 }
Neal Norwitz419fd492008-03-17 20:22:43 +00008900 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008901 if (pnew->str == NULL) {
8902 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008903 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008904 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008905 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008906 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008907 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8908 pnew->length = n;
8909 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008910 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008911 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008912}
8913
/* Docstring for the unicode type; referenced as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008920
/* Type object for the built-in "unicode" type.

   The initializer is positional, so the order of the entries below must
   match the slot order of PyTypeObject.  The type names PyBaseString_Type
   as its base, permits subclassing (Py_TPFLAGS_BASETYPE), and creates
   instances through unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8964
8965/* Initialize the Unicode implementation */
8966
Thomas Wouters78890102000-07-22 19:25:51 +00008967void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008969 int i;
8970
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008971 /* XXX - move this array to unicodectype.c ? */
8972 Py_UNICODE linebreak[] = {
8973 0x000A, /* LINE FEED */
8974 0x000D, /* CARRIAGE RETURN */
8975 0x001C, /* FILE SEPARATOR */
8976 0x001D, /* GROUP SEPARATOR */
8977 0x001E, /* RECORD SEPARATOR */
8978 0x0085, /* NEXT LINE */
8979 0x2028, /* LINE SEPARATOR */
8980 0x2029, /* PARAGRAPH SEPARATOR */
8981 };
8982
Fred Drakee4315f52000-05-09 19:53:39 +00008983 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008984 free_list = NULL;
8985 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008987 if (!unicode_empty)
8988 return;
8989
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008990 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008991 for (i = 0; i < 256; i++)
8992 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008993 if (PyType_Ready(&PyUnicode_Type) < 0)
8994 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008995
8996 /* initialize the linebreak bloom filter */
8997 bloom_linebreak = make_bloom_mask(
8998 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8999 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009000
9001 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002}
9003
9004/* Finalize the Unicode implementation */
9005
Christian Heimes3b718a72008-02-14 12:47:33 +00009006int
9007PyUnicode_ClearFreeList(void)
9008{
9009 int freelist_size = numfree;
9010 PyUnicodeObject *u;
9011
9012 for (u = free_list; u != NULL;) {
9013 PyUnicodeObject *v = u;
9014 u = *(PyUnicodeObject **)u;
9015 if (v->str)
Neal Norwitz419fd492008-03-17 20:22:43 +00009016 PyObject_DEL(v->str);
Christian Heimes3b718a72008-02-14 12:47:33 +00009017 Py_XDECREF(v->defenc);
9018 PyObject_Del(v);
9019 numfree--;
9020 }
9021 free_list = NULL;
9022 assert(numfree == 0);
9023 return freelist_size;
9024}
9025
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026void
Thomas Wouters78890102000-07-22 19:25:51 +00009027_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009029 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009031 Py_XDECREF(unicode_empty);
9032 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009034 for (i = 0; i < 256; i++) {
9035 if (unicode_latin1[i]) {
9036 Py_DECREF(unicode_latin1[i]);
9037 unicode_latin1[i] = NULL;
9038 }
9039 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009040 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009042
Anthony Baxterac6bd462006-04-13 02:06:09 +00009043#ifdef __cplusplus
9044}
9045#endif
9046
9047
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009048/*
9049Local variables:
9050c-basic-offset: 4
9051indent-tabs-mode: nil
9052End:
9053*/