blob: ce28692d7b37fc3748b87f2c9541a6ecf1e8b6e4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of released Unicode objects that are parked
   on the free list for later reuse. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   An object whose length is below this limit keeps its character
   buffer while it sits on the free list, which saves a malloc() when
   the object is reused for another small string.  Objects at or above
   the limit have their buffer released.

   In the worst case the free list therefore pins
   MAX_UNICODE_FREELIST_SIZE objects, each with a small character
   buffer (plus malloc() overhead), as unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Thomas Wouters477c8d52006-05-27 19:21:47 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type used for bloom masks; only the low 32 bits are used. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long so that selecting bit 31 is
   well defined (1 << 31 overflows a 32-bit signed int), and mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter via the bloom mask, followed by the exact check. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
166 return 0;
167}
168
/* True when chr is a member of set: the bloom mask is consulted first and
   the exact scan only runs when the mask does not rule chr out.  The whole
   expansion is parenthesized so the macro can be negated or combined with
   other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000187
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without making any assumptions about what
200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize(), casting the variable's address to
   PyObject ** first. */
#define _PyUnicode_Resize(unicodevar, length) PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
/* Return the current contents of unicode_default_encoding.  The pointer
   refers to that file-level buffer itself, not to a copy. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional)
       NOTE(review): '/' (index 47) is marked 1 here although RFC 2152
       appears to list it among the directly encodable characters --
       confirm whether that is intentional. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True if c must be base64-encoded: anything outside 1..127, table class 1,
   and classes 2 / 3 when encodeWS / encodeO are non-zero. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a base64 digit.  Explicit ASCII ranges are used instead of
   isalnum(): c may be a code unit outside the unsigned char range, for
   which isalnum() is undefined, and isalnum() depends on the locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')
/* Six-bit value of base64 digit c; c must satisfy B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for as long as ch holds at least six
   unwritten bits; bits is reduced by six per digit. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit code units to out for as long as ch holds at least sixteen
   undecoded bits.  Relies on `errmsg` and the label `utf7Error` being
   available at the point of use. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
/* The UTF-7 helper macros are not needed beyond this point. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: length 1 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: 0, not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: length 2 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: length 3 */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: 0 */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
/* Decode `size` bytes of UTF-8 data at `s`.  Forwards to
   PyUnicode_DecodeUTF8Stateful() with a NULL `consumed` argument. */
PyObject *PyUnicode_DecodeUTF8(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
2011/* Return a Unicode-Escape string version of the Unicode object.
2012
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2014 appropriate.
2015
2016*/
2017
Thomas Wouters477c8d52006-05-27 19:21:47 +00002018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019 Py_ssize_t size,
2020 Py_UNICODE ch)
2021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
Thomas Wouters89f507f2006-12-13 04:49:30 +00002043 /* XXX(nnorwitz): rather than over-allocating, it would be
2044 better to choose a different scheme. Perhaps scan the
2045 first N-chars of the string and allocate based on that size.
2046 */
2047 /* Initial allocation is based on the longest-possible unichr
2048 escape.
2049
2050 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2051 unichr, so in this case it's the longest unichr escape. In
2052 narrow (UTF-16) builds this is five chars per source unichr
2053 since there are two unichrs in the surrogate pair, so in narrow
2054 (UTF-16) builds it's not the longest unichr escape.
2055
2056 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2057 so in the narrow (UTF-16) build case it's the longest unichr
2058 escape.
2059 */
2060
2061 repr = PyString_FromStringAndSize(NULL,
2062 2
2063#ifdef Py_UNICODE_WIDE
2064 + 10*size
2065#else
2066 + 6*size
2067#endif
2068 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 if (repr == NULL)
2070 return NULL;
2071
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002072 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073
2074 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002076 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 !findchar(s, size, '"')) ? '"' : '\'';
2078 }
2079 while (size-- > 0) {
2080 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002081
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002082 /* Escape quotes and backslashes */
2083 if ((quotes &&
2084 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 *p++ = '\\';
2086 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002087 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002088 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002089
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002090#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002091 /* Map 21-bit characters to '\U00xxxxxx' */
2092 else if (ch >= 0x10000) {
2093 *p++ = '\\';
2094 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2098 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2099 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2100 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2101 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002102 *p++ = hexdigit[ch & 0x0000000F];
2103 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002104 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002105#else
2106 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002107 else if (ch >= 0xD800 && ch < 0xDC00) {
2108 Py_UNICODE ch2;
2109 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002110
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002111 ch2 = *s++;
2112 size--;
2113 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2114 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2115 *p++ = '\\';
2116 *p++ = 'U';
2117 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2120 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2121 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2122 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2123 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2124 *p++ = hexdigit[ucs & 0x0000000F];
2125 continue;
2126 }
2127 /* Fall through: isolated surrogates are copied as-is */
2128 s--;
2129 size++;
2130 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002131#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002132
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002134 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 *p++ = '\\';
2136 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002137 *p++ = hexdigit[(ch >> 12) & 0x000F];
2138 *p++ = hexdigit[(ch >> 8) & 0x000F];
2139 *p++ = hexdigit[(ch >> 4) & 0x000F];
2140 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002142
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002143 /* Map special whitespace to '\t', \n', '\r' */
2144 else if (ch == '\t') {
2145 *p++ = '\\';
2146 *p++ = 't';
2147 }
2148 else if (ch == '\n') {
2149 *p++ = '\\';
2150 *p++ = 'n';
2151 }
2152 else if (ch == '\r') {
2153 *p++ = '\\';
2154 *p++ = 'r';
2155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002156
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002157 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002158 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002160 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002161 *p++ = hexdigit[(ch >> 4) & 0x000F];
2162 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002163 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002164
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 /* Copy everything else as-is */
2166 else
2167 *p++ = (char) ch;
2168 }
2169 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002170 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171
2172 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002173 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 return repr;
2175}
2176
2177PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002178 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179{
2180 return unicodeescape_string(s, size, 0);
2181}
2182
2183PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2184{
2185 if (!PyUnicode_Check(unicode)) {
2186 PyErr_BadArgument();
2187 return NULL;
2188 }
2189 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2190 PyUnicode_GET_SIZE(unicode));
2191}
2192
2193/* --- Raw Unicode Escape Codec ------------------------------------------- */
2194
2195PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002196 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 const char *errors)
2198{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002199 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002200 Py_ssize_t startinpos;
2201 Py_ssize_t endinpos;
2202 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002204 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 const char *end;
2206 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 PyObject *errorHandler = NULL;
2208 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002209
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 /* Escaped strings will always be longer than the resulting
2211 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002212 length after conversion to the true value. (But decoding error
2213 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 v = _PyUnicode_New(size);
2215 if (v == NULL)
2216 goto onError;
2217 if (size == 0)
2218 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 end = s + size;
2221 while (s < end) {
2222 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002223 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002225 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
2227 /* Non-escape characters are interpreted as Unicode ordinals */
2228 if (*s != '\\') {
2229 *p++ = (unsigned char)*s++;
2230 continue;
2231 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002232 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233
2234 /* \u-escapes are only interpreted iff the number of leading
2235 backslashes if odd */
2236 bs = s;
2237 for (;s < end;) {
2238 if (*s != '\\')
2239 break;
2240 *p++ = (unsigned char)*s++;
2241 }
2242 if (((s - bs) & 1) == 0 ||
2243 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002244 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 continue;
2246 }
2247 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002248 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 s++;
2250
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002251 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002253 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002254 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 endinpos = s-starts;
2257 if (unicode_decode_call_errorhandler(
2258 errors, &errorHandler,
2259 "rawunicodeescape", "truncated \\uXXXX",
2260 starts, size, &startinpos, &endinpos, &exc, &s,
2261 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002263 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264 }
2265 x = (x<<4) & ~0xF;
2266 if (c >= '0' && c <= '9')
2267 x += c - '0';
2268 else if (c >= 'a' && c <= 'f')
2269 x += 10 + c - 'a';
2270 else
2271 x += 10 + c - 'A';
2272 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002273#ifndef Py_UNICODE_WIDE
2274 if (x > 0x10000) {
2275 if (unicode_decode_call_errorhandler(
2276 errors, &errorHandler,
2277 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2278 starts, size, &startinpos, &endinpos, &exc, &s,
2279 (PyObject **)&v, &outpos, &p))
2280 goto onError;
2281 }
2282#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002283 *p++ = x;
2284 nextByte:
2285 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002288 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002289 Py_XDECREF(errorHandler);
2290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002292
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 onError:
2294 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 Py_XDECREF(errorHandler);
2296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 return NULL;
2298}
2299
2300PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002301 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302{
2303 PyObject *repr;
2304 char *p;
2305 char *q;
2306
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002307 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309#ifdef Py_UNICODE_WIDE
2310 repr = PyString_FromStringAndSize(NULL, 10 * size);
2311#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002313#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if (repr == NULL)
2315 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002316 if (size == 0)
2317 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318
2319 p = q = PyString_AS_STRING(repr);
2320 while (size-- > 0) {
2321 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002322#ifdef Py_UNICODE_WIDE
2323 /* Map 32-bit characters to '\Uxxxxxxxx' */
2324 if (ch >= 0x10000) {
2325 *p++ = '\\';
2326 *p++ = 'U';
2327 *p++ = hexdigit[(ch >> 28) & 0xf];
2328 *p++ = hexdigit[(ch >> 24) & 0xf];
2329 *p++ = hexdigit[(ch >> 20) & 0xf];
2330 *p++ = hexdigit[(ch >> 16) & 0xf];
2331 *p++ = hexdigit[(ch >> 12) & 0xf];
2332 *p++ = hexdigit[(ch >> 8) & 0xf];
2333 *p++ = hexdigit[(ch >> 4) & 0xf];
2334 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002335 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002336 else
2337#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338 /* Map 16-bit characters to '\uxxxx' */
2339 if (ch >= 256) {
2340 *p++ = '\\';
2341 *p++ = 'u';
2342 *p++ = hexdigit[(ch >> 12) & 0xf];
2343 *p++ = hexdigit[(ch >> 8) & 0xf];
2344 *p++ = hexdigit[(ch >> 4) & 0xf];
2345 *p++ = hexdigit[ch & 15];
2346 }
2347 /* Copy everything else as-is */
2348 else
2349 *p++ = (char) ch;
2350 }
2351 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002352 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 return repr;
2354}
2355
2356PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2357{
2358 if (!PyUnicode_Check(unicode)) {
2359 PyErr_BadArgument();
2360 return NULL;
2361 }
2362 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2363 PyUnicode_GET_SIZE(unicode));
2364}
2365
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002366/* --- Unicode Internal Codec ------------------------------------------- */
2367
2368PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002369 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002370 const char *errors)
2371{
2372 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002373 Py_ssize_t startinpos;
2374 Py_ssize_t endinpos;
2375 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002376 PyUnicodeObject *v;
2377 Py_UNICODE *p;
2378 const char *end;
2379 const char *reason;
2380 PyObject *errorHandler = NULL;
2381 PyObject *exc = NULL;
2382
Neal Norwitzd43069c2006-01-08 01:12:10 +00002383#ifdef Py_UNICODE_WIDE
2384 Py_UNICODE unimax = PyUnicode_GetMax();
2385#endif
2386
Thomas Wouters89f507f2006-12-13 04:49:30 +00002387 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002388 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2389 if (v == NULL)
2390 goto onError;
2391 if (PyUnicode_GetSize((PyObject *)v) == 0)
2392 return (PyObject *)v;
2393 p = PyUnicode_AS_UNICODE(v);
2394 end = s + size;
2395
2396 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002397 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002398 /* We have to sanity check the raw data, otherwise doom looms for
2399 some malformed UCS-4 data. */
2400 if (
2401 #ifdef Py_UNICODE_WIDE
2402 *p > unimax || *p < 0 ||
2403 #endif
2404 end-s < Py_UNICODE_SIZE
2405 )
2406 {
2407 startinpos = s - starts;
2408 if (end-s < Py_UNICODE_SIZE) {
2409 endinpos = end-starts;
2410 reason = "truncated input";
2411 }
2412 else {
2413 endinpos = s - starts + Py_UNICODE_SIZE;
2414 reason = "illegal code point (> 0x10FFFF)";
2415 }
2416 outpos = p - PyUnicode_AS_UNICODE(v);
2417 if (unicode_decode_call_errorhandler(
2418 errors, &errorHandler,
2419 "unicode_internal", reason,
2420 starts, size, &startinpos, &endinpos, &exc, &s,
2421 (PyObject **)&v, &outpos, &p)) {
2422 goto onError;
2423 }
2424 }
2425 else {
2426 p++;
2427 s += Py_UNICODE_SIZE;
2428 }
2429 }
2430
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002431 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002432 goto onError;
2433 Py_XDECREF(errorHandler);
2434 Py_XDECREF(exc);
2435 return (PyObject *)v;
2436
2437 onError:
2438 Py_XDECREF(v);
2439 Py_XDECREF(errorHandler);
2440 Py_XDECREF(exc);
2441 return NULL;
2442}
2443
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444/* --- Latin-1 Codec ------------------------------------------------------ */
2445
2446PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002447 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448 const char *errors)
2449{
2450 PyUnicodeObject *v;
2451 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002452
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002454 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002455 Py_UNICODE r = *(unsigned char*)s;
2456 return PyUnicode_FromUnicode(&r, 1);
2457 }
2458
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 v = _PyUnicode_New(size);
2460 if (v == NULL)
2461 goto onError;
2462 if (size == 0)
2463 return (PyObject *)v;
2464 p = PyUnicode_AS_UNICODE(v);
2465 while (size-- > 0)
2466 *p++ = (unsigned char)*s++;
2467 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002468
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 onError:
2470 Py_XDECREF(v);
2471 return NULL;
2472}
2473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002474/* create or adjust a UnicodeEncodeError */
2475static void make_encode_exception(PyObject **exceptionObject,
2476 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002477 const Py_UNICODE *unicode, Py_ssize_t size,
2478 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 if (*exceptionObject == NULL) {
2482 *exceptionObject = PyUnicodeEncodeError_Create(
2483 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 }
2485 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2487 goto onError;
2488 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2489 goto onError;
2490 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2491 goto onError;
2492 return;
2493 onError:
2494 Py_DECREF(*exceptionObject);
2495 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 }
2497}
2498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499/* raises a UnicodeEncodeError */
2500static void raise_encode_exception(PyObject **exceptionObject,
2501 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002502 const Py_UNICODE *unicode, Py_ssize_t size,
2503 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504 const char *reason)
2505{
2506 make_encode_exception(exceptionObject,
2507 encoding, unicode, size, startpos, endpos, reason);
2508 if (*exceptionObject != NULL)
2509 PyCodec_StrictErrors(*exceptionObject);
2510}
2511
2512/* error handling callback helper:
2513 build arguments, call the callback and check the arguments,
2514 put the result into newpos and return the replacement string, which
2515 has to be freed by the caller */
2516static PyObject *unicode_encode_call_errorhandler(const char *errors,
2517 PyObject **errorHandler,
2518 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002519 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2520 Py_ssize_t startpos, Py_ssize_t endpos,
2521 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002523 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524
2525 PyObject *restuple;
2526 PyObject *resunicode;
2527
2528 if (*errorHandler == NULL) {
2529 *errorHandler = PyCodec_LookupError(errors);
2530 if (*errorHandler == NULL)
2531 return NULL;
2532 }
2533
2534 make_encode_exception(exceptionObject,
2535 encoding, unicode, size, startpos, endpos, reason);
2536 if (*exceptionObject == NULL)
2537 return NULL;
2538
2539 restuple = PyObject_CallFunctionObjArgs(
2540 *errorHandler, *exceptionObject, NULL);
2541 if (restuple == NULL)
2542 return NULL;
2543 if (!PyTuple_Check(restuple)) {
2544 PyErr_Format(PyExc_TypeError, &argparse[4]);
2545 Py_DECREF(restuple);
2546 return NULL;
2547 }
2548 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2549 &resunicode, newpos)) {
2550 Py_DECREF(restuple);
2551 return NULL;
2552 }
2553 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002554 *newpos = size+*newpos;
2555 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002556 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002557 Py_DECREF(restuple);
2558 return NULL;
2559 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 Py_INCREF(resunicode);
2561 Py_DECREF(restuple);
2562 return resunicode;
2563}
2564
2565static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002566 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002567 const char *errors,
2568 int limit)
2569{
2570 /* output object */
2571 PyObject *res;
2572 /* pointers to the beginning and end+1 of input */
2573 const Py_UNICODE *startp = p;
2574 const Py_UNICODE *endp = p + size;
2575 /* pointer to the beginning of the unencodable characters */
2576 /* const Py_UNICODE *badp = NULL; */
2577 /* pointer into the output */
2578 char *str;
2579 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002580 Py_ssize_t respos = 0;
2581 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002582 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2583 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 PyObject *errorHandler = NULL;
2585 PyObject *exc = NULL;
2586 /* the following variable is used for caching string comparisons
2587 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2588 int known_errorHandler = -1;
2589
2590 /* allocate enough for a simple encoding without
2591 replacements, if we need more, we'll resize */
2592 res = PyString_FromStringAndSize(NULL, size);
2593 if (res == NULL)
2594 goto onError;
2595 if (size == 0)
2596 return res;
2597 str = PyString_AS_STRING(res);
2598 ressize = size;
2599
2600 while (p<endp) {
2601 Py_UNICODE c = *p;
2602
2603 /* can we encode this? */
2604 if (c<limit) {
2605 /* no overflow check, because we know that the space is enough */
2606 *str++ = (char)c;
2607 ++p;
2608 }
2609 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002610 Py_ssize_t unicodepos = p-startp;
2611 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002613 Py_ssize_t repsize;
2614 Py_ssize_t newpos;
2615 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 Py_UNICODE *uni2;
2617 /* startpos for collecting unencodable chars */
2618 const Py_UNICODE *collstart = p;
2619 const Py_UNICODE *collend = p;
2620 /* find all unecodable characters */
2621 while ((collend < endp) && ((*collend)>=limit))
2622 ++collend;
2623 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2624 if (known_errorHandler==-1) {
2625 if ((errors==NULL) || (!strcmp(errors, "strict")))
2626 known_errorHandler = 1;
2627 else if (!strcmp(errors, "replace"))
2628 known_errorHandler = 2;
2629 else if (!strcmp(errors, "ignore"))
2630 known_errorHandler = 3;
2631 else if (!strcmp(errors, "xmlcharrefreplace"))
2632 known_errorHandler = 4;
2633 else
2634 known_errorHandler = 0;
2635 }
2636 switch (known_errorHandler) {
2637 case 1: /* strict */
2638 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2639 goto onError;
2640 case 2: /* replace */
2641 while (collstart++<collend)
2642 *str++ = '?'; /* fall through */
2643 case 3: /* ignore */
2644 p = collend;
2645 break;
2646 case 4: /* xmlcharrefreplace */
2647 respos = str-PyString_AS_STRING(res);
2648 /* determine replacement size (temporarily (mis)uses p) */
2649 for (p = collstart, repsize = 0; p < collend; ++p) {
2650 if (*p<10)
2651 repsize += 2+1+1;
2652 else if (*p<100)
2653 repsize += 2+2+1;
2654 else if (*p<1000)
2655 repsize += 2+3+1;
2656 else if (*p<10000)
2657 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002658#ifndef Py_UNICODE_WIDE
2659 else
2660 repsize += 2+5+1;
2661#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002662 else if (*p<100000)
2663 repsize += 2+5+1;
2664 else if (*p<1000000)
2665 repsize += 2+6+1;
2666 else
2667 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002668#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002669 }
2670 requiredsize = respos+repsize+(endp-collend);
2671 if (requiredsize > ressize) {
2672 if (requiredsize<2*ressize)
2673 requiredsize = 2*ressize;
2674 if (_PyString_Resize(&res, requiredsize))
2675 goto onError;
2676 str = PyString_AS_STRING(res) + respos;
2677 ressize = requiredsize;
2678 }
2679 /* generate replacement (temporarily (mis)uses p) */
2680 for (p = collstart; p < collend; ++p) {
2681 str += sprintf(str, "&#%d;", (int)*p);
2682 }
2683 p = collend;
2684 break;
2685 default:
2686 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2687 encoding, reason, startp, size, &exc,
2688 collstart-startp, collend-startp, &newpos);
2689 if (repunicode == NULL)
2690 goto onError;
2691 /* need more space? (at least enough for what we
2692 have+the replacement+the rest of the string, so
2693 we won't have to check space for encodable characters) */
2694 respos = str-PyString_AS_STRING(res);
2695 repsize = PyUnicode_GET_SIZE(repunicode);
2696 requiredsize = respos+repsize+(endp-collend);
2697 if (requiredsize > ressize) {
2698 if (requiredsize<2*ressize)
2699 requiredsize = 2*ressize;
2700 if (_PyString_Resize(&res, requiredsize)) {
2701 Py_DECREF(repunicode);
2702 goto onError;
2703 }
2704 str = PyString_AS_STRING(res) + respos;
2705 ressize = requiredsize;
2706 }
2707 /* check if there is anything unencodable in the replacement
2708 and copy it to the output */
2709 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2710 c = *uni2;
2711 if (c >= limit) {
2712 raise_encode_exception(&exc, encoding, startp, size,
2713 unicodepos, unicodepos+1, reason);
2714 Py_DECREF(repunicode);
2715 goto onError;
2716 }
2717 *str = (char)c;
2718 }
2719 p = startp + newpos;
2720 Py_DECREF(repunicode);
2721 }
2722 }
2723 }
2724 /* Resize if we allocated to much */
2725 respos = str-PyString_AS_STRING(res);
2726 if (respos<ressize)
2727 /* If this falls res will be NULL */
2728 _PyString_Resize(&res, respos);
2729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
2731 return res;
2732
2733 onError:
2734 Py_XDECREF(res);
2735 Py_XDECREF(errorHandler);
2736 Py_XDECREF(exc);
2737 return NULL;
2738}
2739
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002741 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 const char *errors)
2743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745}
2746
2747PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2748{
2749 if (!PyUnicode_Check(unicode)) {
2750 PyErr_BadArgument();
2751 return NULL;
2752 }
2753 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2754 PyUnicode_GET_SIZE(unicode),
2755 NULL);
2756}
2757
2758/* --- 7-bit ASCII Codec -------------------------------------------------- */
2759
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002761 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 const char *errors)
2763{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 PyUnicodeObject *v;
2766 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002767 Py_ssize_t startinpos;
2768 Py_ssize_t endinpos;
2769 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 const char *e;
2771 PyObject *errorHandler = NULL;
2772 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002773
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002775 if (size == 1 && *(unsigned char*)s < 128) {
2776 Py_UNICODE r = *(unsigned char*)s;
2777 return PyUnicode_FromUnicode(&r, 1);
2778 }
Tim Petersced69f82003-09-16 20:30:58 +00002779
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 v = _PyUnicode_New(size);
2781 if (v == NULL)
2782 goto onError;
2783 if (size == 0)
2784 return (PyObject *)v;
2785 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 e = s + size;
2787 while (s < e) {
2788 register unsigned char c = (unsigned char)*s;
2789 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 ++s;
2792 }
2793 else {
2794 startinpos = s-starts;
2795 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002796 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002797 if (unicode_decode_call_errorhandler(
2798 errors, &errorHandler,
2799 "ascii", "ordinal not in range(128)",
2800 starts, size, &startinpos, &endinpos, &exc, &s,
2801 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002805 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002806 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002807 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002811
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 onError:
2813 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 Py_XDECREF(errorHandler);
2815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 return NULL;
2817}
2818
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002820 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 const char *errors)
2822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824}
2825
2826PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2827{
2828 if (!PyUnicode_Check(unicode)) {
2829 PyErr_BadArgument();
2830 return NULL;
2831 }
2832 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2833 PyUnicode_GET_SIZE(unicode),
2834 NULL);
2835}
2836
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002837#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002838
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002839/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002840
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002841#if SIZEOF_INT < SIZEOF_SSIZE_T
2842#define NEED_RETRY
2843#endif
2844
2845/* XXX This code is limited to "true" double-byte encodings, as
2846 a) it assumes an incomplete character consists of a single byte, and
2847 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2848 encodings, see IsDBCSLeadByteEx documentation. */
2849
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte,
   0 otherwise.  Besides IsDBCSLeadByte() on the byte itself, the character
   preceding it (found with CharPrev()) is inspected, presumably to tell a
   real lead byte from a trail byte with a lead-byte value -- confirm
   against the Win32 documentation. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2860
2861/*
2862 * Decode MBCS string into unicode object. If 'final' is set, converts
2863 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2864 */
2865static int decode_mbcs(PyUnicodeObject **v,
2866 const char *s, /* MBCS string */
2867 int size, /* sizeof MBCS string */
2868 int final)
2869{
2870 Py_UNICODE *p;
2871 Py_ssize_t n = 0;
2872 int usize = 0;
2873
2874 assert(size >= 0);
2875
2876 /* Skip trailing lead-byte unless 'final' is set */
2877 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2878 --size;
2879
2880 /* First get the size of the result */
2881 if (size > 0) {
2882 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2883 if (usize == 0) {
2884 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2885 return -1;
2886 }
2887 }
2888
2889 if (*v == NULL) {
2890 /* Create unicode object */
2891 *v = _PyUnicode_New(usize);
2892 if (*v == NULL)
2893 return -1;
2894 }
2895 else {
2896 /* Extend unicode object */
2897 n = PyUnicode_GET_SIZE(*v);
2898 if (_PyUnicode_Resize(v, n + usize) < 0)
2899 return -1;
2900 }
2901
2902 /* Do the conversion */
2903 if (size > 0) {
2904 p = PyUnicode_AS_UNICODE(*v) + n;
2905 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2906 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2907 return -1;
2908 }
2909 }
2910
2911 return size;
2912}
2913
2914PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2915 Py_ssize_t size,
2916 const char *errors,
2917 Py_ssize_t *consumed)
2918{
2919 PyUnicodeObject *v = NULL;
2920 int done;
2921
2922 if (consumed)
2923 *consumed = 0;
2924
2925#ifdef NEED_RETRY
2926 retry:
2927 if (size > INT_MAX)
2928 done = decode_mbcs(&v, s, INT_MAX, 0);
2929 else
2930#endif
2931 done = decode_mbcs(&v, s, (int)size, !consumed);
2932
2933 if (done < 0) {
2934 Py_XDECREF(v);
2935 return NULL;
2936 }
2937
2938 if (consumed)
2939 *consumed += done;
2940
2941#ifdef NEED_RETRY
2942 if (size > INT_MAX) {
2943 s += done;
2944 size -= done;
2945 goto retry;
2946 }
2947#endif
2948
2949 return (PyObject *)v;
2950}
2951
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002952PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002954 const char *errors)
2955{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002956 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2957}
2958
2959/*
2960 * Convert unicode into string object (MBCS).
2961 * Returns 0 if succeed, -1 otherwise.
2962 */
2963static int encode_mbcs(PyObject **repr,
2964 const Py_UNICODE *p, /* unicode */
2965 int size) /* size of unicode */
2966{
2967 int mbcssize = 0;
2968 Py_ssize_t n = 0;
2969
2970 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002971
2972 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002973 if (size > 0) {
2974 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2975 if (mbcssize == 0) {
2976 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2977 return -1;
2978 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002979 }
2980
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002981 if (*repr == NULL) {
2982 /* Create string object */
2983 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2984 if (*repr == NULL)
2985 return -1;
2986 }
2987 else {
2988 /* Extend string object */
2989 n = PyString_Size(*repr);
2990 if (_PyString_Resize(repr, n + mbcssize) < 0)
2991 return -1;
2992 }
2993
2994 /* Do the conversion */
2995 if (size > 0) {
2996 char *s = PyString_AS_STRING(*repr) + n;
2997 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2998 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2999 return -1;
3000 }
3001 }
3002
3003 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003004}
3005
3006PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003007 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003008 const char *errors)
3009{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003010 PyObject *repr = NULL;
3011 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003012
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003013#ifdef NEED_RETRY
3014 retry:
3015 if (size > INT_MAX)
3016 ret = encode_mbcs(&repr, p, INT_MAX);
3017 else
3018#endif
3019 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003020
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003021 if (ret < 0) {
3022 Py_XDECREF(repr);
3023 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003024 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003025
3026#ifdef NEED_RETRY
3027 if (size > INT_MAX) {
3028 p += INT_MAX;
3029 size -= INT_MAX;
3030 goto retry;
3031 }
3032#endif
3033
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003034 return repr;
3035}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003036
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003037PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3038{
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_BadArgument();
3041 return NULL;
3042 }
3043 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3044 PyUnicode_GET_SIZE(unicode),
3045 NULL);
3046}
3047
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003048#undef NEED_RETRY
3049
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003050#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003051
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052/* --- Character Mapping Codec -------------------------------------------- */
3053
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003055 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 PyObject *mapping,
3057 const char *errors)
3058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t startinpos;
3061 Py_ssize_t endinpos;
3062 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 PyUnicodeObject *v;
3065 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003066 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003067 PyObject *errorHandler = NULL;
3068 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003069 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003070 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003071
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 /* Default to Latin-1 */
3073 if (mapping == NULL)
3074 return PyUnicode_DecodeLatin1(s, size, errors);
3075
3076 v = _PyUnicode_New(size);
3077 if (v == NULL)
3078 goto onError;
3079 if (size == 0)
3080 return (PyObject *)v;
3081 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003083 if (PyUnicode_CheckExact(mapping)) {
3084 mapstring = PyUnicode_AS_UNICODE(mapping);
3085 maplen = PyUnicode_GET_SIZE(mapping);
3086 while (s < e) {
3087 unsigned char ch = *s;
3088 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003090 if (ch < maplen)
3091 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003093 if (x == 0xfffe) {
3094 /* undefined mapping */
3095 outpos = p-PyUnicode_AS_UNICODE(v);
3096 startinpos = s-starts;
3097 endinpos = startinpos+1;
3098 if (unicode_decode_call_errorhandler(
3099 errors, &errorHandler,
3100 "charmap", "character maps to <undefined>",
3101 starts, size, &startinpos, &endinpos, &exc, &s,
3102 (PyObject **)&v, &outpos, &p)) {
3103 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003104 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003105 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003106 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003107 *p++ = x;
3108 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003110 }
3111 else {
3112 while (s < e) {
3113 unsigned char ch = *s;
3114 PyObject *w, *x;
3115
3116 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3117 w = PyInt_FromLong((long)ch);
3118 if (w == NULL)
3119 goto onError;
3120 x = PyObject_GetItem(mapping, w);
3121 Py_DECREF(w);
3122 if (x == NULL) {
3123 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3124 /* No mapping found means: mapping is undefined. */
3125 PyErr_Clear();
3126 x = Py_None;
3127 Py_INCREF(x);
3128 } else
3129 goto onError;
3130 }
3131
3132 /* Apply mapping */
3133 if (PyInt_Check(x)) {
3134 long value = PyInt_AS_LONG(x);
3135 if (value < 0 || value > 65535) {
3136 PyErr_SetString(PyExc_TypeError,
3137 "character mapping must be in range(65536)");
3138 Py_DECREF(x);
3139 goto onError;
3140 }
3141 *p++ = (Py_UNICODE)value;
3142 }
3143 else if (x == Py_None) {
3144 /* undefined mapping */
3145 outpos = p-PyUnicode_AS_UNICODE(v);
3146 startinpos = s-starts;
3147 endinpos = startinpos+1;
3148 if (unicode_decode_call_errorhandler(
3149 errors, &errorHandler,
3150 "charmap", "character maps to <undefined>",
3151 starts, size, &startinpos, &endinpos, &exc, &s,
3152 (PyObject **)&v, &outpos, &p)) {
3153 Py_DECREF(x);
3154 goto onError;
3155 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003156 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003157 continue;
3158 }
3159 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003160 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003161
3162 if (targetsize == 1)
3163 /* 1-1 mapping */
3164 *p++ = *PyUnicode_AS_UNICODE(x);
3165
3166 else if (targetsize > 1) {
3167 /* 1-n mapping */
3168 if (targetsize > extrachars) {
3169 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003170 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3171 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003172 (targetsize << 2);
3173 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003174 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003175 if (_PyUnicode_Resize(&v,
3176 PyUnicode_GET_SIZE(v) + needed) < 0) {
3177 Py_DECREF(x);
3178 goto onError;
3179 }
3180 p = PyUnicode_AS_UNICODE(v) + oldpos;
3181 }
3182 Py_UNICODE_COPY(p,
3183 PyUnicode_AS_UNICODE(x),
3184 targetsize);
3185 p += targetsize;
3186 extrachars -= targetsize;
3187 }
3188 /* 1-0 mapping: skip the character */
3189 }
3190 else {
3191 /* wrong return value */
3192 PyErr_SetString(PyExc_TypeError,
3193 "character mapping must return integer, None or unicode");
3194 Py_DECREF(x);
3195 goto onError;
3196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003198 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 }
3201 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003202 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003204 Py_XDECREF(errorHandler);
3205 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003207
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 Py_XDECREF(errorHandler);
3210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 Py_XDECREF(v);
3212 return NULL;
3213}
3214
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003215/* Charmap encoding: the lookup table */
3216
3217struct encoding_map{
3218 PyObject_HEAD
3219 unsigned char level1[32];
3220 int count2, count3;
3221 unsigned char level23[1];
3222};
3223
3224static PyObject*
3225encoding_map_size(PyObject *obj, PyObject* args)
3226{
3227 struct encoding_map *map = (struct encoding_map*)obj;
3228 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3229 128*map->count3);
3230}
3231
3232static PyMethodDef encoding_map_methods[] = {
3233 {"size", encoding_map_size, METH_NOARGS,
3234 PyDoc_STR("Return the size (in bytes) of this object") },
3235 { 0 }
3236};
3237
3238static void
3239encoding_map_dealloc(PyObject* o)
3240{
3241 PyObject_FREE(o);
3242}
3243
3244static PyTypeObject EncodingMapType = {
3245 PyObject_HEAD_INIT(NULL)
3246 0, /*ob_size*/
3247 "EncodingMap", /*tp_name*/
3248 sizeof(struct encoding_map), /*tp_basicsize*/
3249 0, /*tp_itemsize*/
3250 /* methods */
3251 encoding_map_dealloc, /*tp_dealloc*/
3252 0, /*tp_print*/
3253 0, /*tp_getattr*/
3254 0, /*tp_setattr*/
3255 0, /*tp_compare*/
3256 0, /*tp_repr*/
3257 0, /*tp_as_number*/
3258 0, /*tp_as_sequence*/
3259 0, /*tp_as_mapping*/
3260 0, /*tp_hash*/
3261 0, /*tp_call*/
3262 0, /*tp_str*/
3263 0, /*tp_getattro*/
3264 0, /*tp_setattro*/
3265 0, /*tp_as_buffer*/
3266 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3267 0, /*tp_doc*/
3268 0, /*tp_traverse*/
3269 0, /*tp_clear*/
3270 0, /*tp_richcompare*/
3271 0, /*tp_weaklistoffset*/
3272 0, /*tp_iter*/
3273 0, /*tp_iternext*/
3274 encoding_map_methods, /*tp_methods*/
3275 0, /*tp_members*/
3276 0, /*tp_getset*/
3277 0, /*tp_base*/
3278 0, /*tp_dict*/
3279 0, /*tp_descr_get*/
3280 0, /*tp_descr_set*/
3281 0, /*tp_dictoffset*/
3282 0, /*tp_init*/
3283 0, /*tp_alloc*/
3284 0, /*tp_new*/
3285 0, /*tp_free*/
3286 0, /*tp_is_gc*/
3287};
3288
3289PyObject*
3290PyUnicode_BuildEncodingMap(PyObject* string)
3291{
3292 Py_UNICODE *decode;
3293 PyObject *result;
3294 struct encoding_map *mresult;
3295 int i;
3296 int need_dict = 0;
3297 unsigned char level1[32];
3298 unsigned char level2[512];
3299 unsigned char *mlevel1, *mlevel2, *mlevel3;
3300 int count2 = 0, count3 = 0;
3301
3302 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3303 PyErr_BadArgument();
3304 return NULL;
3305 }
3306 decode = PyUnicode_AS_UNICODE(string);
3307 memset(level1, 0xFF, sizeof level1);
3308 memset(level2, 0xFF, sizeof level2);
3309
3310 /* If there isn't a one-to-one mapping of NULL to \0,
3311 or if there are non-BMP characters, we need to use
3312 a mapping dictionary. */
3313 if (decode[0] != 0)
3314 need_dict = 1;
3315 for (i = 1; i < 256; i++) {
3316 int l1, l2;
3317 if (decode[i] == 0
3318 #ifdef Py_UNICODE_WIDE
3319 || decode[i] > 0xFFFF
3320 #endif
3321 ) {
3322 need_dict = 1;
3323 break;
3324 }
3325 if (decode[i] == 0xFFFE)
3326 /* unmapped character */
3327 continue;
3328 l1 = decode[i] >> 11;
3329 l2 = decode[i] >> 7;
3330 if (level1[l1] == 0xFF)
3331 level1[l1] = count2++;
3332 if (level2[l2] == 0xFF)
3333 level2[l2] = count3++;
3334 }
3335
3336 if (count2 >= 0xFF || count3 >= 0xFF)
3337 need_dict = 1;
3338
3339 if (need_dict) {
3340 PyObject *result = PyDict_New();
3341 PyObject *key, *value;
3342 if (!result)
3343 return NULL;
3344 for (i = 0; i < 256; i++) {
3345 key = value = NULL;
3346 key = PyInt_FromLong(decode[i]);
3347 value = PyInt_FromLong(i);
3348 if (!key || !value)
3349 goto failed1;
3350 if (PyDict_SetItem(result, key, value) == -1)
3351 goto failed1;
3352 Py_DECREF(key);
3353 Py_DECREF(value);
3354 }
3355 return result;
3356 failed1:
3357 Py_XDECREF(key);
3358 Py_XDECREF(value);
3359 Py_DECREF(result);
3360 return NULL;
3361 }
3362
3363 /* Create a three-level trie */
3364 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3365 16*count2 + 128*count3 - 1);
3366 if (!result)
3367 return PyErr_NoMemory();
3368 PyObject_Init(result, &EncodingMapType);
3369 mresult = (struct encoding_map*)result;
3370 mresult->count2 = count2;
3371 mresult->count3 = count3;
3372 mlevel1 = mresult->level1;
3373 mlevel2 = mresult->level23;
3374 mlevel3 = mresult->level23 + 16*count2;
3375 memcpy(mlevel1, level1, 32);
3376 memset(mlevel2, 0xFF, 16*count2);
3377 memset(mlevel3, 0, 128*count3);
3378 count3 = 0;
3379 for (i = 1; i < 256; i++) {
3380 int o1, o2, o3, i2, i3;
3381 if (decode[i] == 0xFFFE)
3382 /* unmapped character */
3383 continue;
3384 o1 = decode[i]>>11;
3385 o2 = (decode[i]>>7) & 0xF;
3386 i2 = 16*mlevel1[o1] + o2;
3387 if (mlevel2[i2] == 0xFF)
3388 mlevel2[i2] = count3++;
3389 o3 = decode[i] & 0x7F;
3390 i3 = 128*mlevel2[i2] + o3;
3391 mlevel3[i3] = i;
3392 }
3393 return result;
3394}
3395
3396static int
3397encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3398{
3399 struct encoding_map *map = (struct encoding_map*)mapping;
3400 int l1 = c>>11;
3401 int l2 = (c>>7) & 0xF;
3402 int l3 = c & 0x7F;
3403 int i;
3404
3405#ifdef Py_UNICODE_WIDE
3406 if (c > 0xFFFF) {
3407 return -1;
3408 }
3409#endif
3410 if (c == 0)
3411 return 0;
3412 /* level 1*/
3413 i = map->level1[l1];
3414 if (i == 0xFF) {
3415 return -1;
3416 }
3417 /* level 2*/
3418 i = map->level23[16*i+l2];
3419 if (i == 0xFF) {
3420 return -1;
3421 }
3422 /* level 3 */
3423 i = map->level23[16*map->count2 + 128*i + l3];
3424 if (i == 0) {
3425 return -1;
3426 }
3427 return i;
3428}
3429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430/* Lookup the character ch in the mapping. If the character
3431 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003432 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 PyObject *w = PyInt_FromLong((long)c);
3436 PyObject *x;
3437
3438 if (w == NULL)
3439 return NULL;
3440 x = PyObject_GetItem(mapping, w);
3441 Py_DECREF(w);
3442 if (x == NULL) {
3443 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3444 /* No mapping found means: mapping is undefined. */
3445 PyErr_Clear();
3446 x = Py_None;
3447 Py_INCREF(x);
3448 return x;
3449 } else
3450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003452 else if (x == Py_None)
3453 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 else if (PyInt_Check(x)) {
3455 long value = PyInt_AS_LONG(x);
3456 if (value < 0 || value > 255) {
3457 PyErr_SetString(PyExc_TypeError,
3458 "character mapping must be in range(256)");
3459 Py_DECREF(x);
3460 return NULL;
3461 }
3462 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 else if (PyString_Check(x))
3465 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 /* wrong return value */
3468 PyErr_SetString(PyExc_TypeError,
3469 "character mapping must return integer, None or str");
3470 Py_DECREF(x);
3471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 }
3473}
3474
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003475static int
3476charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3477{
3478 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3479 /* exponentially overallocate to minimize reallocations */
3480 if (requiredsize < 2*outsize)
3481 requiredsize = 2*outsize;
3482 if (_PyString_Resize(outobj, requiredsize)) {
3483 return 0;
3484 }
3485 return 1;
3486}
3487
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491/* lookup the character, put the result in the output string and adjust
3492 various state variables. Reallocate the output string if not enough
3493 space is available. Return a new reference to the object that
3494 was put in the output buffer, or Py_None, if the mapping was undefined
3495 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003496 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003498charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003499 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003501 PyObject *rep;
3502 char *outstart;
3503 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003505 if (mapping->ob_type == &EncodingMapType) {
3506 int res = encoding_map_lookup(c, mapping);
3507 Py_ssize_t requiredsize = *outpos+1;
3508 if (res == -1)
3509 return enc_FAILED;
3510 if (outsize<requiredsize)
3511 if (!charmapencode_resize(outobj, outpos, requiredsize))
3512 return enc_EXCEPTION;
3513 outstart = PyString_AS_STRING(*outobj);
3514 outstart[(*outpos)++] = (char)res;
3515 return enc_SUCCESS;
3516 }
3517
3518 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003520 return enc_EXCEPTION;
3521 else if (rep==Py_None) {
3522 Py_DECREF(rep);
3523 return enc_FAILED;
3524 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003526 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003527 if (outsize<requiredsize)
3528 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003530 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003532 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3534 }
3535 else {
3536 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003537 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3538 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003539 if (outsize<requiredsize)
3540 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003542 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003544 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 memcpy(outstart + *outpos, repchars, repsize);
3546 *outpos += repsize;
3547 }
3548 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003549 Py_DECREF(rep);
3550 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551}
3552
3553/* handle an error in PyUnicode_EncodeCharmap
3554 Return 0 on success, -1 on error */
3555static
3556int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003557 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003559 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003560 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561{
3562 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003563 Py_ssize_t repsize;
3564 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_UNICODE *uni2;
3566 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 Py_ssize_t collstartpos = *inpos;
3568 Py_ssize_t collendpos = *inpos+1;
3569 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 char *encoding = "charmap";
3571 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003572 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 /* find all unencodable characters */
3575 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003576 PyObject *rep;
3577 if (mapping->ob_type == &EncodingMapType) {
3578 int res = encoding_map_lookup(p[collendpos], mapping);
3579 if (res != -1)
3580 break;
3581 ++collendpos;
3582 continue;
3583 }
3584
3585 rep = charmapencode_lookup(p[collendpos], mapping);
3586 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003588 else if (rep!=Py_None) {
3589 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 break;
3591 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003592 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 ++collendpos;
3594 }
3595 /* cache callback name lookup
3596 * (if not done yet, i.e. it's the first error) */
3597 if (*known_errorHandler==-1) {
3598 if ((errors==NULL) || (!strcmp(errors, "strict")))
3599 *known_errorHandler = 1;
3600 else if (!strcmp(errors, "replace"))
3601 *known_errorHandler = 2;
3602 else if (!strcmp(errors, "ignore"))
3603 *known_errorHandler = 3;
3604 else if (!strcmp(errors, "xmlcharrefreplace"))
3605 *known_errorHandler = 4;
3606 else
3607 *known_errorHandler = 0;
3608 }
3609 switch (*known_errorHandler) {
3610 case 1: /* strict */
3611 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3612 return -1;
3613 case 2: /* replace */
3614 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3615 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003616 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 return -1;
3618 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003619 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3621 return -1;
3622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 }
3624 /* fall through */
3625 case 3: /* ignore */
3626 *inpos = collendpos;
3627 break;
3628 case 4: /* xmlcharrefreplace */
3629 /* generate replacement (temporarily (mis)uses p) */
3630 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3631 char buffer[2+29+1+1];
3632 char *cp;
3633 sprintf(buffer, "&#%d;", (int)p[collpos]);
3634 for (cp = buffer; *cp; ++cp) {
3635 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003636 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003638 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3640 return -1;
3641 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 }
3643 }
3644 *inpos = collendpos;
3645 break;
3646 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003647 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 encoding, reason, p, size, exceptionObject,
3649 collstartpos, collendpos, &newpos);
3650 if (repunicode == NULL)
3651 return -1;
3652 /* generate replacement */
3653 repsize = PyUnicode_GET_SIZE(repunicode);
3654 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3655 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003656 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 return -1;
3658 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003659 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3662 return -1;
3663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 }
3665 *inpos = newpos;
3666 Py_DECREF(repunicode);
3667 }
3668 return 0;
3669}
3670
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003672 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 PyObject *mapping,
3674 const char *errors)
3675{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 /* output object */
3677 PyObject *res = NULL;
3678 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003679 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003681 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 PyObject *errorHandler = NULL;
3683 PyObject *exc = NULL;
3684 /* the following variable is used for caching string comparisons
3685 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3686 * 3=ignore, 4=xmlcharrefreplace */
3687 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688
3689 /* Default to Latin-1 */
3690 if (mapping == NULL)
3691 return PyUnicode_EncodeLatin1(p, size, errors);
3692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 /* allocate enough for a simple encoding without
3694 replacements, if we need more, we'll resize */
3695 res = PyString_FromStringAndSize(NULL, size);
3696 if (res == NULL)
3697 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003698 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 while (inpos<size) {
3702 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003703 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3704 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003706 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 if (charmap_encoding_error(p, size, &inpos, mapping,
3708 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003709 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003710 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003711 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 else
3715 /* done with this character => adjust input position */
3716 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* Resize if we allocated to much */
3720 if (respos<PyString_GET_SIZE(res)) {
3721 if (_PyString_Resize(&res, respos))
3722 goto onError;
3723 }
3724 Py_XDECREF(exc);
3725 Py_XDECREF(errorHandler);
3726 return res;
3727
3728 onError:
3729 Py_XDECREF(res);
3730 Py_XDECREF(exc);
3731 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 return NULL;
3733}
3734
3735PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3736 PyObject *mapping)
3737{
3738 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3739 PyErr_BadArgument();
3740 return NULL;
3741 }
3742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3743 PyUnicode_GET_SIZE(unicode),
3744 mapping,
3745 NULL);
3746}
3747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748/* create or adjust a UnicodeTranslateError */
3749static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003750 const Py_UNICODE *unicode, Py_ssize_t size,
3751 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 if (*exceptionObject == NULL) {
3755 *exceptionObject = PyUnicodeTranslateError_Create(
3756 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 }
3758 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3760 goto onError;
3761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3762 goto onError;
3763 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3764 goto onError;
3765 return;
3766 onError:
3767 Py_DECREF(*exceptionObject);
3768 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 }
3770}
3771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772/* raises a UnicodeTranslateError */
3773static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003774 const Py_UNICODE *unicode, Py_ssize_t size,
3775 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 const char *reason)
3777{
3778 make_translate_exception(exceptionObject,
3779 unicode, size, startpos, endpos, reason);
3780 if (*exceptionObject != NULL)
3781 PyCodec_StrictErrors(*exceptionObject);
3782}
3783
3784/* error handling callback helper:
3785 build arguments, call the callback and check the arguments,
3786 put the result into newpos and return the replacement string, which
3787 has to be freed by the caller */
3788static PyObject *unicode_translate_call_errorhandler(const char *errors,
3789 PyObject **errorHandler,
3790 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003791 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3792 Py_ssize_t startpos, Py_ssize_t endpos,
3793 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003795 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003797 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 PyObject *restuple;
3799 PyObject *resunicode;
3800
3801 if (*errorHandler == NULL) {
3802 *errorHandler = PyCodec_LookupError(errors);
3803 if (*errorHandler == NULL)
3804 return NULL;
3805 }
3806
3807 make_translate_exception(exceptionObject,
3808 unicode, size, startpos, endpos, reason);
3809 if (*exceptionObject == NULL)
3810 return NULL;
3811
3812 restuple = PyObject_CallFunctionObjArgs(
3813 *errorHandler, *exceptionObject, NULL);
3814 if (restuple == NULL)
3815 return NULL;
3816 if (!PyTuple_Check(restuple)) {
3817 PyErr_Format(PyExc_TypeError, &argparse[4]);
3818 Py_DECREF(restuple);
3819 return NULL;
3820 }
3821 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 Py_DECREF(restuple);
3824 return NULL;
3825 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003826 if (i_newpos<0)
3827 *newpos = size+i_newpos;
3828 else
3829 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003830 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003831 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003832 Py_DECREF(restuple);
3833 return NULL;
3834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 Py_INCREF(resunicode);
3836 Py_DECREF(restuple);
3837 return resunicode;
3838}
3839
3840/* Lookup the character ch in the mapping and put the result in result,
3841 which must be decrefed by the caller.
3842 Return 0 on success, -1 on error */
3843static
3844int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3845{
3846 PyObject *w = PyInt_FromLong((long)c);
3847 PyObject *x;
3848
3849 if (w == NULL)
3850 return -1;
3851 x = PyObject_GetItem(mapping, w);
3852 Py_DECREF(w);
3853 if (x == NULL) {
3854 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3855 /* No mapping found means: use 1:1 mapping. */
3856 PyErr_Clear();
3857 *result = NULL;
3858 return 0;
3859 } else
3860 return -1;
3861 }
3862 else if (x == Py_None) {
3863 *result = x;
3864 return 0;
3865 }
3866 else if (PyInt_Check(x)) {
3867 long value = PyInt_AS_LONG(x);
3868 long max = PyUnicode_GetMax();
3869 if (value < 0 || value > max) {
3870 PyErr_Format(PyExc_TypeError,
3871 "character mapping must be in range(0x%lx)", max+1);
3872 Py_DECREF(x);
3873 return -1;
3874 }
3875 *result = x;
3876 return 0;
3877 }
3878 else if (PyUnicode_Check(x)) {
3879 *result = x;
3880 return 0;
3881 }
3882 else {
3883 /* wrong return value */
3884 PyErr_SetString(PyExc_TypeError,
3885 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003886 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887 return -1;
3888 }
3889}
3890/* ensure that *outobj is at least requiredsize characters long,
3891if not reallocate and adjust various state variables.
3892Return 0 on success, -1 on error */
3893static
Walter Dörwald4894c302003-10-24 14:25:28 +00003894int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003895 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003897 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003898 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003900 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003902 if (requiredsize < 2 * oldsize)
3903 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003904 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003905 return -1;
3906 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 }
3908 return 0;
3909}
3910/* lookup the character, put the result in the output string and adjust
3911 various state variables. Return a new reference to the object that
3912 was put in the output buffer in *result, or Py_None, if the mapping was
3913 undefined (in which case no character was written).
3914 The called must decref result.
3915 Return 0 on success, -1 on error. */
3916static
Walter Dörwald4894c302003-10-24 14:25:28 +00003917int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003918 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003919 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920{
Walter Dörwald4894c302003-10-24 14:25:28 +00003921 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 return -1;
3923 if (*res==NULL) {
3924 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003925 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 }
3927 else if (*res==Py_None)
3928 ;
3929 else if (PyInt_Check(*res)) {
3930 /* no overflow check, because we know that the space is enough */
3931 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3932 }
3933 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003934 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 if (repsize==1) {
3936 /* no overflow check, because we know that the space is enough */
3937 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3938 }
3939 else if (repsize!=0) {
3940 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003941 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003942 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003943 repsize - 1;
3944 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 return -1;
3946 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3947 *outp += repsize;
3948 }
3949 }
3950 else
3951 return -1;
3952 return 0;
3953}
3954
3955PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003956 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 PyObject *mapping,
3958 const char *errors)
3959{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 /* output object */
3961 PyObject *res = NULL;
3962 /* pointers to the beginning and end+1 of input */
3963 const Py_UNICODE *startp = p;
3964 const Py_UNICODE *endp = p + size;
3965 /* pointer into the output */
3966 Py_UNICODE *str;
3967 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003968 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 char *reason = "character maps to <undefined>";
3970 PyObject *errorHandler = NULL;
3971 PyObject *exc = NULL;
3972 /* the following variable is used for caching string comparisons
3973 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3974 * 3=ignore, 4=xmlcharrefreplace */
3975 int known_errorHandler = -1;
3976
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 if (mapping == NULL) {
3978 PyErr_BadArgument();
3979 return NULL;
3980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981
3982 /* allocate enough for a simple 1:1 translation without
3983 replacements, if we need more, we'll resize */
3984 res = PyUnicode_FromUnicode(NULL, size);
3985 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003986 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 return res;
3989 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 while (p<endp) {
3992 /* try to encode it */
3993 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003994 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 goto onError;
3997 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003998 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999 if (x!=Py_None) /* it worked => adjust input pointer */
4000 ++p;
4001 else { /* untranslatable character */
4002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004003 Py_ssize_t repsize;
4004 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 Py_UNICODE *uni2;
4006 /* startpos for collecting untranslatable chars */
4007 const Py_UNICODE *collstart = p;
4008 const Py_UNICODE *collend = p+1;
4009 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 /* find all untranslatable characters */
4012 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004013 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 goto onError;
4015 Py_XDECREF(x);
4016 if (x!=Py_None)
4017 break;
4018 ++collend;
4019 }
4020 /* cache callback name lookup
4021 * (if not done yet, i.e. it's the first error) */
4022 if (known_errorHandler==-1) {
4023 if ((errors==NULL) || (!strcmp(errors, "strict")))
4024 known_errorHandler = 1;
4025 else if (!strcmp(errors, "replace"))
4026 known_errorHandler = 2;
4027 else if (!strcmp(errors, "ignore"))
4028 known_errorHandler = 3;
4029 else if (!strcmp(errors, "xmlcharrefreplace"))
4030 known_errorHandler = 4;
4031 else
4032 known_errorHandler = 0;
4033 }
4034 switch (known_errorHandler) {
4035 case 1: /* strict */
4036 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4037 goto onError;
4038 case 2: /* replace */
4039 /* No need to check for space, this is a 1:1 replacement */
4040 for (coll = collstart; coll<collend; ++coll)
4041 *str++ = '?';
4042 /* fall through */
4043 case 3: /* ignore */
4044 p = collend;
4045 break;
4046 case 4: /* xmlcharrefreplace */
4047 /* generate replacement (temporarily (mis)uses p) */
4048 for (p = collstart; p < collend; ++p) {
4049 char buffer[2+29+1+1];
4050 char *cp;
4051 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004052 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4054 goto onError;
4055 for (cp = buffer; *cp; ++cp)
4056 *str++ = *cp;
4057 }
4058 p = collend;
4059 break;
4060 default:
4061 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4062 reason, startp, size, &exc,
4063 collstart-startp, collend-startp, &newpos);
4064 if (repunicode == NULL)
4065 goto onError;
4066 /* generate replacement */
4067 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004068 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4070 Py_DECREF(repunicode);
4071 goto onError;
4072 }
4073 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4074 *str++ = *uni2;
4075 p = startp + newpos;
4076 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 }
4078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 /* Resize if we allocated to much */
4081 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004082 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004083 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 }
4086 Py_XDECREF(exc);
4087 Py_XDECREF(errorHandler);
4088 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090 onError:
4091 Py_XDECREF(res);
4092 Py_XDECREF(exc);
4093 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 return NULL;
4095}
4096
4097PyObject *PyUnicode_Translate(PyObject *str,
4098 PyObject *mapping,
4099 const char *errors)
4100{
4101 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004102
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 str = PyUnicode_FromObject(str);
4104 if (str == NULL)
4105 goto onError;
4106 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4107 PyUnicode_GET_SIZE(str),
4108 mapping,
4109 errors);
4110 Py_DECREF(str);
4111 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 onError:
4114 Py_XDECREF(str);
4115 return NULL;
4116}
Tim Petersced69f82003-09-16 20:30:58 +00004117
Guido van Rossum9e896b32000-04-05 20:11:21 +00004118/* --- Decimal Encoder ---------------------------------------------------- */
4119
4120int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004121 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004122 char *output,
4123 const char *errors)
4124{
4125 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 PyObject *errorHandler = NULL;
4127 PyObject *exc = NULL;
4128 const char *encoding = "decimal";
4129 const char *reason = "invalid decimal Unicode string";
4130 /* the following variable is used for caching string comparisons
4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004133
4134 if (output == NULL) {
4135 PyErr_BadArgument();
4136 return -1;
4137 }
4138
4139 p = s;
4140 end = s + length;
4141 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004143 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004145 Py_ssize_t repsize;
4146 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 Py_UNICODE *uni2;
4148 Py_UNICODE *collstart;
4149 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004150
Guido van Rossum9e896b32000-04-05 20:11:21 +00004151 if (Py_UNICODE_ISSPACE(ch)) {
4152 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004154 continue;
4155 }
4156 decimal = Py_UNICODE_TODECIMAL(ch);
4157 if (decimal >= 0) {
4158 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004160 continue;
4161 }
Guido van Rossumba477042000-04-06 18:18:10 +00004162 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004163 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004165 continue;
4166 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 /* All other characters are considered unencodable */
4168 collstart = p;
4169 collend = p+1;
4170 while (collend < end) {
4171 if ((0 < *collend && *collend < 256) ||
4172 !Py_UNICODE_ISSPACE(*collend) ||
4173 Py_UNICODE_TODECIMAL(*collend))
4174 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004175 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 /* cache callback name lookup
4177 * (if not done yet, i.e. it's the first error) */
4178 if (known_errorHandler==-1) {
4179 if ((errors==NULL) || (!strcmp(errors, "strict")))
4180 known_errorHandler = 1;
4181 else if (!strcmp(errors, "replace"))
4182 known_errorHandler = 2;
4183 else if (!strcmp(errors, "ignore"))
4184 known_errorHandler = 3;
4185 else if (!strcmp(errors, "xmlcharrefreplace"))
4186 known_errorHandler = 4;
4187 else
4188 known_errorHandler = 0;
4189 }
4190 switch (known_errorHandler) {
4191 case 1: /* strict */
4192 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4193 goto onError;
4194 case 2: /* replace */
4195 for (p = collstart; p < collend; ++p)
4196 *output++ = '?';
4197 /* fall through */
4198 case 3: /* ignore */
4199 p = collend;
4200 break;
4201 case 4: /* xmlcharrefreplace */
4202 /* generate replacement (temporarily (mis)uses p) */
4203 for (p = collstart; p < collend; ++p)
4204 output += sprintf(output, "&#%d;", (int)*p);
4205 p = collend;
4206 break;
4207 default:
4208 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4209 encoding, reason, s, length, &exc,
4210 collstart-s, collend-s, &newpos);
4211 if (repunicode == NULL)
4212 goto onError;
4213 /* generate replacement */
4214 repsize = PyUnicode_GET_SIZE(repunicode);
4215 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4216 Py_UNICODE ch = *uni2;
4217 if (Py_UNICODE_ISSPACE(ch))
4218 *output++ = ' ';
4219 else {
4220 decimal = Py_UNICODE_TODECIMAL(ch);
4221 if (decimal >= 0)
4222 *output++ = '0' + decimal;
4223 else if (0 < ch && ch < 256)
4224 *output++ = (char)ch;
4225 else {
4226 Py_DECREF(repunicode);
4227 raise_encode_exception(&exc, encoding,
4228 s, length, collstart-s, collend-s, reason);
4229 goto onError;
4230 }
4231 }
4232 }
4233 p = s + newpos;
4234 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004235 }
4236 }
4237 /* 0-terminate the output string */
4238 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 Py_XDECREF(exc);
4240 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004241 return 0;
4242
4243 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 Py_XDECREF(exc);
4245 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004246 return -1;
4247}
4248
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249/* --- Helpers ------------------------------------------------------------ */
4250
Thomas Wouters477c8d52006-05-27 19:21:47 +00004251#define STRINGLIB_CHAR Py_UNICODE
4252
4253#define STRINGLIB_LEN PyUnicode_GET_SIZE
4254#define STRINGLIB_NEW PyUnicode_FromUnicode
4255#define STRINGLIB_STR PyUnicode_AS_UNICODE
4256
4257Py_LOCAL_INLINE(int)
4258STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004260 if (str[0] != other[0])
4261 return 1;
4262 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263}
4264
Thomas Wouters477c8d52006-05-27 19:21:47 +00004265#define STRINGLIB_EMPTY unicode_empty
4266
4267#include "stringlib/fastsearch.h"
4268
4269#include "stringlib/count.h"
4270#include "stringlib/find.h"
4271#include "stringlib/partition.h"
4272
/* Helper macro that normalizes the slice bounds held in the caller's local
   variables `start` and `end` against (obj)->length:
     - a negative start or end is taken relative to the length and, if it
       is still negative afterwards, set to 0;
     - an end beyond the length is cut back to the length.
   The macro expands to several bare statements (it is not wrapped in a
   do/while block), so use it only as a complete statement of its own. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4285
Martin v. Löwis18e16552006-02-15 17:27:45 +00004286Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004287 PyObject *substr,
4288 Py_ssize_t start,
4289 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004291 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004292 PyUnicodeObject* str_obj;
4293 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004294
Thomas Wouters477c8d52006-05-27 19:21:47 +00004295 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4296 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004298 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4299 if (!sub_obj) {
4300 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 return -1;
4302 }
Tim Petersced69f82003-09-16 20:30:58 +00004303
Thomas Wouters477c8d52006-05-27 19:21:47 +00004304 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004305
Thomas Wouters477c8d52006-05-27 19:21:47 +00004306 result = stringlib_count(
4307 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4308 );
4309
4310 Py_DECREF(sub_obj);
4311 Py_DECREF(str_obj);
4312
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 return result;
4314}
4315
Martin v. Löwis18e16552006-02-15 17:27:45 +00004316Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004317 PyObject *sub,
4318 Py_ssize_t start,
4319 Py_ssize_t end,
4320 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004322 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004323
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004325 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004326 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004327 sub = PyUnicode_FromObject(sub);
4328 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004329 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004330 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 }
Tim Petersced69f82003-09-16 20:30:58 +00004332
Thomas Wouters477c8d52006-05-27 19:21:47 +00004333 if (direction > 0)
4334 result = stringlib_find_slice(
4335 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4336 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4337 start, end
4338 );
4339 else
4340 result = stringlib_rfind_slice(
4341 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4342 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4343 start, end
4344 );
4345
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004347 Py_DECREF(sub);
4348
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 return result;
4350}
4351
Tim Petersced69f82003-09-16 20:30:58 +00004352static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353int tailmatch(PyUnicodeObject *self,
4354 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004355 Py_ssize_t start,
4356 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 int direction)
4358{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359 if (substring->length == 0)
4360 return 1;
4361
Thomas Wouters477c8d52006-05-27 19:21:47 +00004362 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363
4364 end -= substring->length;
4365 if (end < start)
4366 return 0;
4367
4368 if (direction > 0) {
4369 if (Py_UNICODE_MATCH(self, end, substring))
4370 return 1;
4371 } else {
4372 if (Py_UNICODE_MATCH(self, start, substring))
4373 return 1;
4374 }
4375
4376 return 0;
4377}
4378
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t start,
4382 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 int direction)
4384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 str = PyUnicode_FromObject(str);
4388 if (str == NULL)
4389 return -1;
4390 substr = PyUnicode_FromObject(substr);
4391 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004392 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 return -1;
4394 }
Tim Petersced69f82003-09-16 20:30:58 +00004395
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 result = tailmatch((PyUnicodeObject *)str,
4397 (PyUnicodeObject *)substr,
4398 start, end, direction);
4399 Py_DECREF(str);
4400 Py_DECREF(substr);
4401 return result;
4402}
4403
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404/* Apply fixfct filter to the Unicode object self and return a
4405 reference to the modified object */
4406
Tim Petersced69f82003-09-16 20:30:58 +00004407static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408PyObject *fixup(PyUnicodeObject *self,
4409 int (*fixfct)(PyUnicodeObject *s))
4410{
4411
4412 PyUnicodeObject *u;
4413
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004414 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 if (u == NULL)
4416 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004417
4418 Py_UNICODE_COPY(u->str, self->str, self->length);
4419
Tim Peters7a29bd52001-09-12 03:03:31 +00004420 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 /* fixfct should return TRUE if it modified the buffer. If
4422 FALSE, return a reference to the original buffer instead
4423 (to save space, not time) */
4424 Py_INCREF(self);
4425 Py_DECREF(u);
4426 return (PyObject*) self;
4427 }
4428 return (PyObject*) u;
4429}
4430
Tim Petersced69f82003-09-16 20:30:58 +00004431static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432int fixupper(PyUnicodeObject *self)
4433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004434 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 Py_UNICODE *s = self->str;
4436 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004437
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 while (len-- > 0) {
4439 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004440
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 ch = Py_UNICODE_TOUPPER(*s);
4442 if (ch != *s) {
4443 status = 1;
4444 *s = ch;
4445 }
4446 s++;
4447 }
4448
4449 return status;
4450}
4451
Tim Petersced69f82003-09-16 20:30:58 +00004452static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453int fixlower(PyUnicodeObject *self)
4454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004455 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 Py_UNICODE *s = self->str;
4457 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004458
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 while (len-- > 0) {
4460 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004461
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 ch = Py_UNICODE_TOLOWER(*s);
4463 if (ch != *s) {
4464 status = 1;
4465 *s = ch;
4466 }
4467 s++;
4468 }
4469
4470 return status;
4471}
4472
Tim Petersced69f82003-09-16 20:30:58 +00004473static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474int fixswapcase(PyUnicodeObject *self)
4475{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004476 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 Py_UNICODE *s = self->str;
4478 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004479
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 while (len-- > 0) {
4481 if (Py_UNICODE_ISUPPER(*s)) {
4482 *s = Py_UNICODE_TOLOWER(*s);
4483 status = 1;
4484 } else if (Py_UNICODE_ISLOWER(*s)) {
4485 *s = Py_UNICODE_TOUPPER(*s);
4486 status = 1;
4487 }
4488 s++;
4489 }
4490
4491 return status;
4492}
4493
Tim Petersced69f82003-09-16 20:30:58 +00004494static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495int fixcapitalize(PyUnicodeObject *self)
4496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004497 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004498 Py_UNICODE *s = self->str;
4499 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004500
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004501 if (len == 0)
4502 return 0;
4503 if (Py_UNICODE_ISLOWER(*s)) {
4504 *s = Py_UNICODE_TOUPPER(*s);
4505 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004507 s++;
4508 while (--len > 0) {
4509 if (Py_UNICODE_ISUPPER(*s)) {
4510 *s = Py_UNICODE_TOLOWER(*s);
4511 status = 1;
4512 }
4513 s++;
4514 }
4515 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516}
4517
4518static
4519int fixtitle(PyUnicodeObject *self)
4520{
4521 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4522 register Py_UNICODE *e;
4523 int previous_is_cased;
4524
4525 /* Shortcut for single character strings */
4526 if (PyUnicode_GET_SIZE(self) == 1) {
4527 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4528 if (*p != ch) {
4529 *p = ch;
4530 return 1;
4531 }
4532 else
4533 return 0;
4534 }
Tim Petersced69f82003-09-16 20:30:58 +00004535
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 e = p + PyUnicode_GET_SIZE(self);
4537 previous_is_cased = 0;
4538 for (; p < e; p++) {
4539 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004540
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 if (previous_is_cased)
4542 *p = Py_UNICODE_TOLOWER(ch);
4543 else
4544 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004545
4546 if (Py_UNICODE_ISLOWER(ch) ||
4547 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 Py_UNICODE_ISTITLE(ch))
4549 previous_is_cased = 1;
4550 else
4551 previous_is_cased = 0;
4552 }
4553 return 1;
4554}
4555
Tim Peters8ce9f162004-08-27 01:49:32 +00004556PyObject *
4557PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558{
Tim Peters8ce9f162004-08-27 01:49:32 +00004559 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004560 const Py_UNICODE blank = ' ';
4561 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004562 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004563 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004564 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4565 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004566 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4567 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004568 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004569 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004570 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571
Tim Peters05eba1f2004-08-27 21:32:02 +00004572 fseq = PySequence_Fast(seq, "");
4573 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004574 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004575 }
4576
Tim Peters91879ab2004-08-27 22:35:44 +00004577 /* Grrrr. A codec may be invoked to convert str objects to
4578 * Unicode, and so it's possible to call back into Python code
4579 * during PyUnicode_FromObject(), and so it's possible for a sick
4580 * codec to change the size of fseq (if seq is a list). Therefore
4581 * we have to keep refetching the size -- can't assume seqlen
4582 * is invariant.
4583 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004584 seqlen = PySequence_Fast_GET_SIZE(fseq);
4585 /* If empty sequence, return u"". */
4586 if (seqlen == 0) {
4587 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4588 goto Done;
4589 }
4590 /* If singleton sequence with an exact Unicode, return that. */
4591 if (seqlen == 1) {
4592 item = PySequence_Fast_GET_ITEM(fseq, 0);
4593 if (PyUnicode_CheckExact(item)) {
4594 Py_INCREF(item);
4595 res = (PyUnicodeObject *)item;
4596 goto Done;
4597 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004598 }
4599
Tim Peters05eba1f2004-08-27 21:32:02 +00004600 /* At least two items to join, or one that isn't exact Unicode. */
4601 if (seqlen > 1) {
4602 /* Set up sep and seplen -- they're needed. */
4603 if (separator == NULL) {
4604 sep = &blank;
4605 seplen = 1;
4606 }
4607 else {
4608 internal_separator = PyUnicode_FromObject(separator);
4609 if (internal_separator == NULL)
4610 goto onError;
4611 sep = PyUnicode_AS_UNICODE(internal_separator);
4612 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004613 /* In case PyUnicode_FromObject() mutated seq. */
4614 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004615 }
4616 }
4617
4618 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004619 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004620 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004621 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004622 res_p = PyUnicode_AS_UNICODE(res);
4623 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004624
Tim Peters05eba1f2004-08-27 21:32:02 +00004625 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004626 Py_ssize_t itemlen;
4627 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004628
4629 item = PySequence_Fast_GET_ITEM(fseq, i);
4630 /* Convert item to Unicode. */
4631 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4632 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004633 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004634 " %.80s found",
4635 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004636 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004637 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004638 item = PyUnicode_FromObject(item);
4639 if (item == NULL)
4640 goto onError;
4641 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004642
Tim Peters91879ab2004-08-27 22:35:44 +00004643 /* In case PyUnicode_FromObject() mutated seq. */
4644 seqlen = PySequence_Fast_GET_SIZE(fseq);
4645
Tim Peters8ce9f162004-08-27 01:49:32 +00004646 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004648 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004649 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004650 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004651 if (i < seqlen - 1) {
4652 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004653 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004654 goto Overflow;
4655 }
4656 if (new_res_used > res_alloc) {
4657 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004658 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004659 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004660 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004661 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004662 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004663 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004664 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004666 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004667 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004669
4670 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004671 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004672 res_p += itemlen;
4673 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004674 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004675 res_p += seplen;
4676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004678 res_used = new_res_used;
4679 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004680
Tim Peters05eba1f2004-08-27 21:32:02 +00004681 /* Shrink res to match the used area; this probably can't fail,
4682 * but it's cheap to check.
4683 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004684 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004685 goto onError;
4686
4687 Done:
4688 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004689 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 return (PyObject *)res;
4691
Tim Peters8ce9f162004-08-27 01:49:32 +00004692 Overflow:
4693 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004694 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004695 Py_DECREF(item);
4696 /* fall through */
4697
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004699 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004700 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004701 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 return NULL;
4703}
4704
Tim Petersced69f82003-09-16 20:30:58 +00004705static
4706PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004707 Py_ssize_t left,
4708 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 Py_UNICODE fill)
4710{
4711 PyUnicodeObject *u;
4712
4713 if (left < 0)
4714 left = 0;
4715 if (right < 0)
4716 right = 0;
4717
Tim Peters7a29bd52001-09-12 03:03:31 +00004718 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719 Py_INCREF(self);
4720 return self;
4721 }
4722
4723 u = _PyUnicode_New(left + self->length + right);
4724 if (u) {
4725 if (left)
4726 Py_UNICODE_FILL(u->str, fill, left);
4727 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4728 if (right)
4729 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4730 }
4731
4732 return u;
4733}
4734
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; control jumps to
   `onError` when the slice cannot be created or appended.

   The body is wrapped in do { } while (0) so the macro expands to exactly
   one statement and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4745
4746static
4747PyObject *split_whitespace(PyUnicodeObject *self,
4748 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004749 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004751 register Py_ssize_t i;
4752 register Py_ssize_t j;
4753 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 PyObject *str;
4755
4756 for (i = j = 0; i < len; ) {
4757 /* find a token */
4758 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4759 i++;
4760 j = i;
4761 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4762 i++;
4763 if (j < i) {
4764 if (maxcount-- <= 0)
4765 break;
4766 SPLIT_APPEND(self->str, j, i);
4767 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4768 i++;
4769 j = i;
4770 }
4771 }
4772 if (j < len) {
4773 SPLIT_APPEND(self->str, j, len);
4774 }
4775 return list;
4776
4777 onError:
4778 Py_DECREF(list);
4779 return NULL;
4780}
4781
4782PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004783 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004785 register Py_ssize_t i;
4786 register Py_ssize_t j;
4787 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 PyObject *list;
4789 PyObject *str;
4790 Py_UNICODE *data;
4791
4792 string = PyUnicode_FromObject(string);
4793 if (string == NULL)
4794 return NULL;
4795 data = PyUnicode_AS_UNICODE(string);
4796 len = PyUnicode_GET_SIZE(string);
4797
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 list = PyList_New(0);
4799 if (!list)
4800 goto onError;
4801
4802 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004806 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808
4809 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004810 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 if (i < len) {
4812 if (data[i] == '\r' && i + 1 < len &&
4813 data[i+1] == '\n')
4814 i += 2;
4815 else
4816 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004817 if (keepends)
4818 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 }
Guido van Rossum86662912000-04-11 15:38:46 +00004820 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 j = i;
4822 }
4823 if (j < len) {
4824 SPLIT_APPEND(data, j, len);
4825 }
4826
4827 Py_DECREF(string);
4828 return list;
4829
4830 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004831 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 Py_DECREF(string);
4833 return NULL;
4834}
4835
Tim Petersced69f82003-09-16 20:30:58 +00004836static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837PyObject *split_char(PyUnicodeObject *self,
4838 PyObject *list,
4839 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004840 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004842 register Py_ssize_t i;
4843 register Py_ssize_t j;
4844 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 PyObject *str;
4846
4847 for (i = j = 0; i < len; ) {
4848 if (self->str[i] == ch) {
4849 if (maxcount-- <= 0)
4850 break;
4851 SPLIT_APPEND(self->str, j, i);
4852 i = j = i + 1;
4853 } else
4854 i++;
4855 }
4856 if (j <= len) {
4857 SPLIT_APPEND(self->str, j, len);
4858 }
4859 return list;
4860
4861 onError:
4862 Py_DECREF(list);
4863 return NULL;
4864}
4865
Tim Petersced69f82003-09-16 20:30:58 +00004866static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867PyObject *split_substring(PyUnicodeObject *self,
4868 PyObject *list,
4869 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004870 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872 register Py_ssize_t i;
4873 register Py_ssize_t j;
4874 Py_ssize_t len = self->length;
4875 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 PyObject *str;
4877
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004878 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 if (Py_UNICODE_MATCH(self, i, substring)) {
4880 if (maxcount-- <= 0)
4881 break;
4882 SPLIT_APPEND(self->str, j, i);
4883 i = j = i + sublen;
4884 } else
4885 i++;
4886 }
4887 if (j <= len) {
4888 SPLIT_APPEND(self->str, j, len);
4889 }
4890 return list;
4891
4892 onError:
4893 Py_DECREF(list);
4894 return NULL;
4895}
4896
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004897static
4898PyObject *rsplit_whitespace(PyUnicodeObject *self,
4899 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004900 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004901{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004902 register Py_ssize_t i;
4903 register Py_ssize_t j;
4904 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004905 PyObject *str;
4906
4907 for (i = j = len - 1; i >= 0; ) {
4908 /* find a token */
4909 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4910 i--;
4911 j = i;
4912 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4913 i--;
4914 if (j > i) {
4915 if (maxcount-- <= 0)
4916 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004917 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004918 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4919 i--;
4920 j = i;
4921 }
4922 }
4923 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004924 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004925 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004926 if (PyList_Reverse(list) < 0)
4927 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004928 return list;
4929
4930 onError:
4931 Py_DECREF(list);
4932 return NULL;
4933}
4934
4935static
4936PyObject *rsplit_char(PyUnicodeObject *self,
4937 PyObject *list,
4938 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004939 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004940{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 register Py_ssize_t i;
4942 register Py_ssize_t j;
4943 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004944 PyObject *str;
4945
4946 for (i = j = len - 1; i >= 0; ) {
4947 if (self->str[i] == ch) {
4948 if (maxcount-- <= 0)
4949 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004950 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004951 j = i = i - 1;
4952 } else
4953 i--;
4954 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004955 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004956 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004957 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004958 if (PyList_Reverse(list) < 0)
4959 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004960 return list;
4961
4962 onError:
4963 Py_DECREF(list);
4964 return NULL;
4965}
4966
4967static
4968PyObject *rsplit_substring(PyUnicodeObject *self,
4969 PyObject *list,
4970 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004971 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004973 register Py_ssize_t i;
4974 register Py_ssize_t j;
4975 Py_ssize_t len = self->length;
4976 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004977 PyObject *str;
4978
4979 for (i = len - sublen, j = len; i >= 0; ) {
4980 if (Py_UNICODE_MATCH(self, i, substring)) {
4981 if (maxcount-- <= 0)
4982 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004983 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004984 j = i;
4985 i -= sublen;
4986 } else
4987 i--;
4988 }
4989 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004990 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004991 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004992 if (PyList_Reverse(list) < 0)
4993 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004994 return list;
4995
4996 onError:
4997 Py_DECREF(list);
4998 return NULL;
4999}
5000
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001#undef SPLIT_APPEND
5002
5003static
5004PyObject *split(PyUnicodeObject *self,
5005 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005006 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007{
5008 PyObject *list;
5009
5010 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005011 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012
5013 list = PyList_New(0);
5014 if (!list)
5015 return NULL;
5016
5017 if (substring == NULL)
5018 return split_whitespace(self,list,maxcount);
5019
5020 else if (substring->length == 1)
5021 return split_char(self,list,substring->str[0],maxcount);
5022
5023 else if (substring->length == 0) {
5024 Py_DECREF(list);
5025 PyErr_SetString(PyExc_ValueError, "empty separator");
5026 return NULL;
5027 }
5028 else
5029 return split_substring(self,list,substring,maxcount);
5030}
5031
Tim Petersced69f82003-09-16 20:30:58 +00005032static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005033PyObject *rsplit(PyUnicodeObject *self,
5034 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005035 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005036{
5037 PyObject *list;
5038
5039 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005040 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005041
5042 list = PyList_New(0);
5043 if (!list)
5044 return NULL;
5045
5046 if (substring == NULL)
5047 return rsplit_whitespace(self,list,maxcount);
5048
5049 else if (substring->length == 1)
5050 return rsplit_char(self,list,substring->str[0],maxcount);
5051
5052 else if (substring->length == 0) {
5053 Py_DECREF(list);
5054 PyErr_SetString(PyExc_ValueError, "empty separator");
5055 return NULL;
5056 }
5057 else
5058 return rsplit_substring(self,list,substring,maxcount);
5059}
5060
5061static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062PyObject *replace(PyUnicodeObject *self,
5063 PyUnicodeObject *str1,
5064 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005065 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066{
5067 PyUnicodeObject *u;
5068
5069 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005070 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071
Thomas Wouters477c8d52006-05-27 19:21:47 +00005072 if (str1->length == str2->length) {
5073 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005074 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005075 if (str1->length == 1) {
5076 /* replace characters */
5077 Py_UNICODE u1, u2;
5078 if (!findchar(self->str, self->length, str1->str[0]))
5079 goto nothing;
5080 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5081 if (!u)
5082 return NULL;
5083 Py_UNICODE_COPY(u->str, self->str, self->length);
5084 u1 = str1->str[0];
5085 u2 = str2->str[0];
5086 for (i = 0; i < u->length; i++)
5087 if (u->str[i] == u1) {
5088 if (--maxcount < 0)
5089 break;
5090 u->str[i] = u2;
5091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005093 i = fastsearch(
5094 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005096 if (i < 0)
5097 goto nothing;
5098 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5099 if (!u)
5100 return NULL;
5101 Py_UNICODE_COPY(u->str, self->str, self->length);
5102 while (i <= self->length - str1->length)
5103 if (Py_UNICODE_MATCH(self, i, str1)) {
5104 if (--maxcount < 0)
5105 break;
5106 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5107 i += str1->length;
5108 } else
5109 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005112
5113 Py_ssize_t n, i, j, e;
5114 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 Py_UNICODE *p;
5116
5117 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005118 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 if (n > maxcount)
5120 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005121 if (n == 0)
5122 goto nothing;
5123 /* new_size = self->length + n * (str2->length - str1->length)); */
5124 delta = (str2->length - str1->length);
5125 if (delta == 0) {
5126 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005128 product = n * (str2->length - str1->length);
5129 if ((product / (str2->length - str1->length)) != n) {
5130 PyErr_SetString(PyExc_OverflowError,
5131 "replace string is too long");
5132 return NULL;
5133 }
5134 new_size = self->length + product;
5135 if (new_size < 0) {
5136 PyErr_SetString(PyExc_OverflowError,
5137 "replace string is too long");
5138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 }
5140 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005141 u = _PyUnicode_New(new_size);
5142 if (!u)
5143 return NULL;
5144 i = 0;
5145 p = u->str;
5146 e = self->length - str1->length;
5147 if (str1->length > 0) {
5148 while (n-- > 0) {
5149 /* look for next match */
5150 j = i;
5151 while (j <= e) {
5152 if (Py_UNICODE_MATCH(self, j, str1))
5153 break;
5154 j++;
5155 }
5156 if (j > i) {
5157 if (j > e)
5158 break;
5159 /* copy unchanged part [i:j] */
5160 Py_UNICODE_COPY(p, self->str+i, j-i);
5161 p += j - i;
5162 }
5163 /* copy substitution string */
5164 if (str2->length > 0) {
5165 Py_UNICODE_COPY(p, str2->str, str2->length);
5166 p += str2->length;
5167 }
5168 i = j + str1->length;
5169 }
5170 if (i < self->length)
5171 /* copy tail [i:] */
5172 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5173 } else {
5174 /* interleave */
5175 while (n > 0) {
5176 Py_UNICODE_COPY(p, str2->str, str2->length);
5177 p += str2->length;
5178 if (--n <= 0)
5179 break;
5180 *p++ = self->str[i++];
5181 }
5182 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005186
5187nothing:
5188 /* nothing to replace; return original string (when possible) */
5189 if (PyUnicode_CheckExact(self)) {
5190 Py_INCREF(self);
5191 return (PyObject *) self;
5192 }
5193 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194}
5195
5196/* --- Unicode Object Methods --------------------------------------------- */
5197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005198PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199"S.title() -> unicode\n\
5200\n\
5201Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005202characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
5204static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005205unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 return fixup(self, fixtitle);
5208}
5209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005210PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211"S.capitalize() -> unicode\n\
5212\n\
5213Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005214have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005217unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 return fixup(self, fixcapitalize);
5220}
5221
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits `self` on
   whitespace, capitalizes each word in place in the list, and joins the
   words again via PyUnicode_Join(NULL, list). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

onError:
    /* `item` is NULL here when a fixup() call failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
5259
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005260/* Argument converter. Coerces to a single unicode character */
5261
5262static int
5263convert_uc(PyObject *obj, void *addr)
5264{
5265 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5266 PyObject *uniobj;
5267 Py_UNICODE *unistr;
5268
5269 uniobj = PyUnicode_FromObject(obj);
5270 if (uniobj == NULL) {
5271 PyErr_SetString(PyExc_TypeError,
5272 "The fill character cannot be converted to Unicode");
5273 return 0;
5274 }
5275 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5276 PyErr_SetString(PyExc_TypeError,
5277 "The fill character must be exactly one character long");
5278 Py_DECREF(uniobj);
5279 return 0;
5280 }
5281 unistr = PyUnicode_AS_UNICODE(uniobj);
5282 *fillcharloc = unistr[0];
5283 Py_DECREF(uniobj);
5284 return 1;
5285}
5286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005287PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005288"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005290Return S centered in a Unicode string of length width. Padding is\n\
5291done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
5293static PyObject *
5294unicode_center(PyUnicodeObject *self, PyObject *args)
5295{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005296 Py_ssize_t marg, left;
5297 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005298 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
Thomas Woutersde017742006-02-16 19:34:37 +00005300 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 return NULL;
5302
Tim Peters7a29bd52001-09-12 03:03:31 +00005303 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 Py_INCREF(self);
5305 return (PyObject*) self;
5306 }
5307
5308 marg = width - self->length;
5309 left = marg / 2 + (marg & width & 1);
5310
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005311 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312}
5313
Marc-André Lemburge5034372000-08-08 08:04:29 +00005314#if 0
5315
5316/* This code should go into some future Unicode collation support
5317 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005318 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005319
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005320/* speedy UTF-16 code point order comparison */
5321/* gleaned from: */
5322/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5323
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005324static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005325{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005326 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005327 0, 0, 0, 0, 0, 0, 0, 0,
5328 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005329 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005330};
5331
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332static int
5333unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5334{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005335 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005336
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 Py_UNICODE *s1 = str1->str;
5338 Py_UNICODE *s2 = str2->str;
5339
5340 len1 = str1->length;
5341 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005344 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005345
5346 c1 = *s1++;
5347 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005348
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005349 if (c1 > (1<<11) * 26)
5350 c1 += utf16Fixup[c1>>11];
5351 if (c2 > (1<<11) * 26)
5352 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005353 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005354
5355 if (c1 != c2)
5356 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005357
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005358 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 }
5360
5361 return (len1 < len2) ? -1 : (len1 != len2);
5362}
5363
Marc-André Lemburge5034372000-08-08 08:04:29 +00005364#else
5365
5366static int
5367unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005370
5371 Py_UNICODE *s1 = str1->str;
5372 Py_UNICODE *s2 = str2->str;
5373
5374 len1 = str1->length;
5375 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005376
Marc-André Lemburge5034372000-08-08 08:04:29 +00005377 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005378 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005379
Fredrik Lundh45714e92001-06-26 16:39:36 +00005380 c1 = *s1++;
5381 c2 = *s2++;
5382
5383 if (c1 != c2)
5384 return (c1 < c2) ? -1 : 1;
5385
Marc-André Lemburge5034372000-08-08 08:04:29 +00005386 len1--; len2--;
5387 }
5388
5389 return (len1 < len2) ? -1 : (len1 != len2);
5390}
5391
5392#endif
5393
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394int PyUnicode_Compare(PyObject *left,
5395 PyObject *right)
5396{
5397 PyUnicodeObject *u = NULL, *v = NULL;
5398 int result;
5399
5400 /* Coerce the two arguments */
5401 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5402 if (u == NULL)
5403 goto onError;
5404 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5405 if (v == NULL)
5406 goto onError;
5407
Thomas Wouters7e474022000-07-16 12:04:32 +00005408 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 if (v == u) {
5410 Py_DECREF(u);
5411 Py_DECREF(v);
5412 return 0;
5413 }
5414
5415 result = unicode_compare(u, v);
5416
5417 Py_DECREF(u);
5418 Py_DECREF(v);
5419 return result;
5420
5421onError:
5422 Py_XDECREF(u);
5423 Py_XDECREF(v);
5424 return -1;
5425}
5426
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005427PyObject *PyUnicode_RichCompare(PyObject *left,
5428 PyObject *right,
5429 int op)
5430{
5431 int result;
5432
5433 result = PyUnicode_Compare(left, right);
5434 if (result == -1 && PyErr_Occurred())
5435 goto onError;
5436
5437 /* Convert the return value to a Boolean */
5438 switch (op) {
5439 case Py_EQ:
5440 result = (result == 0);
5441 break;
5442 case Py_NE:
5443 result = (result != 0);
5444 break;
5445 case Py_LE:
5446 result = (result <= 0);
5447 break;
5448 case Py_GE:
5449 result = (result >= 0);
5450 break;
5451 case Py_LT:
5452 result = (result == -1);
5453 break;
5454 case Py_GT:
5455 result = (result == 1);
5456 break;
5457 }
5458 return PyBool_FromLong(result);
5459
5460 onError:
5461
5462 /* Standard case
5463
5464 Type errors mean that PyUnicode_FromObject() could not convert
5465 one of the arguments (usually the right hand side) to Unicode,
5466 ie. we can't handle the comparison request. However, it is
5467 possible that the other object knows a comparison method, which
5468 is why we return Py_NotImplemented to give the other object a
5469 chance.
5470
5471 */
5472 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5473 PyErr_Clear();
5474 Py_INCREF(Py_NotImplemented);
5475 return Py_NotImplemented;
5476 }
5477 if (op != Py_EQ && op != Py_NE)
5478 return NULL;
5479
5480 /* Equality comparison.
5481
5482 This is a special case: we silence any PyExc_UnicodeDecodeError
5483 and instead turn it into a PyErr_UnicodeWarning.
5484
5485 */
5486 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5487 return NULL;
5488 PyErr_Clear();
5489 if (PyErr_Warn(PyExc_UnicodeWarning,
5490 (op == Py_EQ) ?
5491 "Unicode equal comparison "
5492 "failed to convert both arguments to Unicode - "
5493 "interpreting them as being unequal" :
5494 "Unicode unequal comparison "
5495 "failed to convert both arguments to Unicode - "
5496 "interpreting them as being unequal"
5497 ) < 0)
5498 return NULL;
5499 result = (op == Py_NE);
5500 return PyBool_FromLong(result);
5501}
5502
Guido van Rossum403d68b2000-03-13 15:55:09 +00005503int PyUnicode_Contains(PyObject *container,
5504 PyObject *element)
5505{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005506 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005507 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005508
5509 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005510 sub = PyUnicode_FromObject(element);
5511 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005512 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005513 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005514 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005515 }
5516
Thomas Wouters477c8d52006-05-27 19:21:47 +00005517 str = PyUnicode_FromObject(container);
5518 if (!str) {
5519 Py_DECREF(sub);
5520 return -1;
5521 }
5522
5523 result = stringlib_contains_obj(str, sub);
5524
5525 Py_DECREF(str);
5526 Py_DECREF(sub);
5527
Guido van Rossum403d68b2000-03-13 15:55:09 +00005528 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005529}
5530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531/* Concat to string or Unicode object giving a new Unicode object. */
5532
5533PyObject *PyUnicode_Concat(PyObject *left,
5534 PyObject *right)
5535{
5536 PyUnicodeObject *u = NULL, *v = NULL, *w;
5537
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005538 if (PyBytes_Check(left) || PyBytes_Check(right))
5539 return PyBytes_Concat(left, right);
5540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 /* Coerce the two arguments */
5542 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5543 if (u == NULL)
5544 goto onError;
5545 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5546 if (v == NULL)
5547 goto onError;
5548
5549 /* Shortcuts */
5550 if (v == unicode_empty) {
5551 Py_DECREF(v);
5552 return (PyObject *)u;
5553 }
5554 if (u == unicode_empty) {
5555 Py_DECREF(u);
5556 return (PyObject *)v;
5557 }
5558
5559 /* Concat the two Unicode strings */
5560 w = _PyUnicode_New(u->length + v->length);
5561 if (w == NULL)
5562 goto onError;
5563 Py_UNICODE_COPY(w->str, u->str, u->length);
5564 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5565
5566 Py_DECREF(u);
5567 Py_DECREF(v);
5568 return (PyObject *)w;
5569
5570onError:
5571 Py_XDECREF(u);
5572 Py_XDECREF(v);
5573 return NULL;
5574}
5575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005576PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577"S.count(sub[, start[, end]]) -> int\n\
5578\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005579Return the number of non-overlapping occurrences of substring sub in\n\
5580Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005581interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
5583static PyObject *
5584unicode_count(PyUnicodeObject *self, PyObject *args)
5585{
5586 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005587 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005588 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 PyObject *result;
5590
Guido van Rossumb8872e62000-05-09 14:14:27 +00005591 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5592 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 return NULL;
5594
5595 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005596 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 if (substring == NULL)
5598 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005599
Thomas Wouters477c8d52006-05-27 19:21:47 +00005600 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601
Thomas Wouters477c8d52006-05-27 19:21:47 +00005602 result = PyInt_FromSsize_t(
5603 stringlib_count(self->str + start, end - start,
5604 substring->str, substring->length)
5605 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606
5607 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005608
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 return result;
5610}
5611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005612PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005613"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005615Encodes S using the codec registered for encoding. encoding defaults\n\
5616to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005617handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5619'xmlcharrefreplace' as well as any other name registered with\n\
5620codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621
5622static PyObject *
5623unicode_encode(PyUnicodeObject *self, PyObject *args)
5624{
5625 char *encoding = NULL;
5626 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005627 PyObject *v;
5628
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5630 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005631 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005632 if (v == NULL)
5633 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005634 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5635 PyErr_Format(PyExc_TypeError,
5636 "encoder did not return a string/unicode object "
5637 "(type=%.400s)",
5638 v->ob_type->tp_name);
5639 Py_DECREF(v);
5640 return NULL;
5641 }
5642 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005643
5644 onError:
5645 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005646}
5647
5648PyDoc_STRVAR(decode__doc__,
5649"S.decode([encoding[,errors]]) -> string or unicode\n\
5650\n\
5651Decodes S using the codec registered for encoding. encoding defaults\n\
5652to the default encoding. errors may be given to set a different error\n\
5653handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5654a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5655as well as any other name registerd with codecs.register_error that is\n\
5656able to handle UnicodeDecodeErrors.");
5657
5658static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005659unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005660{
5661 char *encoding = NULL;
5662 char *errors = NULL;
5663 PyObject *v;
5664
5665 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5666 return NULL;
5667 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005668 if (v == NULL)
5669 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005670 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5671 PyErr_Format(PyExc_TypeError,
5672 "decoder did not return a string/unicode object "
5673 "(type=%.400s)",
5674 v->ob_type->tp_name);
5675 Py_DECREF(v);
5676 return NULL;
5677 }
5678 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005679
5680 onError:
5681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682}
5683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005684PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685"S.expandtabs([tabsize]) -> unicode\n\
5686\n\
5687Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005688If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690static PyObject*
5691unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5692{
5693 Py_UNICODE *e;
5694 Py_UNICODE *p;
5695 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005696 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 PyUnicodeObject *u;
5698 int tabsize = 8;
5699
5700 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5701 return NULL;
5702
Thomas Wouters7e474022000-07-16 12:04:32 +00005703 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 i = j = 0;
5705 e = self->str + self->length;
5706 for (p = self->str; p < e; p++)
5707 if (*p == '\t') {
5708 if (tabsize > 0)
5709 j += tabsize - (j % tabsize);
5710 }
5711 else {
5712 j++;
5713 if (*p == '\n' || *p == '\r') {
5714 i += j;
5715 j = 0;
5716 }
5717 }
5718
5719 /* Second pass: create output string and fill it */
5720 u = _PyUnicode_New(i + j);
5721 if (!u)
5722 return NULL;
5723
5724 j = 0;
5725 q = u->str;
5726
5727 for (p = self->str; p < e; p++)
5728 if (*p == '\t') {
5729 if (tabsize > 0) {
5730 i = tabsize - (j % tabsize);
5731 j += i;
5732 while (i--)
5733 *q++ = ' ';
5734 }
5735 }
5736 else {
5737 j++;
5738 *q++ = *p;
5739 if (*p == '\n' || *p == '\r')
5740 j = 0;
5741 }
5742
5743 return (PyObject*) u;
5744}
5745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005746PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747"S.find(sub [,start [,end]]) -> int\n\
5748\n\
5749Return the lowest index in S where substring sub is found,\n\
5750such that sub is contained within s[start,end]. Optional\n\
5751arguments start and end are interpreted as in slice notation.\n\
5752\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005753Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
5755static PyObject *
5756unicode_find(PyUnicodeObject *self, PyObject *args)
5757{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005758 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005759 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005760 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005761 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
Guido van Rossumb8872e62000-05-09 14:14:27 +00005763 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5764 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005766 substring = PyUnicode_FromObject(substring);
5767 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 return NULL;
5769
Thomas Wouters477c8d52006-05-27 19:21:47 +00005770 result = stringlib_find_slice(
5771 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5772 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5773 start, end
5774 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775
5776 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005777
5778 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779}
5780
5781static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005782unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783{
5784 if (index < 0 || index >= self->length) {
5785 PyErr_SetString(PyExc_IndexError, "string index out of range");
5786 return NULL;
5787 }
5788
5789 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5790}
5791
5792static long
5793unicode_hash(PyUnicodeObject *self)
5794{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005795 /* Since Unicode objects compare equal to their ASCII string
5796 counterparts, they should use the individual character values
5797 as basis for their hash value. This is needed to assure that
5798 strings and Unicode objects behave in the same way as
5799 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Martin v. Löwis18e16552006-02-15 17:27:45 +00005801 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005802 register Py_UNICODE *p;
5803 register long x;
5804
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 if (self->hash != -1)
5806 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005807 len = PyUnicode_GET_SIZE(self);
5808 p = PyUnicode_AS_UNICODE(self);
5809 x = *p << 7;
5810 while (--len >= 0)
5811 x = (1000003*x) ^ *p++;
5812 x ^= PyUnicode_GET_SIZE(self);
5813 if (x == -1)
5814 x = -2;
5815 self->hash = x;
5816 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817}
5818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005819PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820"S.index(sub [,start [,end]]) -> int\n\
5821\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005822Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823
5824static PyObject *
5825unicode_index(PyUnicodeObject *self, PyObject *args)
5826{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005827 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005828 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005829 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005830 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831
Guido van Rossumb8872e62000-05-09 14:14:27 +00005832 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5833 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005835 substring = PyUnicode_FromObject(substring);
5836 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 return NULL;
5838
Thomas Wouters477c8d52006-05-27 19:21:47 +00005839 result = stringlib_find_slice(
5840 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5841 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5842 start, end
5843 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
5845 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005846
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 if (result < 0) {
5848 PyErr_SetString(PyExc_ValueError, "substring not found");
5849 return NULL;
5850 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005851
Martin v. Löwis18e16552006-02-15 17:27:45 +00005852 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853}
5854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005855PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005856"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005858Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005859at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860
5861static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005862unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863{
5864 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5865 register const Py_UNICODE *e;
5866 int cased;
5867
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 /* Shortcut for single character strings */
5869 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005870 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005872 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005873 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005874 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005875
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 e = p + PyUnicode_GET_SIZE(self);
5877 cased = 0;
5878 for (; p < e; p++) {
5879 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005880
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005882 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 else if (!cased && Py_UNICODE_ISLOWER(ch))
5884 cased = 1;
5885 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005886 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887}
5888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005889PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005890"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005892Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005893at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
5895static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005896unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897{
5898 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5899 register const Py_UNICODE *e;
5900 int cased;
5901
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 /* Shortcut for single character strings */
5903 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005904 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005906 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005907 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005908 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 e = p + PyUnicode_GET_SIZE(self);
5911 cased = 0;
5912 for (; p < e; p++) {
5913 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005914
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005916 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 else if (!cased && Py_UNICODE_ISUPPER(ch))
5918 cased = 1;
5919 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005920 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921}
5922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005923PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005924"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005926Return True if S is a titlecased string and there is at least one\n\
5927character in S, i.e. upper- and titlecase characters may only\n\
5928follow uncased characters and lowercase characters only cased ones.\n\
5929Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
5931static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005932unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933{
5934 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5935 register const Py_UNICODE *e;
5936 int cased, previous_is_cased;
5937
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 /* Shortcut for single character strings */
5939 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005940 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5941 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005943 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005944 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005946
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 e = p + PyUnicode_GET_SIZE(self);
5948 cased = 0;
5949 previous_is_cased = 0;
5950 for (; p < e; p++) {
5951 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005952
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5954 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005955 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 previous_is_cased = 1;
5957 cased = 1;
5958 }
5959 else if (Py_UNICODE_ISLOWER(ch)) {
5960 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005961 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 previous_is_cased = 1;
5963 cased = 1;
5964 }
5965 else
5966 previous_is_cased = 0;
5967 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005968 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969}
5970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005971PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005972"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005974Return True if all characters in S are whitespace\n\
5975and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
5977static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005978unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979{
5980 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5981 register const Py_UNICODE *e;
5982
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 /* Shortcut for single character strings */
5984 if (PyUnicode_GET_SIZE(self) == 1 &&
5985 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005986 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005988 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005989 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005990 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005991
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 e = p + PyUnicode_GET_SIZE(self);
5993 for (; p < e; p++) {
5994 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005995 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005997 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998}
5999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006000PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006001"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006002\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006003Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006004and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006005
6006static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006007unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006008{
6009 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6010 register const Py_UNICODE *e;
6011
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006012 /* Shortcut for single character strings */
6013 if (PyUnicode_GET_SIZE(self) == 1 &&
6014 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006015 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006016
6017 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006018 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006019 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006020
6021 e = p + PyUnicode_GET_SIZE(self);
6022 for (; p < e; p++) {
6023 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006024 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006025 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006026 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006027}
6028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006029PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006030"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006031\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006032Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006033and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006034
6035static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006036unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006037{
6038 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6039 register const Py_UNICODE *e;
6040
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006041 /* Shortcut for single character strings */
6042 if (PyUnicode_GET_SIZE(self) == 1 &&
6043 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006044 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006045
6046 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006047 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006048 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006049
6050 e = p + PyUnicode_GET_SIZE(self);
6051 for (; p < e; p++) {
6052 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006053 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006054 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006055 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006056}
6057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006058PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006059"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006062False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063
6064static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006065unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066{
6067 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6068 register const Py_UNICODE *e;
6069
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 /* Shortcut for single character strings */
6071 if (PyUnicode_GET_SIZE(self) == 1 &&
6072 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006073 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006075 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006076 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006077 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006078
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 e = p + PyUnicode_GET_SIZE(self);
6080 for (; p < e; p++) {
6081 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006082 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006084 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085}
6086
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006087PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006088"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006090Return True if all characters in S are digits\n\
6091and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092
6093static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006094unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095{
6096 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6097 register const Py_UNICODE *e;
6098
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 /* Shortcut for single character strings */
6100 if (PyUnicode_GET_SIZE(self) == 1 &&
6101 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006102 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006104 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006105 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006106 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006107
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 e = p + PyUnicode_GET_SIZE(self);
6109 for (; p < e; p++) {
6110 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006111 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006113 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114}
6115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006116PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006117"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006120False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
6122static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006123unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124{
6125 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6126 register const Py_UNICODE *e;
6127
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 /* Shortcut for single character strings */
6129 if (PyUnicode_GET_SIZE(self) == 1 &&
6130 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006131 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006133 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006134 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006135 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006136
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 e = p + PyUnicode_GET_SIZE(self);
6138 for (; p < e; p++) {
6139 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006140 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006142 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143}
6144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006145PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146"S.join(sequence) -> unicode\n\
6147\n\
6148Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006149sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
6151static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006152unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006154 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155}
6156
Martin v. Löwis18e16552006-02-15 17:27:45 +00006157static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158unicode_length(PyUnicodeObject *self)
6159{
6160 return self->length;
6161}
6162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006163PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006164"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165\n\
6166Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006167done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
6169static PyObject *
6170unicode_ljust(PyUnicodeObject *self, PyObject *args)
6171{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006172 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006173 Py_UNICODE fillchar = ' ';
6174
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006175 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 return NULL;
6177
Tim Peters7a29bd52001-09-12 03:03:31 +00006178 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 Py_INCREF(self);
6180 return (PyObject*) self;
6181 }
6182
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006183 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006186PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187"S.lower() -> unicode\n\
6188\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006189Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
6191static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006192unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 return fixup(self, fixlower);
6195}
6196
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for a strip mode: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
6205
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006206/* externally visible for str.strip(unicode) */
6207PyObject *
6208_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6209{
6210 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006211 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006212 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006213 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6214 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006215
Thomas Wouters477c8d52006-05-27 19:21:47 +00006216 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6217
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006218 i = 0;
6219 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006220 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6221 i++;
6222 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006223 }
6224
6225 j = len;
6226 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006227 do {
6228 j--;
6229 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6230 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006231 }
6232
6233 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006234 Py_INCREF(self);
6235 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006236 }
6237 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006238 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006239}
6240
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241
6242static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006243do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006245 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006246 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006247
6248 i = 0;
6249 if (striptype != RIGHTSTRIP) {
6250 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6251 i++;
6252 }
6253 }
6254
6255 j = len;
6256 if (striptype != LEFTSTRIP) {
6257 do {
6258 j--;
6259 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6260 j++;
6261 }
6262
6263 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6264 Py_INCREF(self);
6265 return (PyObject*)self;
6266 }
6267 else
6268 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269}
6270
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006271
6272static PyObject *
6273do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6274{
6275 PyObject *sep = NULL;
6276
6277 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6278 return NULL;
6279
6280 if (sep != NULL && sep != Py_None) {
6281 if (PyUnicode_Check(sep))
6282 return _PyUnicode_XStrip(self, striptype, sep);
6283 else if (PyString_Check(sep)) {
6284 PyObject *res;
6285 sep = PyUnicode_FromObject(sep);
6286 if (sep==NULL)
6287 return NULL;
6288 res = _PyUnicode_XStrip(self, striptype, sep);
6289 Py_DECREF(sep);
6290 return res;
6291 }
6292 else {
6293 PyErr_Format(PyExc_TypeError,
6294 "%s arg must be None, unicode or str",
6295 STRIPNAME(striptype));
6296 return NULL;
6297 }
6298 }
6299
6300 return do_strip(self, striptype);
6301}
6302
6303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006304PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006305"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006306\n\
6307Return a copy of the string S with leading and trailing\n\
6308whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006309If chars is given and not None, remove characters in chars instead.\n\
6310If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006311
6312static PyObject *
6313unicode_strip(PyUnicodeObject *self, PyObject *args)
6314{
6315 if (PyTuple_GET_SIZE(args) == 0)
6316 return do_strip(self, BOTHSTRIP); /* Common case */
6317 else
6318 return do_argstrip(self, BOTHSTRIP, args);
6319}
6320
6321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006322PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006323"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006324\n\
6325Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006326If chars is given and not None, remove characters in chars instead.\n\
6327If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006328
6329static PyObject *
6330unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6331{
6332 if (PyTuple_GET_SIZE(args) == 0)
6333 return do_strip(self, LEFTSTRIP); /* Common case */
6334 else
6335 return do_argstrip(self, LEFTSTRIP, args);
6336}
6337
6338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006339PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006340"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006341\n\
6342Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006343If chars is given and not None, remove characters in chars instead.\n\
6344If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006345
6346static PyObject *
6347unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6348{
6349 if (PyTuple_GET_SIZE(args) == 0)
6350 return do_strip(self, RIGHTSTRIP); /* Common case */
6351 else
6352 return do_argstrip(self, RIGHTSTRIP, args);
6353}
6354
6355
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006357unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358{
6359 PyUnicodeObject *u;
6360 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006361 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006362 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364 if (len < 0)
6365 len = 0;
6366
Tim Peters7a29bd52001-09-12 03:03:31 +00006367 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 /* no repeat, return original string */
6369 Py_INCREF(str);
6370 return (PyObject*) str;
6371 }
Tim Peters8f422462000-09-09 06:13:41 +00006372
6373 /* ensure # of chars needed doesn't overflow int and # of bytes
6374 * needed doesn't overflow size_t
6375 */
6376 nchars = len * str->length;
6377 if (len && nchars / len != str->length) {
6378 PyErr_SetString(PyExc_OverflowError,
6379 "repeated string is too long");
6380 return NULL;
6381 }
6382 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6383 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6384 PyErr_SetString(PyExc_OverflowError,
6385 "repeated string is too long");
6386 return NULL;
6387 }
6388 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 if (!u)
6390 return NULL;
6391
6392 p = u->str;
6393
Thomas Wouters477c8d52006-05-27 19:21:47 +00006394 if (str->length == 1 && len > 0) {
6395 Py_UNICODE_FILL(p, str->str[0], len);
6396 } else {
6397 Py_ssize_t done = 0; /* number of characters copied this far */
6398 if (done < nchars) {
6399 Py_UNICODE_COPY(p, str->str, str->length);
6400 done = str->length;
6401 }
6402 while (done < nchars) {
6403 int n = (done <= nchars-done) ? done : nchars-done;
6404 Py_UNICODE_COPY(p+done, p, n);
6405 done += n;
6406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 }
6408
6409 return (PyObject*) u;
6410}
6411
6412PyObject *PyUnicode_Replace(PyObject *obj,
6413 PyObject *subobj,
6414 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006415 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416{
6417 PyObject *self;
6418 PyObject *str1;
6419 PyObject *str2;
6420 PyObject *result;
6421
6422 self = PyUnicode_FromObject(obj);
6423 if (self == NULL)
6424 return NULL;
6425 str1 = PyUnicode_FromObject(subobj);
6426 if (str1 == NULL) {
6427 Py_DECREF(self);
6428 return NULL;
6429 }
6430 str2 = PyUnicode_FromObject(replobj);
6431 if (str2 == NULL) {
6432 Py_DECREF(self);
6433 Py_DECREF(str1);
6434 return NULL;
6435 }
Tim Petersced69f82003-09-16 20:30:58 +00006436 result = replace((PyUnicodeObject *)self,
6437 (PyUnicodeObject *)str1,
6438 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 maxcount);
6440 Py_DECREF(self);
6441 Py_DECREF(str1);
6442 Py_DECREF(str2);
6443 return result;
6444}
6445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006446PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447"S.replace (old, new[, maxsplit]) -> unicode\n\
6448\n\
6449Return a copy of S with all occurrences of substring\n\
6450old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006451given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452
6453static PyObject*
6454unicode_replace(PyUnicodeObject *self, PyObject *args)
6455{
6456 PyUnicodeObject *str1;
6457 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006458 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 PyObject *result;
6460
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 return NULL;
6463 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6464 if (str1 == NULL)
6465 return NULL;
6466 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006467 if (str2 == NULL) {
6468 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472 result = replace(self, str1, str2, maxcount);
6473
6474 Py_DECREF(str1);
6475 Py_DECREF(str2);
6476 return result;
6477}
6478
6479static
6480PyObject *unicode_repr(PyObject *unicode)
6481{
6482 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6483 PyUnicode_GET_SIZE(unicode),
6484 1);
6485}
6486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006487PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488"S.rfind(sub [,start [,end]]) -> int\n\
6489\n\
6490Return the highest index in S where substring sub is found,\n\
6491such that sub is contained within s[start,end]. Optional\n\
6492arguments start and end are interpreted as in slice notation.\n\
6493\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006494Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495
6496static PyObject *
6497unicode_rfind(PyUnicodeObject *self, PyObject *args)
6498{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006499 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006500 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006501 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006502 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503
Guido van Rossumb8872e62000-05-09 14:14:27 +00006504 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6505 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006507 substring = PyUnicode_FromObject(substring);
6508 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 return NULL;
6510
Thomas Wouters477c8d52006-05-27 19:21:47 +00006511 result = stringlib_rfind_slice(
6512 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6513 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6514 start, end
6515 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
6517 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006518
6519 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520}
6521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006522PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523"S.rindex(sub [,start [,end]]) -> int\n\
6524\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006525Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526
6527static PyObject *
6528unicode_rindex(PyUnicodeObject *self, PyObject *args)
6529{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006531 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006532 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534
Guido van Rossumb8872e62000-05-09 14:14:27 +00006535 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6536 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006538 substring = PyUnicode_FromObject(substring);
6539 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 return NULL;
6541
Thomas Wouters477c8d52006-05-27 19:21:47 +00006542 result = stringlib_rfind_slice(
6543 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6544 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6545 start, end
6546 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547
6548 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 if (result < 0) {
6551 PyErr_SetString(PyExc_ValueError, "substring not found");
6552 return NULL;
6553 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006554 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555}
6556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006557PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006558"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559\n\
6560Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006561done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
6563static PyObject *
6564unicode_rjust(PyUnicodeObject *self, PyObject *args)
6565{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006566 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006567 Py_UNICODE fillchar = ' ';
6568
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006569 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 return NULL;
6571
Tim Peters7a29bd52001-09-12 03:03:31 +00006572 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 Py_INCREF(self);
6574 return (PyObject*) self;
6575 }
6576
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006577 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578}
6579
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006581unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582{
6583 /* standard clamping */
6584 if (start < 0)
6585 start = 0;
6586 if (end < 0)
6587 end = 0;
6588 if (end > self->length)
6589 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006590 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 /* full slice, return original string */
6592 Py_INCREF(self);
6593 return (PyObject*) self;
6594 }
6595 if (start > end)
6596 start = end;
6597 /* copy slice */
6598 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6599 end - start);
6600}
6601
6602PyObject *PyUnicode_Split(PyObject *s,
6603 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006604 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605{
6606 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 s = PyUnicode_FromObject(s);
6609 if (s == NULL)
6610 return NULL;
6611 if (sep != NULL) {
6612 sep = PyUnicode_FromObject(sep);
6613 if (sep == NULL) {
6614 Py_DECREF(s);
6615 return NULL;
6616 }
6617 }
6618
6619 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6620
6621 Py_DECREF(s);
6622 Py_XDECREF(sep);
6623 return result;
6624}
6625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006626PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627"S.split([sep [,maxsplit]]) -> list of strings\n\
6628\n\
6629Return a list of the words in S, using sep as the\n\
6630delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006631splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006632any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633
6634static PyObject*
6635unicode_split(PyUnicodeObject *self, PyObject *args)
6636{
6637 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006638 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639
Martin v. Löwis18e16552006-02-15 17:27:45 +00006640 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 return NULL;
6642
6643 if (substring == Py_None)
6644 return split(self, NULL, maxcount);
6645 else if (PyUnicode_Check(substring))
6646 return split(self, (PyUnicodeObject *)substring, maxcount);
6647 else
6648 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6649}
6650
Thomas Wouters477c8d52006-05-27 19:21:47 +00006651PyObject *
6652PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6653{
6654 PyObject* str_obj;
6655 PyObject* sep_obj;
6656 PyObject* out;
6657
6658 str_obj = PyUnicode_FromObject(str_in);
6659 if (!str_obj)
6660 return NULL;
6661 sep_obj = PyUnicode_FromObject(sep_in);
6662 if (!sep_obj) {
6663 Py_DECREF(str_obj);
6664 return NULL;
6665 }
6666
6667 out = stringlib_partition(
6668 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6669 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6670 );
6671
6672 Py_DECREF(sep_obj);
6673 Py_DECREF(str_obj);
6674
6675 return out;
6676}
6677
6678
6679PyObject *
6680PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6681{
6682 PyObject* str_obj;
6683 PyObject* sep_obj;
6684 PyObject* out;
6685
6686 str_obj = PyUnicode_FromObject(str_in);
6687 if (!str_obj)
6688 return NULL;
6689 sep_obj = PyUnicode_FromObject(sep_in);
6690 if (!sep_obj) {
6691 Py_DECREF(str_obj);
6692 return NULL;
6693 }
6694
6695 out = stringlib_rpartition(
6696 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6697 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6698 );
6699
6700 Py_DECREF(sep_obj);
6701 Py_DECREF(str_obj);
6702
6703 return out;
6704}
6705
6706PyDoc_STRVAR(partition__doc__,
6707"S.partition(sep) -> (head, sep, tail)\n\
6708\n\
6709Searches for the separator sep in S, and returns the part before it,\n\
6710the separator itself, and the part after it. If the separator is not\n\
6711found, returns S and two empty strings.");
6712
6713static PyObject*
6714unicode_partition(PyUnicodeObject *self, PyObject *separator)
6715{
6716 return PyUnicode_Partition((PyObject *)self, separator);
6717}
6718
6719PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006720"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006721\n\
6722Searches for the separator sep in S, starting at the end of S, and returns\n\
6723the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006724separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006725
6726static PyObject*
6727unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6728{
6729 return PyUnicode_RPartition((PyObject *)self, separator);
6730}
6731
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006732PyObject *PyUnicode_RSplit(PyObject *s,
6733 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006734 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006735{
6736 PyObject *result;
6737
6738 s = PyUnicode_FromObject(s);
6739 if (s == NULL)
6740 return NULL;
6741 if (sep != NULL) {
6742 sep = PyUnicode_FromObject(sep);
6743 if (sep == NULL) {
6744 Py_DECREF(s);
6745 return NULL;
6746 }
6747 }
6748
6749 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6750
6751 Py_DECREF(s);
6752 Py_XDECREF(sep);
6753 return result;
6754}
6755
6756PyDoc_STRVAR(rsplit__doc__,
6757"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6758\n\
6759Return a list of the words in S, using sep as the\n\
6760delimiter string, starting at the end of the string and\n\
6761working to the front. If maxsplit is given, at most maxsplit\n\
6762splits are done. If sep is not specified, any whitespace string\n\
6763is a separator.");
6764
6765static PyObject*
6766unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6767{
6768 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006769 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006770
Martin v. Löwis18e16552006-02-15 17:27:45 +00006771 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006772 return NULL;
6773
6774 if (substring == Py_None)
6775 return rsplit(self, NULL, maxcount);
6776 else if (PyUnicode_Check(substring))
6777 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6778 else
6779 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6780}
6781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006783"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784\n\
6785Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006786Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006787is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788
6789static PyObject*
6790unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6791{
Guido van Rossum86662912000-04-11 15:38:46 +00006792 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
Guido van Rossum86662912000-04-11 15:38:46 +00006794 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 return NULL;
6796
Guido van Rossum86662912000-04-11 15:38:46 +00006797 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798}
6799
6800static
6801PyObject *unicode_str(PyUnicodeObject *self)
6802{
Fred Drakee4315f52000-05-09 19:53:39 +00006803 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804}
6805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807"S.swapcase() -> unicode\n\
6808\n\
6809Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006813unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 return fixup(self, fixswapcase);
6816}
6817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006818PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819"S.translate(table) -> unicode\n\
6820\n\
6821Return a copy of the string S, where all characters have been mapped\n\
6822through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006823Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6824Unmapped characters are left untouched. Characters mapped to None\n\
6825are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
6827static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006828unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829{
Tim Petersced69f82003-09-16 20:30:58 +00006830 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006832 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 "ignore");
6834}
6835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006836PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837"S.upper() -> unicode\n\
6838\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840
6841static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006842unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 return fixup(self, fixupper);
6845}
6846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006847PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848"S.zfill(width) -> unicode\n\
6849\n\
6850Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006851of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852
6853static PyObject *
6854unicode_zfill(PyUnicodeObject *self, PyObject *args)
6855{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006856 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 PyUnicodeObject *u;
6858
Martin v. Löwis18e16552006-02-15 17:27:45 +00006859 Py_ssize_t width;
6860 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 return NULL;
6862
6863 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006864 if (PyUnicode_CheckExact(self)) {
6865 Py_INCREF(self);
6866 return (PyObject*) self;
6867 }
6868 else
6869 return PyUnicode_FromUnicode(
6870 PyUnicode_AS_UNICODE(self),
6871 PyUnicode_GET_SIZE(self)
6872 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 }
6874
6875 fill = width - self->length;
6876
6877 u = pad(self, fill, 0, '0');
6878
Walter Dörwald068325e2002-04-15 13:36:47 +00006879 if (u == NULL)
6880 return NULL;
6881
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 if (u->str[fill] == '+' || u->str[fill] == '-') {
6883 /* move sign to beginning of string */
6884 u->str[0] = u->str[fill];
6885 u->str[fill] = '0';
6886 }
6887
6888 return (PyObject*) u;
6889}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006899PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006902Return True if S starts with the specified prefix, False otherwise.\n\
6903With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904With optional end, stop comparing S at that position.\n\
6905prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906
6907static PyObject *
6908unicode_startswith(PyUnicodeObject *self,
6909 PyObject *args)
6910{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006913 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006914 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006917 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006918 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006920 if (PyTuple_Check(subobj)) {
6921 Py_ssize_t i;
6922 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6923 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6924 PyTuple_GET_ITEM(subobj, i));
6925 if (substring == NULL)
6926 return NULL;
6927 result = tailmatch(self, substring, start, end, -1);
6928 Py_DECREF(substring);
6929 if (result) {
6930 Py_RETURN_TRUE;
6931 }
6932 }
6933 /* nothing matched */
6934 Py_RETURN_FALSE;
6935 }
6936 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938 return NULL;
6939 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006941 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942}
6943
6944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006945PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006946"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006948Return True if S ends with the specified suffix, False otherwise.\n\
6949With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950With optional end, stop comparing S at that position.\n\
6951suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952
6953static PyObject *
6954unicode_endswith(PyUnicodeObject *self,
6955 PyObject *args)
6956{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006959 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006960 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6964 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006966 if (PyTuple_Check(subobj)) {
6967 Py_ssize_t i;
6968 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6969 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6970 PyTuple_GET_ITEM(subobj, i));
6971 if (substring == NULL)
6972 return NULL;
6973 result = tailmatch(self, substring, start, end, +1);
6974 Py_DECREF(substring);
6975 if (result) {
6976 Py_RETURN_TRUE;
6977 }
6978 }
6979 Py_RETURN_FALSE;
6980 }
6981 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006985 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006987 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988}
6989
6990
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006991
6992static PyObject *
6993unicode_getnewargs(PyUnicodeObject *v)
6994{
6995 return Py_BuildValue("(u#)", v->str, v->length);
6996}
6997
6998
/* Method table: each entry gives the Python-level method name, the C
   function implementing it, its calling-convention flag and, where
   present, its doc string.  The table is terminated by a {NULL, NULL}
   sentinel entry.
   NOTE(review): presumably installed as the unicode type's tp_methods;
   the use site is outside this excerpt -- confirm. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* Compiled out: capwords is not exposed as a method. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* No doc string is supplied for this entry. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}    /* sentinel */
};
7056
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007057static PyObject *
7058unicode_mod(PyObject *v, PyObject *w)
7059{
7060 if (!PyUnicode_Check(v)) {
7061 Py_INCREF(Py_NotImplemented);
7062 return Py_NotImplemented;
7063 }
7064 return PyUnicode_Format(v, w);
7065}
7066
/* Number-protocol slots.  Only nb_remainder is populated (with
   unicode_mod, so that `%` does string formatting); the three slots
   before it are explicitly 0 and the members after it are left out of
   the initializer. */
static PyNumberMethods unicode_as_number = {
    0,                          /*nb_add*/
    0,                          /*nb_subtract*/
    0,                          /*nb_multiply*/
    unicode_mod,                /*nb_remainder*/
};
7073
/* Sequence-protocol slots.  The two assignment slots (sq_ass_item and
   sq_ass_slice) are 0, i.e. no item or slice assignment handler is
   provided. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
7084
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007085static PyObject*
7086unicode_subscript(PyUnicodeObject* self, PyObject* item)
7087{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007088 if (PyIndex_Check(item)) {
7089 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007090 if (i == -1 && PyErr_Occurred())
7091 return NULL;
7092 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007093 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007094 return unicode_getitem(self, i);
7095 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007096 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007097 Py_UNICODE* source_buf;
7098 Py_UNICODE* result_buf;
7099 PyObject* result;
7100
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007101 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007102 &start, &stop, &step, &slicelength) < 0) {
7103 return NULL;
7104 }
7105
7106 if (slicelength <= 0) {
7107 return PyUnicode_FromUnicode(NULL, 0);
7108 } else {
7109 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007110 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7111 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007112
7113 if (result_buf == NULL)
7114 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007115
7116 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7117 result_buf[i] = source_buf[cur];
7118 }
Tim Petersced69f82003-09-16 20:30:58 +00007119
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007120 result = PyUnicode_FromUnicode(result_buf, slicelength);
7121 PyMem_FREE(result_buf);
7122 return result;
7123 }
7124 } else {
7125 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7126 return NULL;
7127 }
7128}
7129
/* Mapping-protocol slots: length and subscript (index or slice via
   unicode_subscript) are provided; the assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
7135
Martin v. Löwis18e16552006-02-15 17:27:45 +00007136static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007138 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139 const void **ptr)
7140{
7141 if (index != 0) {
7142 PyErr_SetString(PyExc_SystemError,
7143 "accessing non-existent unicode segment");
7144 return -1;
7145 }
7146 *ptr = (void *) self->str;
7147 return PyUnicode_GET_DATA_SIZE(self);
7148}
7149
Martin v. Löwis18e16552006-02-15 17:27:45 +00007150static Py_ssize_t
7151unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 const void **ptr)
7153{
7154 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007155 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 return -1;
7157}
7158
7159static int
7160unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007161 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162{
7163 if (lenp)
7164 *lenp = PyUnicode_GET_DATA_SIZE(self);
7165 return 1;
7166}
7167
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007168static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007170 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 const void **ptr)
7172{
7173 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007174
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 if (index != 0) {
7176 PyErr_SetString(PyExc_SystemError,
7177 "accessing non-existent unicode segment");
7178 return -1;
7179 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007180 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 if (str == NULL)
7182 return -1;
7183 *ptr = (void *) PyString_AS_STRING(str);
7184 return PyString_GET_SIZE(str);
7185}
7186
7187/* Helpers for PyUnicode_Format() */
7188
7189static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007190getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 if (argidx < arglen) {
7194 (*p_argidx)++;
7195 if (arglen < 0)
7196 return args;
7197 else
7198 return PyTuple_GetItem(args, argidx);
7199 }
7200 PyErr_SetString(PyExc_TypeError,
7201 "not enough arguments for format string");
7202 return NULL;
7203}
7204
/* Bit flags collected by PyUnicode_Format() while it scans the flag
   characters that follow '%' in a format specifier. */
#define F_LJUST (1<<0)  /* '-' flag; also set for a negative '*' width */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
7210
Martin v. Löwis18e16552006-02-15 17:27:45 +00007211static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007212strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214 register Py_ssize_t i;
7215 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 for (i = len - 1; i >= 0; i--)
7217 buffer[i] = (Py_UNICODE) charbuffer[i];
7218
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 return len;
7220}
7221
Neal Norwitzfc76d632006-01-10 06:03:13 +00007222static int
7223doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7224{
Tim Peters15231542006-02-16 01:08:01 +00007225 Py_ssize_t result;
7226
Neal Norwitzfc76d632006-01-10 06:03:13 +00007227 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007228 result = strtounicode(buffer, (char *)buffer);
7229 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007230}
7231
7232static int
7233longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7234{
Tim Peters15231542006-02-16 01:08:01 +00007235 Py_ssize_t result;
7236
Neal Norwitzfc76d632006-01-10 06:03:13 +00007237 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007238 result = strtounicode(buffer, (char *)buffer);
7239 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007240}
7241
Guido van Rossum078151d2002-08-11 04:24:12 +00007242/* XXX To save some code duplication, formatfloat/long/int could have been
7243 shared with stringobject.c, converting from 8-bit to Unicode after the
7244 formatting is done. */
7245
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246static int
7247formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007248 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 int flags,
7250 int prec,
7251 int type,
7252 PyObject *v)
7253{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007254 /* fmt = '%#.' + `prec` + `type`
7255 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 char fmt[20];
7257 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007258
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 x = PyFloat_AsDouble(v);
7260 if (x == -1.0 && PyErr_Occurred())
7261 return -1;
7262 if (prec < 0)
7263 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7265 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007266 /* Worst case length calc to ensure no buffer overrun:
7267
7268 'g' formats:
7269 fmt = %#.<prec>g
7270 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7271 for any double rep.)
7272 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7273
7274 'f' formats:
7275 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7276 len = 1 + 50 + 1 + prec = 52 + prec
7277
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007278 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007279 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007280
7281 */
7282 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7283 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007284 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007285 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007286 return -1;
7287 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007288 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7289 (flags&F_ALT) ? "#" : "",
7290 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007291 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292}
7293
Tim Peters38fd5b62000-09-21 05:43:11 +00007294static PyObject*
7295formatlong(PyObject *val, int flags, int prec, int type)
7296{
7297 char *buf;
7298 int i, len;
7299 PyObject *str; /* temporary string object. */
7300 PyUnicodeObject *result;
7301
7302 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7303 if (!str)
7304 return NULL;
7305 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007306 if (!result) {
7307 Py_DECREF(str);
7308 return NULL;
7309 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007310 for (i = 0; i < len; i++)
7311 result->str[i] = buf[i];
7312 result->str[len] = 0;
7313 Py_DECREF(str);
7314 return (PyObject*)result;
7315}
7316
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317static int
7318formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007319 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320 int flags,
7321 int prec,
7322 int type,
7323 PyObject *v)
7324{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007325 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007326 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7327 * + 1 + 1
7328 * = 24
7329 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007330 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007331 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 long x;
7333
7334 x = PyInt_AsLong(v);
7335 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007336 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007337 if (x < 0 && type == 'u') {
7338 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007339 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007340 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7341 sign = "-";
7342 else
7343 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007345 prec = 1;
7346
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007347 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7348 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007349 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007350 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007351 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007352 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007353 return -1;
7354 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007355
7356 if ((flags & F_ALT) &&
7357 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007358 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007359 * of issues that cause pain:
7360 * - when 0 is being converted, the C standard leaves off
7361 * the '0x' or '0X', which is inconsistent with other
7362 * %#x/%#X conversions and inconsistent with Python's
7363 * hex() function
7364 * - there are platforms that violate the standard and
7365 * convert 0 with the '0x' or '0X'
7366 * (Metrowerks, Compaq Tru64)
7367 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007368 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007369 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007370 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007371 * We can achieve the desired consistency by inserting our
7372 * own '0x' or '0X' prefix, and substituting %x/%X in place
7373 * of %#x/%#X.
7374 *
7375 * Note that this is the same approach as used in
7376 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007377 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007378 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7379 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007380 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007381 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007382 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7383 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007384 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007385 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007386 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007387 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007388 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007389 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
7391
7392static int
7393formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007394 size_t buflen,
7395 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007397 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007398 if (PyUnicode_Check(v)) {
7399 if (PyUnicode_GET_SIZE(v) != 1)
7400 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007404 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007405 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007406 goto onError;
7407 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
7410 else {
7411 /* Integer input truncated to a character */
7412 long x;
7413 x = PyInt_AsLong(v);
7414 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007415 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007416#ifdef Py_UNICODE_WIDE
7417 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007418 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007419 "%c arg not in range(0x110000) "
7420 "(wide Python build)");
7421 return -1;
7422 }
7423#else
7424 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007425 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007426 "%c arg not in range(0x10000) "
7427 "(narrow Python build)");
7428 return -1;
7429 }
7430#endif
7431 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 }
7433 buf[1] = '\0';
7434 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007435
7436 onError:
7437 PyErr_SetString(PyExc_TypeError,
7438 "%c requires int or char");
7439 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440}
7441
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The length is counted in Py_UNICODE elements, not bytes: it is the
   dimension of the Py_UNICODE formatbuf[] array in PyUnicode_Format().
*/
#define FORMATBUFLEN (size_t)120
7451
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452PyObject *PyUnicode_Format(PyObject *format,
7453 PyObject *args)
7454{
7455 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007456 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 int args_owned = 0;
7458 PyUnicodeObject *result = NULL;
7459 PyObject *dict = NULL;
7460 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 if (format == NULL || args == NULL) {
7463 PyErr_BadInternalCall();
7464 return NULL;
7465 }
7466 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007467 if (uformat == NULL)
7468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 fmt = PyUnicode_AS_UNICODE(uformat);
7470 fmtcnt = PyUnicode_GET_SIZE(uformat);
7471
7472 reslen = rescnt = fmtcnt + 100;
7473 result = _PyUnicode_New(reslen);
7474 if (result == NULL)
7475 goto onError;
7476 res = PyUnicode_AS_UNICODE(result);
7477
7478 if (PyTuple_Check(args)) {
7479 arglen = PyTuple_Size(args);
7480 argidx = 0;
7481 }
7482 else {
7483 arglen = -1;
7484 argidx = -2;
7485 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007486 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7487 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 dict = args;
7489
7490 while (--fmtcnt >= 0) {
7491 if (*fmt != '%') {
7492 if (--rescnt < 0) {
7493 rescnt = fmtcnt + 100;
7494 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007495 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007496 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7498 --rescnt;
7499 }
7500 *res++ = *fmt++;
7501 }
7502 else {
7503 /* Got a format specifier */
7504 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507 Py_UNICODE c = '\0';
7508 Py_UNICODE fill;
7509 PyObject *v = NULL;
7510 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007511 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007513 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007514 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
7516 fmt++;
7517 if (*fmt == '(') {
7518 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007519 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 PyObject *key;
7521 int pcount = 1;
7522
7523 if (dict == NULL) {
7524 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007525 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 goto onError;
7527 }
7528 ++fmt;
7529 --fmtcnt;
7530 keystart = fmt;
7531 /* Skip over balanced parentheses */
7532 while (pcount > 0 && --fmtcnt >= 0) {
7533 if (*fmt == ')')
7534 --pcount;
7535 else if (*fmt == '(')
7536 ++pcount;
7537 fmt++;
7538 }
7539 keylen = fmt - keystart - 1;
7540 if (fmtcnt < 0 || pcount > 0) {
7541 PyErr_SetString(PyExc_ValueError,
7542 "incomplete format key");
7543 goto onError;
7544 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007545#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007546 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 then looked up since Python uses strings to hold
7548 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007549 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 key = PyUnicode_EncodeUTF8(keystart,
7551 keylen,
7552 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007553#else
7554 key = PyUnicode_FromUnicode(keystart, keylen);
7555#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 if (key == NULL)
7557 goto onError;
7558 if (args_owned) {
7559 Py_DECREF(args);
7560 args_owned = 0;
7561 }
7562 args = PyObject_GetItem(dict, key);
7563 Py_DECREF(key);
7564 if (args == NULL) {
7565 goto onError;
7566 }
7567 args_owned = 1;
7568 arglen = -1;
7569 argidx = -2;
7570 }
7571 while (--fmtcnt >= 0) {
7572 switch (c = *fmt++) {
7573 case '-': flags |= F_LJUST; continue;
7574 case '+': flags |= F_SIGN; continue;
7575 case ' ': flags |= F_BLANK; continue;
7576 case '#': flags |= F_ALT; continue;
7577 case '0': flags |= F_ZERO; continue;
7578 }
7579 break;
7580 }
7581 if (c == '*') {
7582 v = getnextarg(args, arglen, &argidx);
7583 if (v == NULL)
7584 goto onError;
7585 if (!PyInt_Check(v)) {
7586 PyErr_SetString(PyExc_TypeError,
7587 "* wants int");
7588 goto onError;
7589 }
7590 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007591 if (width == -1 && PyErr_Occurred())
7592 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 if (width < 0) {
7594 flags |= F_LJUST;
7595 width = -width;
7596 }
7597 if (--fmtcnt >= 0)
7598 c = *fmt++;
7599 }
7600 else if (c >= '0' && c <= '9') {
7601 width = c - '0';
7602 while (--fmtcnt >= 0) {
7603 c = *fmt++;
7604 if (c < '0' || c > '9')
7605 break;
7606 if ((width*10) / 10 != width) {
7607 PyErr_SetString(PyExc_ValueError,
7608 "width too big");
7609 goto onError;
7610 }
7611 width = width*10 + (c - '0');
7612 }
7613 }
7614 if (c == '.') {
7615 prec = 0;
7616 if (--fmtcnt >= 0)
7617 c = *fmt++;
7618 if (c == '*') {
7619 v = getnextarg(args, arglen, &argidx);
7620 if (v == NULL)
7621 goto onError;
7622 if (!PyInt_Check(v)) {
7623 PyErr_SetString(PyExc_TypeError,
7624 "* wants int");
7625 goto onError;
7626 }
7627 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007628 if (prec == -1 && PyErr_Occurred())
7629 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 if (prec < 0)
7631 prec = 0;
7632 if (--fmtcnt >= 0)
7633 c = *fmt++;
7634 }
7635 else if (c >= '0' && c <= '9') {
7636 prec = c - '0';
7637 while (--fmtcnt >= 0) {
7638 c = Py_CHARMASK(*fmt++);
7639 if (c < '0' || c > '9')
7640 break;
7641 if ((prec*10) / 10 != prec) {
7642 PyErr_SetString(PyExc_ValueError,
7643 "prec too big");
7644 goto onError;
7645 }
7646 prec = prec*10 + (c - '0');
7647 }
7648 }
7649 } /* prec */
7650 if (fmtcnt >= 0) {
7651 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 if (--fmtcnt >= 0)
7653 c = *fmt++;
7654 }
7655 }
7656 if (fmtcnt < 0) {
7657 PyErr_SetString(PyExc_ValueError,
7658 "incomplete format");
7659 goto onError;
7660 }
7661 if (c != '%') {
7662 v = getnextarg(args, arglen, &argidx);
7663 if (v == NULL)
7664 goto onError;
7665 }
7666 sign = 0;
7667 fill = ' ';
7668 switch (c) {
7669
7670 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007671 pbuf = formatbuf;
7672 /* presume that buffer length is at least 1 */
7673 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 len = 1;
7675 break;
7676
7677 case 's':
7678 case 'r':
7679 if (PyUnicode_Check(v) && c == 's') {
7680 temp = v;
7681 Py_INCREF(temp);
7682 }
7683 else {
7684 PyObject *unicode;
7685 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007686 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687 else
7688 temp = PyObject_Repr(v);
7689 if (temp == NULL)
7690 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007691 if (PyUnicode_Check(temp))
7692 /* nothing to do */;
7693 else if (PyString_Check(temp)) {
7694 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007695 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007697 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007699 Py_DECREF(temp);
7700 temp = unicode;
7701 if (temp == NULL)
7702 goto onError;
7703 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007704 else {
7705 Py_DECREF(temp);
7706 PyErr_SetString(PyExc_TypeError,
7707 "%s argument has non-string str()");
7708 goto onError;
7709 }
7710 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007711 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 len = PyUnicode_GET_SIZE(temp);
7713 if (prec >= 0 && len > prec)
7714 len = prec;
7715 break;
7716
7717 case 'i':
7718 case 'd':
7719 case 'u':
7720 case 'o':
7721 case 'x':
7722 case 'X':
7723 if (c == 'i')
7724 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007725 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007726 temp = formatlong(v, flags, prec, c);
7727 if (!temp)
7728 goto onError;
7729 pbuf = PyUnicode_AS_UNICODE(temp);
7730 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007731 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007733 else {
7734 pbuf = formatbuf;
7735 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7736 flags, prec, c, v);
7737 if (len < 0)
7738 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007739 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007740 }
7741 if (flags & F_ZERO)
7742 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 break;
7744
7745 case 'e':
7746 case 'E':
7747 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007748 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 case 'g':
7750 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007751 if (c == 'F')
7752 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007753 pbuf = formatbuf;
7754 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7755 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 if (len < 0)
7757 goto onError;
7758 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007759 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760 fill = '0';
7761 break;
7762
7763 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007764 pbuf = formatbuf;
7765 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 if (len < 0)
7767 goto onError;
7768 break;
7769
7770 default:
7771 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007772 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007773 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007774 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007775 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007776 (Py_ssize_t)(fmt - 1 -
7777 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 goto onError;
7779 }
7780 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007781 if (*pbuf == '-' || *pbuf == '+') {
7782 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 len--;
7784 }
7785 else if (flags & F_SIGN)
7786 sign = '+';
7787 else if (flags & F_BLANK)
7788 sign = ' ';
7789 else
7790 sign = 0;
7791 }
7792 if (width < len)
7793 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007794 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 reslen -= rescnt;
7796 rescnt = width + fmtcnt + 100;
7797 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007798 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007799 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007800 PyErr_NoMemory();
7801 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007802 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007803 if (_PyUnicode_Resize(&result, reslen) < 0) {
7804 Py_XDECREF(temp);
7805 goto onError;
7806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 res = PyUnicode_AS_UNICODE(result)
7808 + reslen - rescnt;
7809 }
7810 if (sign) {
7811 if (fill != ' ')
7812 *res++ = sign;
7813 rescnt--;
7814 if (width > len)
7815 width--;
7816 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007817 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7818 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007819 assert(pbuf[1] == c);
7820 if (fill != ' ') {
7821 *res++ = *pbuf++;
7822 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007823 }
Tim Petersfff53252001-04-12 18:38:48 +00007824 rescnt -= 2;
7825 width -= 2;
7826 if (width < 0)
7827 width = 0;
7828 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 if (width > len && !(flags & F_LJUST)) {
7831 do {
7832 --rescnt;
7833 *res++ = fill;
7834 } while (--width > len);
7835 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007836 if (fill == ' ') {
7837 if (sign)
7838 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007839 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007840 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007841 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007842 *res++ = *pbuf++;
7843 *res++ = *pbuf++;
7844 }
7845 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007846 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 res += len;
7848 rescnt -= len;
7849 while (--width >= len) {
7850 --rescnt;
7851 *res++ = ' ';
7852 }
7853 if (dict && (argidx < arglen) && c != '%') {
7854 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007855 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007856 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 goto onError;
7858 }
7859 Py_XDECREF(temp);
7860 } /* '%' */
7861 } /* until end */
7862 if (argidx < arglen && !dict) {
7863 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007864 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 goto onError;
7866 }
7867
Thomas Woutersa96affe2006-03-12 00:29:36 +00007868 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 if (args_owned) {
7871 Py_DECREF(args);
7872 }
7873 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 return (PyObject *)result;
7875
7876 onError:
7877 Py_XDECREF(result);
7878 Py_DECREF(uformat);
7879 if (args_owned) {
7880 Py_DECREF(args);
7881 }
7882 return NULL;
7883}
7884
7885static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007886 (readbufferproc) unicode_buffer_getreadbuf,
7887 (writebufferproc) unicode_buffer_getwritebuf,
7888 (segcountproc) unicode_buffer_getsegcount,
7889 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890};
7891
Jeremy Hylton938ace62002-07-17 16:30:39 +00007892static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007893unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7894
Tim Peters6d6c1a32001-08-02 04:15:00 +00007895static PyObject *
7896unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7897{
7898 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007899 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007900 char *encoding = NULL;
7901 char *errors = NULL;
7902
Guido van Rossume023fe02001-08-30 03:12:59 +00007903 if (type != &PyUnicode_Type)
7904 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007905 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7906 kwlist, &x, &encoding, &errors))
7907 return NULL;
7908 if (x == NULL)
7909 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007910 if (encoding == NULL && errors == NULL)
7911 return PyObject_Unicode(x);
7912 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007913 return PyUnicode_FromEncodedObject(x, encoding, errors);
7914}
7915
Guido van Rossume023fe02001-08-30 03:12:59 +00007916static PyObject *
7917unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7918{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007919 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007920 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007921
7922 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7923 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7924 if (tmp == NULL)
7925 return NULL;
7926 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007927 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007928 if (pnew == NULL) {
7929 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007930 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007931 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007932 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7933 if (pnew->str == NULL) {
7934 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007935 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007936 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007937 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007938 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007939 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7940 pnew->length = n;
7941 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007942 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007943 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007944}
7945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007946PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007947"unicode(string [, encoding[, errors]]) -> object\n\
7948\n\
7949Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007950encoding defaults to the current default string encoding.\n\
7951errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007952
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007953static PyObject *unicode_iter(PyObject *seq);
7954
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955PyTypeObject PyUnicode_Type = {
7956 PyObject_HEAD_INIT(&PyType_Type)
7957 0, /* ob_size */
7958 "unicode", /* tp_name */
7959 sizeof(PyUnicodeObject), /* tp_size */
7960 0, /* tp_itemsize */
7961 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007962 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007964 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007966 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007967 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007968 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007970 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 (hashfunc) unicode_hash, /* tp_hash*/
7972 0, /* tp_call*/
7973 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007974 PyObject_GenericGetAttr, /* tp_getattro */
7975 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00007977 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
7978 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007979 unicode_doc, /* tp_doc */
7980 0, /* tp_traverse */
7981 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007982 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007983 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007984 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007985 0, /* tp_iternext */
7986 unicode_methods, /* tp_methods */
7987 0, /* tp_members */
7988 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007989 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007990 0, /* tp_dict */
7991 0, /* tp_descr_get */
7992 0, /* tp_descr_set */
7993 0, /* tp_dictoffset */
7994 0, /* tp_init */
7995 0, /* tp_alloc */
7996 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007997 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998};
7999
8000/* Initialize the Unicode implementation */
8001
Thomas Wouters78890102000-07-22 19:25:51 +00008002void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008004 int i;
8005
Thomas Wouters477c8d52006-05-27 19:21:47 +00008006 /* XXX - move this array to unicodectype.c ? */
8007 Py_UNICODE linebreak[] = {
8008 0x000A, /* LINE FEED */
8009 0x000D, /* CARRIAGE RETURN */
8010 0x001C, /* FILE SEPARATOR */
8011 0x001D, /* GROUP SEPARATOR */
8012 0x001E, /* RECORD SEPARATOR */
8013 0x0085, /* NEXT LINE */
8014 0x2028, /* LINE SEPARATOR */
8015 0x2029, /* PARAGRAPH SEPARATOR */
8016 };
8017
Fred Drakee4315f52000-05-09 19:53:39 +00008018 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008019 unicode_freelist = NULL;
8020 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008022 if (!unicode_empty)
8023 return;
8024
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008025 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008026 for (i = 0; i < 256; i++)
8027 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008028 if (PyType_Ready(&PyUnicode_Type) < 0)
8029 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008030
8031 /* initialize the linebreak bloom filter */
8032 bloom_linebreak = make_bloom_mask(
8033 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8034 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008035
8036 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037}
8038
8039/* Finalize the Unicode implementation */
8040
8041void
Thomas Wouters78890102000-07-22 19:25:51 +00008042_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008044 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008045 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008047 Py_XDECREF(unicode_empty);
8048 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008049
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008050 for (i = 0; i < 256; i++) {
8051 if (unicode_latin1[i]) {
8052 Py_DECREF(unicode_latin1[i]);
8053 unicode_latin1[i] = NULL;
8054 }
8055 }
8056
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008057 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 PyUnicodeObject *v = u;
8059 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008060 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008061 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008062 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008063 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008065 unicode_freelist = NULL;
8066 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008068
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008069
8070
8071/********************* Unicode Iterator **************************/
8072
8073typedef struct {
8074 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008075 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008076 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8077} unicodeiterobject;
8078
8079static void
8080unicodeiter_dealloc(unicodeiterobject *it)
8081{
8082 _PyObject_GC_UNTRACK(it);
8083 Py_XDECREF(it->it_seq);
8084 PyObject_GC_Del(it);
8085}
8086
8087static int
8088unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8089{
8090 Py_VISIT(it->it_seq);
8091 return 0;
8092}
8093
8094static PyObject *
8095unicodeiter_next(unicodeiterobject *it)
8096{
8097 PyUnicodeObject *seq;
8098 PyObject *item;
8099
8100 assert(it != NULL);
8101 seq = it->it_seq;
8102 if (seq == NULL)
8103 return NULL;
8104 assert(PyUnicode_Check(seq));
8105
8106 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008107 item = PyUnicode_FromUnicode(
8108 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008109 if (item != NULL)
8110 ++it->it_index;
8111 return item;
8112 }
8113
8114 Py_DECREF(seq);
8115 it->it_seq = NULL;
8116 return NULL;
8117}
8118
8119static PyObject *
8120unicodeiter_len(unicodeiterobject *it)
8121{
8122 Py_ssize_t len = 0;
8123 if (it->it_seq)
8124 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8125 return PyInt_FromSsize_t(len);
8126}
8127
8128PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8129
8130static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008131 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8132 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008133 {NULL, NULL} /* sentinel */
8134};
8135
8136PyTypeObject PyUnicodeIter_Type = {
8137 PyObject_HEAD_INIT(&PyType_Type)
8138 0, /* ob_size */
8139 "unicodeiterator", /* tp_name */
8140 sizeof(unicodeiterobject), /* tp_basicsize */
8141 0, /* tp_itemsize */
8142 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008143 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008144 0, /* tp_print */
8145 0, /* tp_getattr */
8146 0, /* tp_setattr */
8147 0, /* tp_compare */
8148 0, /* tp_repr */
8149 0, /* tp_as_number */
8150 0, /* tp_as_sequence */
8151 0, /* tp_as_mapping */
8152 0, /* tp_hash */
8153 0, /* tp_call */
8154 0, /* tp_str */
8155 PyObject_GenericGetAttr, /* tp_getattro */
8156 0, /* tp_setattro */
8157 0, /* tp_as_buffer */
8158 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8159 0, /* tp_doc */
8160 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8161 0, /* tp_clear */
8162 0, /* tp_richcompare */
8163 0, /* tp_weaklistoffset */
8164 PyObject_SelfIter, /* tp_iter */
8165 (iternextfunc)unicodeiter_next, /* tp_iternext */
8166 unicodeiter_methods, /* tp_methods */
8167 0,
8168};
8169
8170static PyObject *
8171unicode_iter(PyObject *seq)
8172{
8173 unicodeiterobject *it;
8174
8175 if (!PyUnicode_Check(seq)) {
8176 PyErr_BadInternalCall();
8177 return NULL;
8178 }
8179 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8180 if (it == NULL)
8181 return NULL;
8182 it->it_index = 0;
8183 Py_INCREF(seq);
8184 it->it_seq = (PyUnicodeObject *)seq;
8185 _PyObject_GC_TRACK(it);
8186 return (PyObject *)it;
8187}
8188
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008189#ifdef __cplusplus
8190}
8191#endif
8192
8193
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008194/*
8195Local variables:
8196c-basic-offset: 4
8197indent-tabs-mode: nil
8198End:
8199*/