blob: 55d03844d65bee84bd2c8cb5f2e2d7c1ae5525d5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Integer type holding a one-word bloom bit mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK to pre-filter line break tests. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   A zero result means ch cannot be in the set the mask was built from;
   a non-zero result still has to be confirmed by an exact test.

   The constant is widened to BLOOM_MASK before shifting so that bit 31
   can be selected without overflowing a signed int, and both arguments
   are parenthesized so that compound expressions expand correctly. */
#define BLOOM(mask, ch) ((mask) & ((BLOOM_MASK)1 << ((ch) & 0x1F)))

/* True if ch is a line break character; the bloom mask is consulted
   first and Py_UNICODE_ISLINEBREAK() only runs when the mask matches. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* True if chr occurs in set[0..setlen).  The bloom mask is tested first
   so that unicode_member() only scans the set when the mask matches.
   The expansion is fully parenthesized so that it can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
/* We allocate one more Py_UNICODE than requested to make sure the
   string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000341 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
/* Internal API for use in unicodeobject.c only !
   Wrapper around PyUnicode_Resize() that casts the address of an object
   pointer variable to PyObject ** for the call. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000441 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
446 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000447 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
461 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000462 ;
463 }
464
465 return (PyObject *)unicode;
466}
467
Walter Dörwaldd2034312007-05-18 16:29:38 +0000468PyObject *PyUnicode_FromString(const char *u)
469{
470 size_t size = strlen(u);
471 if (size > PY_SSIZE_T_MAX) {
472 PyErr_SetString(PyExc_OverflowError, "input too long");
473 return NULL;
474 }
475
476 return PyUnicode_FromStringAndSize(u, size);
477}
478
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479#ifdef HAVE_WCHAR_H
480
481PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000482 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483{
484 PyUnicodeObject *unicode;
485
486 if (w == NULL) {
487 PyErr_BadInternalCall();
488 return NULL;
489 }
490
491 unicode = _PyUnicode_New(size);
492 if (!unicode)
493 return NULL;
494
495 /* Copy the wchar_t data into the new object */
496#ifdef HAVE_USABLE_WCHAR_T
497 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000498#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000499 {
500 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000501 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000503 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 *u++ = *w++;
505 }
506#endif
507
508 return (PyObject *)unicode;
509}
510
Walter Dörwald346737f2007-05-31 10:44:43 +0000511static void
512makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
513{
514 *fmt++ = '%';
515 if (width) {
516 if (zeropad)
517 *fmt++ = '0';
518 fmt += sprintf(fmt, "%d", width);
519 }
520 if (precision)
521 fmt += sprintf(fmt, ".%d", precision);
522 if (longflag)
523 *fmt++ = 'l';
524 else if (size_tflag) {
525 char *f = PY_FORMAT_SIZE_T;
526 while (*f)
527 *fmt++ = *f++;
528 }
529 *fmt++ = c;
530 *fmt = '\0';
531}
532
/* Append the NUL-terminated char string `string` to the output position
   `s`, one character per output element, advancing `s` past the copied
   text.  Uses the caller's local variables `copy` (scratch pointer) and
   `s` (output cursor).  Wrapped in do { } while (0) so the macro behaves
   as a single statement, e.g. in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
534
535PyObject *
536PyUnicode_FromFormatV(const char *format, va_list vargs)
537{
538 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000539 Py_ssize_t callcount = 0;
540 PyObject **callresults = NULL;
541 PyObject **callresult;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000542 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000543 int width = 0;
544 int precision = 0;
545 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000546 const char* f;
547 Py_UNICODE *s;
548 PyObject *string;
549 /* used by sprintf */
550 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000551 /* use abuffer instead of buffer, if we need more space
552 * (which can happen if there's a format specifier with width). */
553 char *abuffer = NULL;
554 char *realbuffer;
555 Py_ssize_t abuffersize = 0;
556 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000557 const char *copy;
558
559#ifdef VA_LIST_IS_ARRAY
560 Py_MEMCPY(count, vargs, sizeof(va_list));
561#else
562#ifdef __va_copy
563 __va_copy(count, vargs);
564#else
565 count = vargs;
566#endif
567#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000568 /* step 1: count the number of %S/%R format specifications
569 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
570 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000571 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000572 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 ++callcount;
574 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000575 /* step 2: allocate memory for the results of
576 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000577 if (callcount) {
578 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
579 if (!callresults) {
580 PyErr_NoMemory();
581 return NULL;
582 }
583 callresult = callresults;
584 }
585 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000586 for (f = format; *f; f++) {
587 if (*f == '%') {
588 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000589 width = 0;
590 while (isdigit(Py_CHARMASK(*f)))
591 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000592 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
593 ;
594
595 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
596 * they don't affect the amount of space we reserve.
597 */
598 if ((*f == 'l' || *f == 'z') &&
599 (f[1] == 'd' || f[1] == 'u'))
600 ++f;
601
602 switch (*f) {
603 case 'c':
604 (void)va_arg(count, int);
605 /* fall through... */
606 case '%':
607 n++;
608 break;
609 case 'd': case 'u': case 'i': case 'x':
610 (void) va_arg(count, int);
611 /* 20 bytes is enough to hold a 64-bit
612 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000613 This isn't enough for octal.
614 If a width is specified we need more
615 (which we allocate later). */
616 if (width < 20)
617 width = 20;
618 n += width;
619 if (abuffersize < width)
620 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000621 break;
622 case 's':
623 n += strlen(va_arg(count, char*));
624 break;
625 case 'U':
626 {
627 PyObject *obj = va_arg(count, PyObject *);
628 assert(obj && PyUnicode_Check(obj));
629 n += PyUnicode_GET_SIZE(obj);
630 break;
631 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000632 case 'S':
633 {
634 PyObject *obj = va_arg(count, PyObject *);
635 PyObject *str;
636 assert(obj);
637 str = PyObject_Unicode(obj);
638 if (!str)
639 goto fail;
640 n += PyUnicode_GET_SIZE(str);
641 /* Remember the str and switch to the next slot */
642 *callresult++ = str;
643 break;
644 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000645 case 'R':
646 {
647 PyObject *obj = va_arg(count, PyObject *);
648 PyObject *repr;
649 assert(obj);
650 repr = PyObject_Repr(obj);
651 if (!repr)
652 goto fail;
653 n += PyUnicode_GET_SIZE(repr);
654 /* Remember the repr and switch to the next slot */
655 *callresult++ = repr;
656 break;
657 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000658 case 'p':
659 (void) va_arg(count, int);
660 /* maximum 64-bit pointer representation:
661 * 0xffffffffffffffff
662 * so 19 characters is enough.
663 * XXX I count 18 -- what's the extra for?
664 */
665 n += 19;
666 break;
667 default:
668 /* if we stumble upon an unknown
669 formatting code, copy the rest of
670 the format string to the output
671 string. (we cannot just skip the
672 code, since there's no way to know
673 what's in the argument list) */
674 n += strlen(p);
675 goto expand;
676 }
677 } else
678 n++;
679 }
680 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000681 if (abuffersize > 20) {
682 abuffer = PyMem_Malloc(abuffersize);
683 if (!abuffer) {
684 PyErr_NoMemory();
685 goto fail;
686 }
687 realbuffer = abuffer;
688 }
689 else
690 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000691 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000692 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000693 we don't have to resize the string.
694 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000695 string = PyUnicode_FromUnicode(NULL, n);
696 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000697 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000698
699 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000700 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000701
702 for (f = format; *f; f++) {
703 if (*f == '%') {
704 const char* p = f++;
705 int longflag = 0;
706 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000707 zeropad = (*f == '0');
708 /* parse the width.precision part */
709 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000711 width = (width*10) + *f++ - '0';
712 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000713 if (*f == '.') {
714 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000716 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000717 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000718 /* handle the long flag, but only for %ld and %lu.
719 others can be added when necessary. */
720 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
721 longflag = 1;
722 ++f;
723 }
724 /* handle the size_t flag. */
725 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
726 size_tflag = 1;
727 ++f;
728 }
729
730 switch (*f) {
731 case 'c':
732 *s++ = va_arg(vargs, int);
733 break;
734 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000735 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000736 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000737 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000738 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000739 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000740 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000741 sprintf(realbuffer, fmt, va_arg(vargs, int));
742 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000743 break;
744 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000745 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000747 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
752 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 break;
754 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
756 sprintf(realbuffer, fmt, va_arg(vargs, int));
757 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000758 break;
759 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
761 sprintf(realbuffer, fmt, va_arg(vargs, int));
762 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 break;
764 case 's':
765 p = va_arg(vargs, char*);
766 appendstring(p);
767 break;
768 case 'U':
769 {
770 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000771 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
772 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
773 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000774 break;
775 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000776 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000777 case 'R':
778 {
779 /* unused, since we already have the result */
780 (void) va_arg(vargs, PyObject *);
781 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(*callresult);
782 Py_ssize_t usize = PyUnicode_GET_SIZE(*callresult);
783 Py_ssize_t upos;
784 for (upos = 0; upos<usize;)
785 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000786 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000787 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000788 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000789 ++callresult;
790 break;
791 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 case 'p':
793 sprintf(buffer, "%p", va_arg(vargs, void*));
794 /* %p is ill-defined: ensure leading 0x. */
795 if (buffer[1] == 'X')
796 buffer[1] = 'x';
797 else if (buffer[1] != 'x') {
798 memmove(buffer+2, buffer, strlen(buffer)+1);
799 buffer[0] = '0';
800 buffer[1] = 'x';
801 }
802 appendstring(buffer);
803 break;
804 case '%':
805 *s++ = '%';
806 break;
807 default:
808 appendstring(p);
809 goto end;
810 }
811 } else
812 *s++ = *f;
813 }
814
815 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 if (callresults)
817 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000818 if (abuffer)
819 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
821 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000822 fail:
823 if (callresults) {
824 PyObject **callresult2 = callresults;
825 while (callresult2 <= callresult) {
826 Py_DECREF(*callresult2);
827 ++callresult2;
828 }
829 PyMem_Free(callresults);
830 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000831 if (abuffer)
832 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000833 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000834}
835
836#undef appendstring
837
838PyObject *
839PyUnicode_FromFormat(const char *format, ...)
840{
841 PyObject* ret;
842 va_list vargs;
843
844#ifdef HAVE_STDARG_PROTOTYPES
845 va_start(vargs, format);
846#else
847 va_start(vargs);
848#endif
849 ret = PyUnicode_FromFormatV(format, vargs);
850 va_end(vargs);
851 return ret;
852}
853
Martin v. Löwis18e16552006-02-15 17:27:45 +0000854Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
855 wchar_t *w,
856 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000857{
858 if (unicode == NULL) {
859 PyErr_BadInternalCall();
860 return -1;
861 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000862
863 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000865 size = PyUnicode_GET_SIZE(unicode) + 1;
866
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867#ifdef HAVE_USABLE_WCHAR_T
868 memcpy(w, unicode->str, size * sizeof(wchar_t));
869#else
870 {
871 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000872 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000874 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875 *w++ = *u++;
876 }
877#endif
878
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000879 if (size > PyUnicode_GET_SIZE(unicode))
880 return PyUnicode_GET_SIZE(unicode);
881 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882 return size;
883}
884
885#endif
886
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000887PyObject *PyUnicode_FromOrdinal(int ordinal)
888{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000889 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000890
891#ifdef Py_UNICODE_WIDE
892 if (ordinal < 0 || ordinal > 0x10ffff) {
893 PyErr_SetString(PyExc_ValueError,
894 "unichr() arg not in range(0x110000) "
895 "(wide Python build)");
896 return NULL;
897 }
898#else
899 if (ordinal < 0 || ordinal > 0xffff) {
900 PyErr_SetString(PyExc_ValueError,
901 "unichr() arg not in range(0x10000) "
902 "(narrow Python build)");
903 return NULL;
904 }
905#endif
906
Hye-Shik Chang40574832004-04-06 07:24:51 +0000907 s[0] = (Py_UNICODE)ordinal;
908 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000909}
910
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911PyObject *PyUnicode_FromObject(register PyObject *obj)
912{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000913 /* XXX Perhaps we should make this API an alias of
914 PyObject_Unicode() instead ?! */
915 if (PyUnicode_CheckExact(obj)) {
916 Py_INCREF(obj);
917 return obj;
918 }
919 if (PyUnicode_Check(obj)) {
920 /* For a Unicode subtype that's not a Unicode object,
921 return a true Unicode object with the same data. */
922 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
923 PyUnicode_GET_SIZE(obj));
924 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000925 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
926}
927
928PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
929 const char *encoding,
930 const char *errors)
931{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000932 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000933 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000934 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000935
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 if (obj == NULL) {
937 PyErr_BadInternalCall();
938 return NULL;
939 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000940
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000941#if 0
942 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000943 that no encodings is given and then redirect to
944 PyObject_Unicode() which then applies the additional logic for
945 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000946
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000947 NOTE: This API should really only be used for object which
948 represent *encoded* Unicode !
949
950 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000951 if (PyUnicode_Check(obj)) {
952 if (encoding) {
953 PyErr_SetString(PyExc_TypeError,
954 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000955 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000956 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000957 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000958 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000959#else
960 if (PyUnicode_Check(obj)) {
961 PyErr_SetString(PyExc_TypeError,
962 "decoding Unicode is not supported");
963 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000964 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000965#endif
966
967 /* Coerce object */
968 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000969 s = PyString_AS_STRING(obj);
970 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000971 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000972 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
973 /* Overwrite the error message with something more useful in
974 case of a TypeError. */
975 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000977 "coercing to Unicode: need string or buffer, "
978 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000979 obj->ob_type->tp_name);
980 goto onError;
981 }
Tim Petersced69f82003-09-16 20:30:58 +0000982
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000983 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000984 if (len == 0) {
985 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000986 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987 }
Tim Petersced69f82003-09-16 20:30:58 +0000988 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000989 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000990
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000991 return v;
992
993 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000994 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000995}
996
997PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000998 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999 const char *encoding,
1000 const char *errors)
1001{
1002 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001003
1004 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001005 encoding = PyUnicode_GetDefaultEncoding();
1006
1007 /* Shortcuts for common default encodings */
1008 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001010 else if (strcmp(encoding, "latin-1") == 0)
1011 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001012#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1013 else if (strcmp(encoding, "mbcs") == 0)
1014 return PyUnicode_DecodeMBCS(s, size, errors);
1015#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001016 else if (strcmp(encoding, "ascii") == 0)
1017 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018
1019 /* Decode via the codec registry */
1020 buffer = PyBuffer_FromMemory((void *)s, size);
1021 if (buffer == NULL)
1022 goto onError;
1023 unicode = PyCodec_Decode(buffer, encoding, errors);
1024 if (unicode == NULL)
1025 goto onError;
1026 if (!PyUnicode_Check(unicode)) {
1027 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001028 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029 unicode->ob_type->tp_name);
1030 Py_DECREF(unicode);
1031 goto onError;
1032 }
1033 Py_DECREF(buffer);
1034 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001035
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 onError:
1037 Py_XDECREF(buffer);
1038 return NULL;
1039}
1040
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001041PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1042 const char *encoding,
1043 const char *errors)
1044{
1045 PyObject *v;
1046
1047 if (!PyUnicode_Check(unicode)) {
1048 PyErr_BadArgument();
1049 goto onError;
1050 }
1051
1052 if (encoding == NULL)
1053 encoding = PyUnicode_GetDefaultEncoding();
1054
1055 /* Decode via the codec registry */
1056 v = PyCodec_Decode(unicode, encoding, errors);
1057 if (v == NULL)
1058 goto onError;
1059 return v;
1060
1061 onError:
1062 return NULL;
1063}
1064
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001066 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 const char *encoding,
1068 const char *errors)
1069{
1070 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 unicode = PyUnicode_FromUnicode(s, size);
1073 if (unicode == NULL)
1074 return NULL;
1075 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1076 Py_DECREF(unicode);
1077 return v;
1078}
1079
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001080PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1081 const char *encoding,
1082 const char *errors)
1083{
1084 PyObject *v;
1085
1086 if (!PyUnicode_Check(unicode)) {
1087 PyErr_BadArgument();
1088 goto onError;
1089 }
1090
1091 if (encoding == NULL)
1092 encoding = PyUnicode_GetDefaultEncoding();
1093
1094 /* Encode via the codec registry */
1095 v = PyCodec_Encode(unicode, encoding, errors);
1096 if (v == NULL)
1097 goto onError;
1098 return v;
1099
1100 onError:
1101 return NULL;
1102}
1103
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1105 const char *encoding,
1106 const char *errors)
1107{
1108 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001109
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (!PyUnicode_Check(unicode)) {
1111 PyErr_BadArgument();
1112 goto onError;
1113 }
Fred Drakee4315f52000-05-09 19:53:39 +00001114
Tim Petersced69f82003-09-16 20:30:58 +00001115 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001116 encoding = PyUnicode_GetDefaultEncoding();
1117
1118 /* Shortcuts for common default encodings */
1119 if (errors == NULL) {
1120 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001121 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001122 else if (strcmp(encoding, "latin-1") == 0)
1123 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001124#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1125 else if (strcmp(encoding, "mbcs") == 0)
1126 return PyUnicode_AsMBCSString(unicode);
1127#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001128 else if (strcmp(encoding, "ascii") == 0)
1129 return PyUnicode_AsASCIIString(unicode);
1130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131
1132 /* Encode via the codec registry */
1133 v = PyCodec_Encode(unicode, encoding, errors);
1134 if (v == NULL)
1135 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001136 if (!PyBytes_Check(v)) {
1137 if (PyString_Check(v)) {
1138 /* Old codec, turn it into bytes */
1139 PyObject *b = PyBytes_FromObject(v);
1140 Py_DECREF(v);
1141 return b;
1142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001144 "encoder did not return a bytes object "
1145 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1146 v->ob_type->tp_name,
1147 encoding ? encoding : "NULL",
1148 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 Py_DECREF(v);
1150 goto onError;
1151 }
1152 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001153
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 onError:
1155 return NULL;
1156}
1157
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001158PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1159 const char *errors)
1160{
1161 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001162 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001163 if (v)
1164 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001165 if (errors != NULL)
1166 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1167 if (errors == NULL) {
1168 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1169 PyUnicode_GET_SIZE(unicode),
1170 NULL);
1171 }
1172 else {
1173 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1174 }
1175 if (!b)
1176 return NULL;
1177 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1178 PyBytes_Size(b));
1179 Py_DECREF(b);
1180 if (!errors) {
1181 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001182 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001183 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001184 return v;
1185}
1186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1188{
1189 if (!PyUnicode_Check(unicode)) {
1190 PyErr_BadArgument();
1191 goto onError;
1192 }
1193 return PyUnicode_AS_UNICODE(unicode);
1194
1195 onError:
1196 return NULL;
1197}
1198
Martin v. Löwis18e16552006-02-15 17:27:45 +00001199Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200{
1201 if (!PyUnicode_Check(unicode)) {
1202 PyErr_BadArgument();
1203 goto onError;
1204 }
1205 return PyUnicode_GET_SIZE(unicode);
1206
1207 onError:
1208 return -1;
1209}
1210
Thomas Wouters78890102000-07-22 19:25:51 +00001211const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001212{
1213 return unicode_default_encoding;
1214}
1215
1216int PyUnicode_SetDefaultEncoding(const char *encoding)
1217{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001218 if (strcmp(encoding, unicode_default_encoding) != 0) {
1219 PyErr_Format(PyExc_ValueError,
1220 "Can only set default encoding to %s",
1221 unicode_default_encoding);
1222 return -1;
1223 }
Fred Drakee4315f52000-05-09 19:53:39 +00001224 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001225}
1226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227/* error handling callback helper:
1228 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001229 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001230 and adjust various state variables.
1231 return 0 on success, -1 on error
1232*/
1233
1234static
1235int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1236 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001237 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1238 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001240 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001241
1242 PyObject *restuple = NULL;
1243 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001244 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1245 Py_ssize_t requiredsize;
1246 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001247 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001248 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001249 int res = -1;
1250
1251 if (*errorHandler == NULL) {
1252 *errorHandler = PyCodec_LookupError(errors);
1253 if (*errorHandler == NULL)
1254 goto onError;
1255 }
1256
1257 if (*exceptionObject == NULL) {
1258 *exceptionObject = PyUnicodeDecodeError_Create(
1259 encoding, input, insize, *startinpos, *endinpos, reason);
1260 if (*exceptionObject == NULL)
1261 goto onError;
1262 }
1263 else {
1264 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1265 goto onError;
1266 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1267 goto onError;
1268 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1269 goto onError;
1270 }
1271
1272 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1273 if (restuple == NULL)
1274 goto onError;
1275 if (!PyTuple_Check(restuple)) {
1276 PyErr_Format(PyExc_TypeError, &argparse[4]);
1277 goto onError;
1278 }
1279 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1280 goto onError;
1281 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001282 newpos = insize+newpos;
1283 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001284 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001285 goto onError;
1286 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287
1288 /* need more space? (at least enough for what we
1289 have+the replacement+the rest of the string (starting
1290 at the new input position), so we won't have to check space
1291 when there are no errors in the rest of the string) */
1292 repptr = PyUnicode_AS_UNICODE(repunicode);
1293 repsize = PyUnicode_GET_SIZE(repunicode);
1294 requiredsize = *outpos + repsize + insize-newpos;
1295 if (requiredsize > outsize) {
1296 if (requiredsize<2*outsize)
1297 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001298 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 goto onError;
1300 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1301 }
1302 *endinpos = newpos;
1303 *inptr = input + newpos;
1304 Py_UNICODE_COPY(*outptr, repptr, repsize);
1305 *outptr += repsize;
1306 *outpos += repsize;
1307 /* we made it! */
1308 res = 0;
1309
1310 onError:
1311 Py_XDECREF(restuple);
1312 return res;
1313}
1314
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001315/* --- UTF-7 Codec -------------------------------------------------------- */
1316
1317/* see RFC2152 for details */
1318
/* Per-character classification for the UTF-7 codec, indexed by ASCII
   code (0..127).  Each entry tells whether the character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1337
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c has to be encoded in a shift sequence: anything
   outside 1..127, anything classified 1 in utf7_special, and the optional
   classes 2 (whitespace) and 3 (Set O) when the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 alphabet character. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  This is spelled out
   with ASCII range tests instead of isalnum(): c may be a Py_UNICODE
   value far outside the range isalnum() is defined for, and isalnum()
   depends on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(): map a base64 alphabet character to its 6-bit value.
   Only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for as long as at least six bits are
   pending in ch; bits is reduced by six for each character written. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters to out for as long as at least sixteen bits are
   pending in ch.  Expects a variable `errmsg` and a label `utf7Error` in
   the scope where it is expanded. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001380
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001381PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001382 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001383 const char *errors)
1384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001386 Py_ssize_t startinpos;
1387 Py_ssize_t endinpos;
1388 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001389 const char *e;
1390 PyUnicodeObject *unicode;
1391 Py_UNICODE *p;
1392 const char *errmsg = "";
1393 int inShift = 0;
1394 unsigned int bitsleft = 0;
1395 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 int surrogate = 0;
1397 PyObject *errorHandler = NULL;
1398 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001399
1400 unicode = _PyUnicode_New(size);
1401 if (!unicode)
1402 return NULL;
1403 if (size == 0)
1404 return (PyObject *)unicode;
1405
1406 p = unicode->str;
1407 e = s + size;
1408
1409 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410 Py_UNICODE ch;
1411 restart:
1412 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001413
1414 if (inShift) {
1415 if ((ch == '-') || !B64CHAR(ch)) {
1416 inShift = 0;
1417 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001418
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001419 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1420 if (bitsleft >= 6) {
1421 /* The shift sequence has a partial character in it. If
1422 bitsleft < 6 then we could just classify it as padding
1423 but that is not the case here */
1424
1425 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001426 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001427 }
1428 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001429 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430 here so indicate the potential of a misencoded character. */
1431
1432 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1433 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1434 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001435 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001436 }
1437
1438 if (ch == '-') {
1439 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001440 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001441 inShift = 1;
1442 }
1443 } else if (SPECIAL(ch,0,0)) {
1444 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001445 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001446 } else {
1447 *p++ = ch;
1448 }
1449 } else {
1450 charsleft = (charsleft << 6) | UB64(ch);
1451 bitsleft += 6;
1452 s++;
1453 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1454 }
1455 }
1456 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001457 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001458 s++;
1459 if (s < e && *s == '-') {
1460 s++;
1461 *p++ = '+';
1462 } else
1463 {
1464 inShift = 1;
1465 bitsleft = 0;
1466 }
1467 }
1468 else if (SPECIAL(ch,0,0)) {
1469 errmsg = "unexpected special character";
1470 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001471 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001472 }
1473 else {
1474 *p++ = ch;
1475 s++;
1476 }
1477 continue;
1478 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001479 outpos = p-PyUnicode_AS_UNICODE(unicode);
1480 endinpos = s-starts;
1481 if (unicode_decode_call_errorhandler(
1482 errors, &errorHandler,
1483 "utf7", errmsg,
1484 starts, size, &startinpos, &endinpos, &exc, &s,
1485 (PyObject **)&unicode, &outpos, &p))
1486 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487 }
1488
1489 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001490 outpos = p-PyUnicode_AS_UNICODE(unicode);
1491 endinpos = size;
1492 if (unicode_decode_call_errorhandler(
1493 errors, &errorHandler,
1494 "utf7", "unterminated shift sequence",
1495 starts, size, &startinpos, &endinpos, &exc, &s,
1496 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001497 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 if (s < e)
1499 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001500 }
1501
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001502 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001503 goto onError;
1504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001505 Py_XDECREF(errorHandler);
1506 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001507 return (PyObject *)unicode;
1508
1509onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510 Py_XDECREF(errorHandler);
1511 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512 Py_DECREF(unicode);
1513 return NULL;
1514}
1515
1516
1517PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001518 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519 int encodeSetO,
1520 int encodeWhiteSpace,
1521 const char *errors)
1522{
1523 PyObject *v;
1524 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001525 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001527 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528 unsigned int bitsleft = 0;
1529 unsigned long charsleft = 0;
1530 char * out;
1531 char * start;
1532
1533 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001534 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535
Walter Dörwald51ab4142007-05-05 14:43:36 +00001536 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001537 if (v == NULL)
1538 return NULL;
1539
Walter Dörwald51ab4142007-05-05 14:43:36 +00001540 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001541 for (;i < size; ++i) {
1542 Py_UNICODE ch = s[i];
1543
1544 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001545 if (ch == '+') {
1546 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 *out++ = '-';
1548 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1549 charsleft = ch;
1550 bitsleft = 16;
1551 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001552 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001554 } else {
1555 *out++ = (char) ch;
1556 }
1557 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1559 *out++ = B64(charsleft << (6-bitsleft));
1560 charsleft = 0;
1561 bitsleft = 0;
1562 /* Characters not in the BASE64 set implicitly unshift the sequence
1563 so no '-' is required, except if the character is itself a '-' */
1564 if (B64CHAR(ch) || ch == '-') {
1565 *out++ = '-';
1566 }
1567 inShift = 0;
1568 *out++ = (char) ch;
1569 } else {
1570 bitsleft += 16;
1571 charsleft = (charsleft << 16) | ch;
1572 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1573
                /* If the next character is special then we don't need to
                   terminate the shift sequence.  If the next character is
                   not a BASE64 character or '-' then the shift sequence
                   will be terminated implicitly and we don't have to
                   insert a '-'. */
1578
1579 if (bitsleft == 0) {
1580 if (i + 1 < size) {
1581 Py_UNICODE ch2 = s[i+1];
1582
1583 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001584
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001585 } else if (B64CHAR(ch2) || ch2 == '-') {
1586 *out++ = '-';
1587 inShift = 0;
1588 } else {
1589 inShift = 0;
1590 }
1591
1592 }
1593 else {
1594 *out++ = '-';
1595 inShift = 0;
1596 }
1597 }
Tim Petersced69f82003-09-16 20:30:58 +00001598 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001600 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601 if (bitsleft) {
1602 *out++= B64(charsleft << (6-bitsleft) );
1603 *out++ = '-';
1604 }
1605
Walter Dörwald51ab4142007-05-05 14:43:36 +00001606 if (PyBytes_Resize(v, out - start)) {
1607 Py_DECREF(v);
1608 return NULL;
1609 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 return v;
1611}
1612
/* Retire the codec-private helper macros so they cannot leak into the
   codecs defined further down.  SPECIAL, B64, B64CHAR and ENCODE are the
   ones used by the encoder directly above; UB64 and DECODE presumably
   belong to the matching decoder (NOTE(review): not visible from here). */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1619
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620/* --- UTF-8 Codec -------------------------------------------------------- */
1621
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry
   is the total length of the sequence that byte announces (see RFC 2279).
   An entry of zero means the byte cannot start a sequence. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: plain ASCII, one byte each */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five bytes;
       0xFC - 0xFD: six bytes; 0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1643
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001645 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 const char *errors)
1647{
Walter Dörwald69652032004-09-07 20:24:22 +00001648 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1649}
1650
1651PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001652 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001653 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001654 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001655{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001656 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001658 Py_ssize_t startinpos;
1659 Py_ssize_t endinpos;
1660 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 const char *e;
1662 PyUnicodeObject *unicode;
1663 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001664 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001665 PyObject *errorHandler = NULL;
1666 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667
1668 /* Note: size will always be longer than the resulting Unicode
1669 character count */
1670 unicode = _PyUnicode_New(size);
1671 if (!unicode)
1672 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001673 if (size == 0) {
1674 if (consumed)
1675 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001677 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678
1679 /* Unpack UTF-8 encoded data */
1680 p = unicode->str;
1681 e = s + size;
1682
1683 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001684 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685
1686 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001687 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 s++;
1689 continue;
1690 }
1691
1692 n = utf8_code_length[ch];
1693
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001694 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001695 if (consumed)
1696 break;
1697 else {
1698 errmsg = "unexpected end of data";
1699 startinpos = s-starts;
1700 endinpos = size;
1701 goto utf8Error;
1702 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704
1705 switch (n) {
1706
1707 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001708 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001709 startinpos = s-starts;
1710 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001711 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712
1713 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001714 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 startinpos = s-starts;
1716 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001717 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718
1719 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001720 if ((s[1] & 0xc0) != 0x80) {
1721 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 startinpos = s-starts;
1723 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001724 goto utf8Error;
1725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001727 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 startinpos = s-starts;
1729 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001730 errmsg = "illegal encoding";
1731 goto utf8Error;
1732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001734 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 break;
1736
1737 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001738 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001739 (s[2] & 0xc0) != 0x80) {
1740 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001741 startinpos = s-starts;
1742 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001743 goto utf8Error;
1744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001746 if (ch < 0x0800) {
1747 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001748 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001749
1750 XXX For wide builds (UCS-4) we should probably try
1751 to recombine the surrogates into a single code
1752 unit.
1753 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001755 startinpos = s-starts;
1756 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 goto utf8Error;
1758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001760 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001761 break;
1762
1763 case 4:
1764 if ((s[1] & 0xc0) != 0x80 ||
1765 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001766 (s[3] & 0xc0) != 0x80) {
1767 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 startinpos = s-starts;
1769 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001770 goto utf8Error;
1771 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001772 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1773 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1774 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001775 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001776 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001777 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001778 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001779 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 startinpos = s-starts;
1782 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001783 goto utf8Error;
1784 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001785#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001786 *p++ = (Py_UNICODE)ch;
1787#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001788 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001789
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001790 /* translate from 10000..10FFFF to 0..FFFF */
1791 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001792
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001793 /* high surrogate = top 10 bits added to D800 */
1794 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001795
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001796 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001797 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001798#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 break;
1800
1801 default:
1802 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001803 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 startinpos = s-starts;
1805 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001806 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 }
1808 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001809 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001810
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001811 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 outpos = p-PyUnicode_AS_UNICODE(unicode);
1813 if (unicode_decode_call_errorhandler(
1814 errors, &errorHandler,
1815 "utf8", errmsg,
1816 starts, size, &startinpos, &endinpos, &exc, &s,
1817 (PyObject **)&unicode, &outpos, &p))
1818 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 }
Walter Dörwald69652032004-09-07 20:24:22 +00001820 if (consumed)
1821 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822
1823 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001824 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 goto onError;
1826
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 Py_XDECREF(errorHandler);
1828 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 return (PyObject *)unicode;
1830
1831onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 Py_XDECREF(errorHandler);
1833 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 Py_DECREF(unicode);
1835 return NULL;
1836}
1837
Tim Peters602f7402002-04-27 18:03:26 +00001838/* Allocation strategy: if the string is short, convert into a stack buffer
1839 and allocate exactly as much space needed at the end. Else allocate the
1840 maximum possible needed (4 result bytes per Unicode character), and return
1841 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001842*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001843PyObject *
1844PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001845 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001847{
Tim Peters602f7402002-04-27 18:03:26 +00001848#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001849
Martin v. Löwis18e16552006-02-15 17:27:45 +00001850 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001851 PyObject *v; /* result string object */
1852 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001853 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001854 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001855 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001856
Tim Peters602f7402002-04-27 18:03:26 +00001857 assert(s != NULL);
1858 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859
Tim Peters602f7402002-04-27 18:03:26 +00001860 if (size <= MAX_SHORT_UNICHARS) {
1861 /* Write into the stack buffer; nallocated can't overflow.
1862 * At the end, we'll allocate exactly as much heap space as it
1863 * turns out we need.
1864 */
1865 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1866 v = NULL; /* will allocate after we're done */
1867 p = stackbuf;
1868 }
1869 else {
1870 /* Overallocate on the heap, and give the excess back at the end. */
1871 nallocated = size * 4;
1872 if (nallocated / 4 != size) /* overflow! */
1873 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001874 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001875 if (v == NULL)
1876 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001877 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001878 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001879
Tim Peters602f7402002-04-27 18:03:26 +00001880 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001881 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001882
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001883 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001884 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001886
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001888 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001889 *p++ = (char)(0xc0 | (ch >> 6));
1890 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001891 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001892 else {
Tim Peters602f7402002-04-27 18:03:26 +00001893 /* Encode UCS2 Unicode ordinals */
1894 if (ch < 0x10000) {
1895 /* Special case: check for high surrogate */
1896 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1897 Py_UCS4 ch2 = s[i];
1898 /* Check for low surrogate and combine the two to
1899 form a UCS4 value */
1900 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001901 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001902 i++;
1903 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001904 }
Tim Peters602f7402002-04-27 18:03:26 +00001905 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001906 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001907 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001908 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1909 *p++ = (char)(0x80 | (ch & 0x3f));
1910 continue;
1911 }
1912encodeUCS4:
1913 /* Encode UCS4 Unicode ordinals */
1914 *p++ = (char)(0xf0 | (ch >> 18));
1915 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1916 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1917 *p++ = (char)(0x80 | (ch & 0x3f));
1918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001920
Tim Peters602f7402002-04-27 18:03:26 +00001921 if (v == NULL) {
1922 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001923 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001924 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001925 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001926 }
1927 else {
1928 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001929 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001930 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001931 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001934
Tim Peters602f7402002-04-27 18:03:26 +00001935#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936}
1937
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1939{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 if (!PyUnicode_Check(unicode)) {
1941 PyErr_BadArgument();
1942 return NULL;
1943 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001944 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1945 PyUnicode_GET_SIZE(unicode),
1946 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947}
1948
1949/* --- UTF-16 Codec ------------------------------------------------------- */
1950
Tim Peters772747b2001-08-09 22:21:55 +00001951PyObject *
1952PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001953 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001954 const char *errors,
1955 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956{
Walter Dörwald69652032004-09-07 20:24:22 +00001957 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1958}
1959
1960PyObject *
1961PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001962 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001963 const char *errors,
1964 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001965 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001966{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001967 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001968 Py_ssize_t startinpos;
1969 Py_ssize_t endinpos;
1970 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 PyUnicodeObject *unicode;
1972 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001973 const unsigned char *q, *e;
1974 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001975 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001976 /* Offsets from q for retrieving byte pairs in the right order. */
1977#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1978 int ihi = 1, ilo = 0;
1979#else
1980 int ihi = 0, ilo = 1;
1981#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001982 PyObject *errorHandler = NULL;
1983 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984
1985 /* Note: size will always be longer than the resulting Unicode
1986 character count */
1987 unicode = _PyUnicode_New(size);
1988 if (!unicode)
1989 return NULL;
1990 if (size == 0)
1991 return (PyObject *)unicode;
1992
1993 /* Unpack UTF-16 encoded data */
1994 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001995 q = (unsigned char *)s;
1996 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997
1998 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001999 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002001 /* Check for BOM marks (U+FEFF) in the input and adjust current
2002 byte order setting accordingly. In native mode, the leading BOM
2003 mark is skipped, in all other modes, it is copied to the output
2004 stream as-is (giving a ZWNBSP character). */
2005 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002006 if (size >= 2) {
2007 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002008#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002009 if (bom == 0xFEFF) {
2010 q += 2;
2011 bo = -1;
2012 }
2013 else if (bom == 0xFFFE) {
2014 q += 2;
2015 bo = 1;
2016 }
Tim Petersced69f82003-09-16 20:30:58 +00002017#else
Walter Dörwald69652032004-09-07 20:24:22 +00002018 if (bom == 0xFEFF) {
2019 q += 2;
2020 bo = 1;
2021 }
2022 else if (bom == 0xFFFE) {
2023 q += 2;
2024 bo = -1;
2025 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002026#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002027 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029
Tim Peters772747b2001-08-09 22:21:55 +00002030 if (bo == -1) {
2031 /* force LE */
2032 ihi = 1;
2033 ilo = 0;
2034 }
2035 else if (bo == 1) {
2036 /* force BE */
2037 ihi = 0;
2038 ilo = 1;
2039 }
2040
2041 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002043 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002045 if (consumed)
2046 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 errmsg = "truncated data";
2048 startinpos = ((const char *)q)-starts;
2049 endinpos = ((const char *)e)-starts;
2050 goto utf16Error;
2051 /* The remaining input chars are ignored if the callback
2052 chooses to skip the input */
2053 }
2054 ch = (q[ihi] << 8) | q[ilo];
2055
Tim Peters772747b2001-08-09 22:21:55 +00002056 q += 2;
2057
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 if (ch < 0xD800 || ch > 0xDFFF) {
2059 *p++ = ch;
2060 continue;
2061 }
2062
2063 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002064 if (q >= e) {
2065 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 startinpos = (((const char *)q)-2)-starts;
2067 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002068 goto utf16Error;
2069 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002070 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002071 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2072 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002073 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002074#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002075 *p++ = ch;
2076 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002077#else
2078 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002079#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002080 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002081 }
2082 else {
2083 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 startinpos = (((const char *)q)-4)-starts;
2085 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002086 goto utf16Error;
2087 }
2088
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002090 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 startinpos = (((const char *)q)-2)-starts;
2092 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002093 /* Fall through to report the error */
2094
2095 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002096 outpos = p-PyUnicode_AS_UNICODE(unicode);
2097 if (unicode_decode_call_errorhandler(
2098 errors, &errorHandler,
2099 "utf16", errmsg,
2100 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2101 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 }
2104
2105 if (byteorder)
2106 *byteorder = bo;
2107
Walter Dörwald69652032004-09-07 20:24:22 +00002108 if (consumed)
2109 *consumed = (const char *)q-starts;
2110
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002112 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 goto onError;
2114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002115 Py_XDECREF(errorHandler);
2116 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 return (PyObject *)unicode;
2118
2119onError:
2120 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 Py_XDECREF(errorHandler);
2122 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 return NULL;
2124}
2125
Tim Peters772747b2001-08-09 22:21:55 +00002126PyObject *
2127PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002128 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002129 const char *errors,
2130 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131{
2132 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002133 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002134#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002135 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002136#else
2137 const int pairs = 0;
2138#endif
Tim Peters772747b2001-08-09 22:21:55 +00002139 /* Offsets from p for storing byte pairs in the right order. */
2140#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2141 int ihi = 1, ilo = 0;
2142#else
2143 int ihi = 0, ilo = 1;
2144#endif
2145
2146#define STORECHAR(CH) \
2147 do { \
2148 p[ihi] = ((CH) >> 8) & 0xff; \
2149 p[ilo] = (CH) & 0xff; \
2150 p += 2; \
2151 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002153#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002154 for (i = pairs = 0; i < size; i++)
2155 if (s[i] >= 0x10000)
2156 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002157#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002158 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002159 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 if (v == NULL)
2161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162
Walter Dörwald3cc34522007-05-04 10:48:27 +00002163 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002165 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002166 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002167 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002168
2169 if (byteorder == -1) {
2170 /* force LE */
2171 ihi = 1;
2172 ilo = 0;
2173 }
2174 else if (byteorder == 1) {
2175 /* force BE */
2176 ihi = 0;
2177 ilo = 1;
2178 }
2179
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002180 while (size-- > 0) {
2181 Py_UNICODE ch = *s++;
2182 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002183#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002184 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002185 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2186 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002188#endif
Tim Peters772747b2001-08-09 22:21:55 +00002189 STORECHAR(ch);
2190 if (ch2)
2191 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002194#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195}
2196
2197PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2198{
2199 if (!PyUnicode_Check(unicode)) {
2200 PyErr_BadArgument();
2201 return NULL;
2202 }
2203 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2204 PyUnicode_GET_SIZE(unicode),
2205 NULL,
2206 0);
2207}
2208
2209/* --- Unicode Escape Codec ----------------------------------------------- */
2210
/* File-private pointer to a _PyUnicode_Name_CAPI table; NULL until
   something assigns it.
   NOTE(review): presumably filled in on demand by the named-character
   escape handling of the unicode-escape decoder -- confirm where it is
   set. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002212
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002214 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 const char *errors)
2216{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002217 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002218 Py_ssize_t startinpos;
2219 Py_ssize_t endinpos;
2220 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002221 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002223 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002225 char* message;
2226 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002227 PyObject *errorHandler = NULL;
2228 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002229
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 /* Escaped strings will always be longer than the resulting
2231 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002232 length after conversion to the true value.
2233 (but if the error callback returns a long replacement string
2234 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235 v = _PyUnicode_New(size);
2236 if (v == NULL)
2237 goto onError;
2238 if (size == 0)
2239 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002241 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002243
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 while (s < end) {
2245 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002246 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002247 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248
2249 /* Non-escape characters are interpreted as Unicode ordinals */
2250 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002251 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 continue;
2253 }
2254
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 /* \ - Escapes */
2257 s++;
2258 switch (*s++) {
2259
2260 /* \x escapes */
2261 case '\n': break;
2262 case '\\': *p++ = '\\'; break;
2263 case '\'': *p++ = '\''; break;
2264 case '\"': *p++ = '\"'; break;
2265 case 'b': *p++ = '\b'; break;
2266 case 'f': *p++ = '\014'; break; /* FF */
2267 case 't': *p++ = '\t'; break;
2268 case 'n': *p++ = '\n'; break;
2269 case 'r': *p++ = '\r'; break;
2270 case 'v': *p++ = '\013'; break; /* VT */
2271 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2272
2273 /* \OOO (octal) escapes */
2274 case '0': case '1': case '2': case '3':
2275 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002276 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002278 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002280 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002282 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 break;
2284
Fredrik Lundhccc74732001-02-18 22:13:49 +00002285 /* hex escapes */
2286 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002288 digits = 2;
2289 message = "truncated \\xXX escape";
2290 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
Fredrik Lundhccc74732001-02-18 22:13:49 +00002292 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002294 digits = 4;
2295 message = "truncated \\uXXXX escape";
2296 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297
Fredrik Lundhccc74732001-02-18 22:13:49 +00002298 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002299 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002300 digits = 8;
2301 message = "truncated \\UXXXXXXXX escape";
2302 hexescape:
2303 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002304 outpos = p-PyUnicode_AS_UNICODE(v);
2305 if (s+digits>end) {
2306 endinpos = size;
2307 if (unicode_decode_call_errorhandler(
2308 errors, &errorHandler,
2309 "unicodeescape", "end of string in escape sequence",
2310 starts, size, &startinpos, &endinpos, &exc, &s,
2311 (PyObject **)&v, &outpos, &p))
2312 goto onError;
2313 goto nextByte;
2314 }
2315 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002316 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002317 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002318 endinpos = (s+i+1)-starts;
2319 if (unicode_decode_call_errorhandler(
2320 errors, &errorHandler,
2321 "unicodeescape", message,
2322 starts, size, &startinpos, &endinpos, &exc, &s,
2323 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002324 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002325 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002326 }
2327 chr = (chr<<4) & ~0xF;
2328 if (c >= '0' && c <= '9')
2329 chr += c - '0';
2330 else if (c >= 'a' && c <= 'f')
2331 chr += 10 + c - 'a';
2332 else
2333 chr += 10 + c - 'A';
2334 }
2335 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002336 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002337 /* _decoding_error will have already written into the
2338 target buffer. */
2339 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002340 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002341 /* when we get here, chr is a 32-bit unicode character */
2342 if (chr <= 0xffff)
2343 /* UCS-2 character */
2344 *p++ = (Py_UNICODE) chr;
2345 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002346 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002347 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002348#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002349 *p++ = chr;
2350#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002351 chr -= 0x10000L;
2352 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002353 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002354#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002355 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002356 endinpos = s-starts;
2357 outpos = p-PyUnicode_AS_UNICODE(v);
2358 if (unicode_decode_call_errorhandler(
2359 errors, &errorHandler,
2360 "unicodeescape", "illegal Unicode character",
2361 starts, size, &startinpos, &endinpos, &exc, &s,
2362 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002363 goto onError;
2364 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002365 break;
2366
2367 /* \N{name} */
2368 case 'N':
2369 message = "malformed \\N character escape";
2370 if (ucnhash_CAPI == NULL) {
2371 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002372 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002373 m = PyImport_ImportModule("unicodedata");
2374 if (m == NULL)
2375 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002376 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002377 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002378 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002379 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002380 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002381 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002382 if (ucnhash_CAPI == NULL)
2383 goto ucnhashError;
2384 }
2385 if (*s == '{') {
2386 const char *start = s+1;
2387 /* look for the closing brace */
2388 while (*s != '}' && s < end)
2389 s++;
2390 if (s > start && s < end && *s == '}') {
2391 /* found a name. look it up in the unicode database */
2392 message = "unknown Unicode character name";
2393 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002394 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002395 goto store;
2396 }
2397 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 endinpos = s-starts;
2399 outpos = p-PyUnicode_AS_UNICODE(v);
2400 if (unicode_decode_call_errorhandler(
2401 errors, &errorHandler,
2402 "unicodeescape", message,
2403 starts, size, &startinpos, &endinpos, &exc, &s,
2404 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002405 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002406 break;
2407
2408 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002409 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002410 message = "\\ at end of string";
2411 s--;
2412 endinpos = s-starts;
2413 outpos = p-PyUnicode_AS_UNICODE(v);
2414 if (unicode_decode_call_errorhandler(
2415 errors, &errorHandler,
2416 "unicodeescape", message,
2417 starts, size, &startinpos, &endinpos, &exc, &s,
2418 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002419 goto onError;
2420 }
2421 else {
2422 *p++ = '\\';
2423 *p++ = (unsigned char)s[-1];
2424 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002425 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002427 nextByte:
2428 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002430 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002431 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002432 Py_XDECREF(errorHandler);
2433 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002435
Fredrik Lundhccc74732001-02-18 22:13:49 +00002436ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002437 PyErr_SetString(
2438 PyExc_UnicodeError,
2439 "\\N escapes not supported (can't load unicodedata module)"
2440 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002441 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002442 Py_XDECREF(errorHandler);
2443 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002444 return NULL;
2445
Fredrik Lundhccc74732001-02-18 22:13:49 +00002446onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002448 Py_XDECREF(errorHandler);
2449 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002450 return NULL;
2451}
2452
2453/* Return a Unicode-Escape string version of the Unicode object.
2454
2455 If quotes is true, the string is enclosed in u"" or u'' quotes as
2456 appropriate.
2457
2458*/
2459
Thomas Wouters477c8d52006-05-27 19:21:47 +00002460Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2461 Py_ssize_t size,
2462 Py_UNICODE ch)
2463{
2464 /* like wcschr, but doesn't stop at NULL characters */
2465
2466 while (size-- > 0) {
2467 if (*s == ch)
2468 return s;
2469 s++;
2470 }
2471
2472 return NULL;
2473}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002474
Walter Dörwald79e913e2007-05-12 11:08:06 +00002475static const char *hexdigits = "0123456789abcdef";
2476
2477PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2478 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479{
2480 PyObject *repr;
2481 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482
Thomas Wouters89f507f2006-12-13 04:49:30 +00002483 /* XXX(nnorwitz): rather than over-allocating, it would be
2484 better to choose a different scheme. Perhaps scan the
2485 first N-chars of the string and allocate based on that size.
2486 */
2487 /* Initial allocation is based on the longest-possible unichr
2488 escape.
2489
2490 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2491 unichr, so in this case it's the longest unichr escape. In
2492 narrow (UTF-16) builds this is five chars per source unichr
2493 since there are two unichrs in the surrogate pair, so in narrow
2494 (UTF-16) builds it's not the longest unichr escape.
2495
2496 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2497 so in the narrow (UTF-16) build case it's the longest unichr
2498 escape.
2499 */
2500
Walter Dörwald79e913e2007-05-12 11:08:06 +00002501 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002502#ifdef Py_UNICODE_WIDE
2503 + 10*size
2504#else
2505 + 6*size
2506#endif
2507 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 if (repr == NULL)
2509 return NULL;
2510
Walter Dörwald79e913e2007-05-12 11:08:06 +00002511 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 while (size-- > 0) {
2514 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002515
Walter Dörwald79e913e2007-05-12 11:08:06 +00002516 /* Escape backslashes */
2517 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518 *p++ = '\\';
2519 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002520 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002521 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002522
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002523#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002524 /* Map 21-bit characters to '\U00xxxxxx' */
2525 else if (ch >= 0x10000) {
2526 *p++ = '\\';
2527 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002528 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2529 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2530 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2531 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2532 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2533 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2534 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2535 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002536 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002537 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002538#else
2539 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002540 else if (ch >= 0xD800 && ch < 0xDC00) {
2541 Py_UNICODE ch2;
2542 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002543
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002544 ch2 = *s++;
2545 size--;
2546 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2547 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2548 *p++ = '\\';
2549 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002550 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2551 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2552 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2553 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2554 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2555 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2556 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2557 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002558 continue;
2559 }
2560 /* Fall through: isolated surrogates are copied as-is */
2561 s--;
2562 size++;
2563 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002564#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002567 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 *p++ = '\\';
2569 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002570 *p++ = hexdigits[(ch >> 12) & 0x000F];
2571 *p++ = hexdigits[(ch >> 8) & 0x000F];
2572 *p++ = hexdigits[(ch >> 4) & 0x000F];
2573 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002575
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002576 /* Map special whitespace to '\t', \n', '\r' */
2577 else if (ch == '\t') {
2578 *p++ = '\\';
2579 *p++ = 't';
2580 }
2581 else if (ch == '\n') {
2582 *p++ = '\\';
2583 *p++ = 'n';
2584 }
2585 else if (ch == '\r') {
2586 *p++ = '\\';
2587 *p++ = 'r';
2588 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002589
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002590 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002591 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002593 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002594 *p++ = hexdigits[(ch >> 4) & 0x000F];
2595 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002596 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002597
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 /* Copy everything else as-is */
2599 else
2600 *p++ = (char) ch;
2601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602
2603 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002604 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2605 Py_DECREF(repr);
2606 return NULL;
2607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 return repr;
2609}
2610
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2612{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002613 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 if (!PyUnicode_Check(unicode)) {
2615 PyErr_BadArgument();
2616 return NULL;
2617 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002618 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2619 PyUnicode_GET_SIZE(unicode));
2620
2621 if (!s)
2622 return NULL;
2623 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2624 PyBytes_GET_SIZE(s));
2625 Py_DECREF(s);
2626 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627}
2628
2629/* --- Raw Unicode Escape Codec ------------------------------------------- */
2630
2631PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002632 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 const char *errors)
2634{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002636 Py_ssize_t startinpos;
2637 Py_ssize_t endinpos;
2638 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 const char *end;
2642 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002643 PyObject *errorHandler = NULL;
2644 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002645
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 /* Escaped strings will always be longer than the resulting
2647 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002648 length after conversion to the true value. (But decoding error
2649 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 v = _PyUnicode_New(size);
2651 if (v == NULL)
2652 goto onError;
2653 if (size == 0)
2654 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002655 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 end = s + size;
2657 while (s < end) {
2658 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002659 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002661 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662
2663 /* Non-escape characters are interpreted as Unicode ordinals */
2664 if (*s != '\\') {
2665 *p++ = (unsigned char)*s++;
2666 continue;
2667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669
2670 /* \u-escapes are only interpreted iff the number of leading
2671 backslashes if odd */
2672 bs = s;
2673 for (;s < end;) {
2674 if (*s != '\\')
2675 break;
2676 *p++ = (unsigned char)*s++;
2677 }
2678 if (((s - bs) & 1) == 0 ||
2679 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002680 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 continue;
2682 }
2683 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002684 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 s++;
2686
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002687 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002688 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002689 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002692 endinpos = s-starts;
2693 if (unicode_decode_call_errorhandler(
2694 errors, &errorHandler,
2695 "rawunicodeescape", "truncated \\uXXXX",
2696 starts, size, &startinpos, &endinpos, &exc, &s,
2697 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002699 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 }
2701 x = (x<<4) & ~0xF;
2702 if (c >= '0' && c <= '9')
2703 x += c - '0';
2704 else if (c >= 'a' && c <= 'f')
2705 x += 10 + c - 'a';
2706 else
2707 x += 10 + c - 'A';
2708 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002709#ifndef Py_UNICODE_WIDE
2710 if (x > 0x10000) {
2711 if (unicode_decode_call_errorhandler(
2712 errors, &errorHandler,
2713 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2714 starts, size, &startinpos, &endinpos, &exc, &s,
2715 (PyObject **)&v, &outpos, &p))
2716 goto onError;
2717 }
2718#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 *p++ = x;
2720 nextByte:
2721 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002723 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002724 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 Py_XDECREF(errorHandler);
2726 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002728
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 onError:
2730 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 Py_XDECREF(errorHandler);
2732 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 return NULL;
2734}
2735
2736PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002737 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738{
2739 PyObject *repr;
2740 char *p;
2741 char *q;
2742
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002743#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002744 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002745#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002746 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002747#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 if (repr == NULL)
2749 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002750 if (size == 0)
2751 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752
Walter Dörwald711005d2007-05-12 12:03:26 +00002753 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 while (size-- > 0) {
2755 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002756#ifdef Py_UNICODE_WIDE
2757 /* Map 32-bit characters to '\Uxxxxxxxx' */
2758 if (ch >= 0x10000) {
2759 *p++ = '\\';
2760 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002761 *p++ = hexdigits[(ch >> 28) & 0xf];
2762 *p++ = hexdigits[(ch >> 24) & 0xf];
2763 *p++ = hexdigits[(ch >> 20) & 0xf];
2764 *p++ = hexdigits[(ch >> 16) & 0xf];
2765 *p++ = hexdigits[(ch >> 12) & 0xf];
2766 *p++ = hexdigits[(ch >> 8) & 0xf];
2767 *p++ = hexdigits[(ch >> 4) & 0xf];
2768 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002769 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002770 else
2771#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 /* Map 16-bit characters to '\uxxxx' */
2773 if (ch >= 256) {
2774 *p++ = '\\';
2775 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002776 *p++ = hexdigits[(ch >> 12) & 0xf];
2777 *p++ = hexdigits[(ch >> 8) & 0xf];
2778 *p++ = hexdigits[(ch >> 4) & 0xf];
2779 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
2781 /* Copy everything else as-is */
2782 else
2783 *p++ = (char) ch;
2784 }
2785 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002786 if (PyBytes_Resize(repr, p - q)) {
2787 Py_DECREF(repr);
2788 return NULL;
2789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 return repr;
2791}
2792
2793PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2794{
Walter Dörwald711005d2007-05-12 12:03:26 +00002795 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002797 PyErr_BadArgument();
2798 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002800 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2801 PyUnicode_GET_SIZE(unicode));
2802
2803 if (!s)
2804 return NULL;
2805 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2806 PyBytes_GET_SIZE(s));
2807 Py_DECREF(s);
2808 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809}
2810
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002811/* --- Unicode Internal Codec ------------------------------------------- */
2812
2813PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002814 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002815 const char *errors)
2816{
2817 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002818 Py_ssize_t startinpos;
2819 Py_ssize_t endinpos;
2820 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002821 PyUnicodeObject *v;
2822 Py_UNICODE *p;
2823 const char *end;
2824 const char *reason;
2825 PyObject *errorHandler = NULL;
2826 PyObject *exc = NULL;
2827
Neal Norwitzd43069c2006-01-08 01:12:10 +00002828#ifdef Py_UNICODE_WIDE
2829 Py_UNICODE unimax = PyUnicode_GetMax();
2830#endif
2831
Thomas Wouters89f507f2006-12-13 04:49:30 +00002832 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002833 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2834 if (v == NULL)
2835 goto onError;
2836 if (PyUnicode_GetSize((PyObject *)v) == 0)
2837 return (PyObject *)v;
2838 p = PyUnicode_AS_UNICODE(v);
2839 end = s + size;
2840
2841 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002842 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002843 /* We have to sanity check the raw data, otherwise doom looms for
2844 some malformed UCS-4 data. */
2845 if (
2846 #ifdef Py_UNICODE_WIDE
2847 *p > unimax || *p < 0 ||
2848 #endif
2849 end-s < Py_UNICODE_SIZE
2850 )
2851 {
2852 startinpos = s - starts;
2853 if (end-s < Py_UNICODE_SIZE) {
2854 endinpos = end-starts;
2855 reason = "truncated input";
2856 }
2857 else {
2858 endinpos = s - starts + Py_UNICODE_SIZE;
2859 reason = "illegal code point (> 0x10FFFF)";
2860 }
2861 outpos = p - PyUnicode_AS_UNICODE(v);
2862 if (unicode_decode_call_errorhandler(
2863 errors, &errorHandler,
2864 "unicode_internal", reason,
2865 starts, size, &startinpos, &endinpos, &exc, &s,
2866 (PyObject **)&v, &outpos, &p)) {
2867 goto onError;
2868 }
2869 }
2870 else {
2871 p++;
2872 s += Py_UNICODE_SIZE;
2873 }
2874 }
2875
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002876 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002877 goto onError;
2878 Py_XDECREF(errorHandler);
2879 Py_XDECREF(exc);
2880 return (PyObject *)v;
2881
2882 onError:
2883 Py_XDECREF(v);
2884 Py_XDECREF(errorHandler);
2885 Py_XDECREF(exc);
2886 return NULL;
2887}
2888
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889/* --- Latin-1 Codec ------------------------------------------------------ */
2890
2891PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002892 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 const char *errors)
2894{
2895 PyUnicodeObject *v;
2896 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002897
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002899 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002900 Py_UNICODE r = *(unsigned char*)s;
2901 return PyUnicode_FromUnicode(&r, 1);
2902 }
2903
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 v = _PyUnicode_New(size);
2905 if (v == NULL)
2906 goto onError;
2907 if (size == 0)
2908 return (PyObject *)v;
2909 p = PyUnicode_AS_UNICODE(v);
2910 while (size-- > 0)
2911 *p++ = (unsigned char)*s++;
2912 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002913
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914 onError:
2915 Py_XDECREF(v);
2916 return NULL;
2917}
2918
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002919/* create or adjust a UnicodeEncodeError */
2920static void make_encode_exception(PyObject **exceptionObject,
2921 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002922 const Py_UNICODE *unicode, Py_ssize_t size,
2923 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002926 if (*exceptionObject == NULL) {
2927 *exceptionObject = PyUnicodeEncodeError_Create(
2928 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929 }
2930 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2932 goto onError;
2933 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2934 goto onError;
2935 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2936 goto onError;
2937 return;
2938 onError:
2939 Py_DECREF(*exceptionObject);
2940 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 }
2942}
2943
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002944/* raises a UnicodeEncodeError */
2945static void raise_encode_exception(PyObject **exceptionObject,
2946 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002947 const Py_UNICODE *unicode, Py_ssize_t size,
2948 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002949 const char *reason)
2950{
2951 make_encode_exception(exceptionObject,
2952 encoding, unicode, size, startpos, endpos, reason);
2953 if (*exceptionObject != NULL)
2954 PyCodec_StrictErrors(*exceptionObject);
2955}
2956
2957/* error handling callback helper:
2958 build arguments, call the callback and check the arguments,
2959 put the result into newpos and return the replacement string, which
2960 has to be freed by the caller */
2961static PyObject *unicode_encode_call_errorhandler(const char *errors,
2962 PyObject **errorHandler,
2963 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002964 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2965 Py_ssize_t startpos, Py_ssize_t endpos,
2966 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002967{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002968 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002969
2970 PyObject *restuple;
2971 PyObject *resunicode;
2972
2973 if (*errorHandler == NULL) {
2974 *errorHandler = PyCodec_LookupError(errors);
2975 if (*errorHandler == NULL)
2976 return NULL;
2977 }
2978
2979 make_encode_exception(exceptionObject,
2980 encoding, unicode, size, startpos, endpos, reason);
2981 if (*exceptionObject == NULL)
2982 return NULL;
2983
2984 restuple = PyObject_CallFunctionObjArgs(
2985 *errorHandler, *exceptionObject, NULL);
2986 if (restuple == NULL)
2987 return NULL;
2988 if (!PyTuple_Check(restuple)) {
2989 PyErr_Format(PyExc_TypeError, &argparse[4]);
2990 Py_DECREF(restuple);
2991 return NULL;
2992 }
2993 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2994 &resunicode, newpos)) {
2995 Py_DECREF(restuple);
2996 return NULL;
2997 }
2998 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002999 *newpos = size+*newpos;
3000 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003001 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003002 Py_DECREF(restuple);
3003 return NULL;
3004 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 Py_INCREF(resunicode);
3006 Py_DECREF(restuple);
3007 return resunicode;
3008}
3009
3010static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003011 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 const char *errors,
3013 int limit)
3014{
3015 /* output object */
3016 PyObject *res;
3017 /* pointers to the beginning and end+1 of input */
3018 const Py_UNICODE *startp = p;
3019 const Py_UNICODE *endp = p + size;
3020 /* pointer to the beginning of the unencodable characters */
3021 /* const Py_UNICODE *badp = NULL; */
3022 /* pointer into the output */
3023 char *str;
3024 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003025 Py_ssize_t respos = 0;
3026 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003027 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3028 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 PyObject *errorHandler = NULL;
3030 PyObject *exc = NULL;
3031 /* the following variable is used for caching string comparisons
3032 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3033 int known_errorHandler = -1;
3034
3035 /* allocate enough for a simple encoding without
3036 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003037 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 if (res == NULL)
3039 goto onError;
3040 if (size == 0)
3041 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003042 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 ressize = size;
3044
3045 while (p<endp) {
3046 Py_UNICODE c = *p;
3047
3048 /* can we encode this? */
3049 if (c<limit) {
3050 /* no overflow check, because we know that the space is enough */
3051 *str++ = (char)c;
3052 ++p;
3053 }
3054 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003055 Py_ssize_t unicodepos = p-startp;
3056 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003058 Py_ssize_t repsize;
3059 Py_ssize_t newpos;
3060 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 Py_UNICODE *uni2;
3062 /* startpos for collecting unencodable chars */
3063 const Py_UNICODE *collstart = p;
3064 const Py_UNICODE *collend = p;
3065 /* find all unecodable characters */
3066 while ((collend < endp) && ((*collend)>=limit))
3067 ++collend;
3068 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3069 if (known_errorHandler==-1) {
3070 if ((errors==NULL) || (!strcmp(errors, "strict")))
3071 known_errorHandler = 1;
3072 else if (!strcmp(errors, "replace"))
3073 known_errorHandler = 2;
3074 else if (!strcmp(errors, "ignore"))
3075 known_errorHandler = 3;
3076 else if (!strcmp(errors, "xmlcharrefreplace"))
3077 known_errorHandler = 4;
3078 else
3079 known_errorHandler = 0;
3080 }
3081 switch (known_errorHandler) {
3082 case 1: /* strict */
3083 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3084 goto onError;
3085 case 2: /* replace */
3086 while (collstart++<collend)
3087 *str++ = '?'; /* fall through */
3088 case 3: /* ignore */
3089 p = collend;
3090 break;
3091 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003092 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 /* determine replacement size (temporarily (mis)uses p) */
3094 for (p = collstart, repsize = 0; p < collend; ++p) {
3095 if (*p<10)
3096 repsize += 2+1+1;
3097 else if (*p<100)
3098 repsize += 2+2+1;
3099 else if (*p<1000)
3100 repsize += 2+3+1;
3101 else if (*p<10000)
3102 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003103#ifndef Py_UNICODE_WIDE
3104 else
3105 repsize += 2+5+1;
3106#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 else if (*p<100000)
3108 repsize += 2+5+1;
3109 else if (*p<1000000)
3110 repsize += 2+6+1;
3111 else
3112 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003113#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 }
3115 requiredsize = respos+repsize+(endp-collend);
3116 if (requiredsize > ressize) {
3117 if (requiredsize<2*ressize)
3118 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003119 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003121 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 ressize = requiredsize;
3123 }
3124 /* generate replacement (temporarily (mis)uses p) */
3125 for (p = collstart; p < collend; ++p) {
3126 str += sprintf(str, "&#%d;", (int)*p);
3127 }
3128 p = collend;
3129 break;
3130 default:
3131 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3132 encoding, reason, startp, size, &exc,
3133 collstart-startp, collend-startp, &newpos);
3134 if (repunicode == NULL)
3135 goto onError;
3136 /* need more space? (at least enough for what we
3137 have+the replacement+the rest of the string, so
3138 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003139 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003140 repsize = PyUnicode_GET_SIZE(repunicode);
3141 requiredsize = respos+repsize+(endp-collend);
3142 if (requiredsize > ressize) {
3143 if (requiredsize<2*ressize)
3144 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003145 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003146 Py_DECREF(repunicode);
3147 goto onError;
3148 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003149 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003150 ressize = requiredsize;
3151 }
3152 /* check if there is anything unencodable in the replacement
3153 and copy it to the output */
3154 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3155 c = *uni2;
3156 if (c >= limit) {
3157 raise_encode_exception(&exc, encoding, startp, size,
3158 unicodepos, unicodepos+1, reason);
3159 Py_DECREF(repunicode);
3160 goto onError;
3161 }
3162 *str = (char)c;
3163 }
3164 p = startp + newpos;
3165 Py_DECREF(repunicode);
3166 }
3167 }
3168 }
3169 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003170 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003171 if (respos<ressize)
3172 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003173 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174 Py_XDECREF(errorHandler);
3175 Py_XDECREF(exc);
3176 return res;
3177
3178 onError:
3179 Py_XDECREF(res);
3180 Py_XDECREF(errorHandler);
3181 Py_XDECREF(exc);
3182 return NULL;
3183}
3184
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003186 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 const char *errors)
3188{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190}
3191
3192PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3193{
3194 if (!PyUnicode_Check(unicode)) {
3195 PyErr_BadArgument();
3196 return NULL;
3197 }
3198 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3199 PyUnicode_GET_SIZE(unicode),
3200 NULL);
3201}
3202
3203/* --- 7-bit ASCII Codec -------------------------------------------------- */
3204
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003206 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 const char *errors)
3208{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 PyUnicodeObject *v;
3211 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003212 Py_ssize_t startinpos;
3213 Py_ssize_t endinpos;
3214 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003215 const char *e;
3216 PyObject *errorHandler = NULL;
3217 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003218
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003220 if (size == 1 && *(unsigned char*)s < 128) {
3221 Py_UNICODE r = *(unsigned char*)s;
3222 return PyUnicode_FromUnicode(&r, 1);
3223 }
Tim Petersced69f82003-09-16 20:30:58 +00003224
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 v = _PyUnicode_New(size);
3226 if (v == NULL)
3227 goto onError;
3228 if (size == 0)
3229 return (PyObject *)v;
3230 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 e = s + size;
3232 while (s < e) {
3233 register unsigned char c = (unsigned char)*s;
3234 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 ++s;
3237 }
3238 else {
3239 startinpos = s-starts;
3240 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003241 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 if (unicode_decode_call_errorhandler(
3243 errors, &errorHandler,
3244 "ascii", "ordinal not in range(128)",
3245 starts, size, &startinpos, &endinpos, &exc, &s,
3246 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003250 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003251 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003252 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 Py_XDECREF(errorHandler);
3254 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003256
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 onError:
3258 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259 Py_XDECREF(errorHandler);
3260 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 return NULL;
3262}
3263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003265 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 const char *errors)
3267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269}
3270
3271PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3272{
3273 if (!PyUnicode_Check(unicode)) {
3274 PyErr_BadArgument();
3275 return NULL;
3276 }
3277 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3278 PyUnicode_GET_SIZE(unicode),
3279 NULL);
3280}
3281
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003282#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003283
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003284/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003285
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003286#if SIZEOF_INT < SIZEOF_SSIZE_T
3287#define NEED_RETRY
3288#endif
3289
3290/* XXX This code is limited to "true" double-byte encodings, as
3291 a) it assumes an incomplete character consists of a single byte, and
3292 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3293 encodings, see IsDBCSLeadByteEx documentation. */
3294
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.

   The byte must pass IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the byte CharPrev() lands on
   is not itself a lead byte, or when CharPrev() stepped back exactly two
   bytes. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3305
3306/*
3307 * Decode MBCS string into unicode object. If 'final' is set, converts
3308 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3309 */
3310static int decode_mbcs(PyUnicodeObject **v,
3311 const char *s, /* MBCS string */
3312 int size, /* sizeof MBCS string */
3313 int final)
3314{
3315 Py_UNICODE *p;
3316 Py_ssize_t n = 0;
3317 int usize = 0;
3318
3319 assert(size >= 0);
3320
3321 /* Skip trailing lead-byte unless 'final' is set */
3322 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3323 --size;
3324
3325 /* First get the size of the result */
3326 if (size > 0) {
3327 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3328 if (usize == 0) {
3329 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3330 return -1;
3331 }
3332 }
3333
3334 if (*v == NULL) {
3335 /* Create unicode object */
3336 *v = _PyUnicode_New(usize);
3337 if (*v == NULL)
3338 return -1;
3339 }
3340 else {
3341 /* Extend unicode object */
3342 n = PyUnicode_GET_SIZE(*v);
3343 if (_PyUnicode_Resize(v, n + usize) < 0)
3344 return -1;
3345 }
3346
3347 /* Do the conversion */
3348 if (size > 0) {
3349 p = PyUnicode_AS_UNICODE(*v) + n;
3350 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3351 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3352 return -1;
3353 }
3354 }
3355
3356 return size;
3357}
3358
3359PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3360 Py_ssize_t size,
3361 const char *errors,
3362 Py_ssize_t *consumed)
3363{
3364 PyUnicodeObject *v = NULL;
3365 int done;
3366
3367 if (consumed)
3368 *consumed = 0;
3369
3370#ifdef NEED_RETRY
3371 retry:
3372 if (size > INT_MAX)
3373 done = decode_mbcs(&v, s, INT_MAX, 0);
3374 else
3375#endif
3376 done = decode_mbcs(&v, s, (int)size, !consumed);
3377
3378 if (done < 0) {
3379 Py_XDECREF(v);
3380 return NULL;
3381 }
3382
3383 if (consumed)
3384 *consumed += done;
3385
3386#ifdef NEED_RETRY
3387 if (size > INT_MAX) {
3388 s += done;
3389 size -= done;
3390 goto retry;
3391 }
3392#endif
3393
3394 return (PyObject *)v;
3395}
3396
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003397PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003398 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003399 const char *errors)
3400{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003401 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3402}
3403
3404/*
3405 * Convert unicode into string object (MBCS).
3406 * Returns 0 if succeed, -1 otherwise.
3407 */
3408static int encode_mbcs(PyObject **repr,
3409 const Py_UNICODE *p, /* unicode */
3410 int size) /* size of unicode */
3411{
3412 int mbcssize = 0;
3413 Py_ssize_t n = 0;
3414
3415 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003416
3417 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003418 if (size > 0) {
3419 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3420 if (mbcssize == 0) {
3421 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3422 return -1;
3423 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003424 }
3425
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003426 if (*repr == NULL) {
3427 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003428 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003429 if (*repr == NULL)
3430 return -1;
3431 }
3432 else {
3433 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003434 n = PyBytes_Size(*repr);
3435 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003436 return -1;
3437 }
3438
3439 /* Do the conversion */
3440 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003441 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003442 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3443 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3444 return -1;
3445 }
3446 }
3447
3448 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003449}
3450
3451PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003452 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003453 const char *errors)
3454{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003455 PyObject *repr = NULL;
3456 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003457
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003458#ifdef NEED_RETRY
3459 retry:
3460 if (size > INT_MAX)
3461 ret = encode_mbcs(&repr, p, INT_MAX);
3462 else
3463#endif
3464 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003465
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003466 if (ret < 0) {
3467 Py_XDECREF(repr);
3468 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003469 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003470
3471#ifdef NEED_RETRY
3472 if (size > INT_MAX) {
3473 p += INT_MAX;
3474 size -= INT_MAX;
3475 goto retry;
3476 }
3477#endif
3478
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003479 return repr;
3480}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003481
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003482PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3483{
3484 if (!PyUnicode_Check(unicode)) {
3485 PyErr_BadArgument();
3486 return NULL;
3487 }
3488 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3489 PyUnicode_GET_SIZE(unicode),
3490 NULL);
3491}
3492
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003493#undef NEED_RETRY
3494
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003495#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003496
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497/* --- Character Mapping Codec -------------------------------------------- */
3498
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003500 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 PyObject *mapping,
3502 const char *errors)
3503{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003505 Py_ssize_t startinpos;
3506 Py_ssize_t endinpos;
3507 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 PyUnicodeObject *v;
3510 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 PyObject *errorHandler = NULL;
3513 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003514 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003515 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003516
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 /* Default to Latin-1 */
3518 if (mapping == NULL)
3519 return PyUnicode_DecodeLatin1(s, size, errors);
3520
3521 v = _PyUnicode_New(size);
3522 if (v == NULL)
3523 goto onError;
3524 if (size == 0)
3525 return (PyObject *)v;
3526 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003528 if (PyUnicode_CheckExact(mapping)) {
3529 mapstring = PyUnicode_AS_UNICODE(mapping);
3530 maplen = PyUnicode_GET_SIZE(mapping);
3531 while (s < e) {
3532 unsigned char ch = *s;
3533 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003535 if (ch < maplen)
3536 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003538 if (x == 0xfffe) {
3539 /* undefined mapping */
3540 outpos = p-PyUnicode_AS_UNICODE(v);
3541 startinpos = s-starts;
3542 endinpos = startinpos+1;
3543 if (unicode_decode_call_errorhandler(
3544 errors, &errorHandler,
3545 "charmap", "character maps to <undefined>",
3546 starts, size, &startinpos, &endinpos, &exc, &s,
3547 (PyObject **)&v, &outpos, &p)) {
3548 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003549 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003550 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003551 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003552 *p++ = x;
3553 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003555 }
3556 else {
3557 while (s < e) {
3558 unsigned char ch = *s;
3559 PyObject *w, *x;
3560
3561 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3562 w = PyInt_FromLong((long)ch);
3563 if (w == NULL)
3564 goto onError;
3565 x = PyObject_GetItem(mapping, w);
3566 Py_DECREF(w);
3567 if (x == NULL) {
3568 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3569 /* No mapping found means: mapping is undefined. */
3570 PyErr_Clear();
3571 x = Py_None;
3572 Py_INCREF(x);
3573 } else
3574 goto onError;
3575 }
3576
3577 /* Apply mapping */
3578 if (PyInt_Check(x)) {
3579 long value = PyInt_AS_LONG(x);
3580 if (value < 0 || value > 65535) {
3581 PyErr_SetString(PyExc_TypeError,
3582 "character mapping must be in range(65536)");
3583 Py_DECREF(x);
3584 goto onError;
3585 }
3586 *p++ = (Py_UNICODE)value;
3587 }
3588 else if (x == Py_None) {
3589 /* undefined mapping */
3590 outpos = p-PyUnicode_AS_UNICODE(v);
3591 startinpos = s-starts;
3592 endinpos = startinpos+1;
3593 if (unicode_decode_call_errorhandler(
3594 errors, &errorHandler,
3595 "charmap", "character maps to <undefined>",
3596 starts, size, &startinpos, &endinpos, &exc, &s,
3597 (PyObject **)&v, &outpos, &p)) {
3598 Py_DECREF(x);
3599 goto onError;
3600 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003601 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003602 continue;
3603 }
3604 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003605 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003606
3607 if (targetsize == 1)
3608 /* 1-1 mapping */
3609 *p++ = *PyUnicode_AS_UNICODE(x);
3610
3611 else if (targetsize > 1) {
3612 /* 1-n mapping */
3613 if (targetsize > extrachars) {
3614 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003615 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3616 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003617 (targetsize << 2);
3618 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003619 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003620 if (_PyUnicode_Resize(&v,
3621 PyUnicode_GET_SIZE(v) + needed) < 0) {
3622 Py_DECREF(x);
3623 goto onError;
3624 }
3625 p = PyUnicode_AS_UNICODE(v) + oldpos;
3626 }
3627 Py_UNICODE_COPY(p,
3628 PyUnicode_AS_UNICODE(x),
3629 targetsize);
3630 p += targetsize;
3631 extrachars -= targetsize;
3632 }
3633 /* 1-0 mapping: skip the character */
3634 }
3635 else {
3636 /* wrong return value */
3637 PyErr_SetString(PyExc_TypeError,
3638 "character mapping must return integer, None or unicode");
3639 Py_DECREF(x);
3640 goto onError;
3641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003643 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 }
3646 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003647 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 Py_XDECREF(errorHandler);
3650 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003652
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 Py_XDECREF(errorHandler);
3655 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 Py_XDECREF(v);
3657 return NULL;
3658}
3659
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003660/* Charmap encoding: the lookup table */
3661
3662struct encoding_map{
3663 PyObject_HEAD
3664 unsigned char level1[32];
3665 int count2, count3;
3666 unsigned char level23[1];
3667};
3668
3669static PyObject*
3670encoding_map_size(PyObject *obj, PyObject* args)
3671{
3672 struct encoding_map *map = (struct encoding_map*)obj;
3673 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3674 128*map->count3);
3675}
3676
3677static PyMethodDef encoding_map_methods[] = {
3678 {"size", encoding_map_size, METH_NOARGS,
3679 PyDoc_STR("Return the size (in bytes) of this object") },
3680 { 0 }
3681};
3682
3683static void
3684encoding_map_dealloc(PyObject* o)
3685{
3686 PyObject_FREE(o);
3687}
3688
3689static PyTypeObject EncodingMapType = {
3690 PyObject_HEAD_INIT(NULL)
3691 0, /*ob_size*/
3692 "EncodingMap", /*tp_name*/
3693 sizeof(struct encoding_map), /*tp_basicsize*/
3694 0, /*tp_itemsize*/
3695 /* methods */
3696 encoding_map_dealloc, /*tp_dealloc*/
3697 0, /*tp_print*/
3698 0, /*tp_getattr*/
3699 0, /*tp_setattr*/
3700 0, /*tp_compare*/
3701 0, /*tp_repr*/
3702 0, /*tp_as_number*/
3703 0, /*tp_as_sequence*/
3704 0, /*tp_as_mapping*/
3705 0, /*tp_hash*/
3706 0, /*tp_call*/
3707 0, /*tp_str*/
3708 0, /*tp_getattro*/
3709 0, /*tp_setattro*/
3710 0, /*tp_as_buffer*/
3711 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3712 0, /*tp_doc*/
3713 0, /*tp_traverse*/
3714 0, /*tp_clear*/
3715 0, /*tp_richcompare*/
3716 0, /*tp_weaklistoffset*/
3717 0, /*tp_iter*/
3718 0, /*tp_iternext*/
3719 encoding_map_methods, /*tp_methods*/
3720 0, /*tp_members*/
3721 0, /*tp_getset*/
3722 0, /*tp_base*/
3723 0, /*tp_dict*/
3724 0, /*tp_descr_get*/
3725 0, /*tp_descr_set*/
3726 0, /*tp_dictoffset*/
3727 0, /*tp_init*/
3728 0, /*tp_alloc*/
3729 0, /*tp_new*/
3730 0, /*tp_free*/
3731 0, /*tp_is_gc*/
3732};
3733
3734PyObject*
3735PyUnicode_BuildEncodingMap(PyObject* string)
3736{
3737 Py_UNICODE *decode;
3738 PyObject *result;
3739 struct encoding_map *mresult;
3740 int i;
3741 int need_dict = 0;
3742 unsigned char level1[32];
3743 unsigned char level2[512];
3744 unsigned char *mlevel1, *mlevel2, *mlevel3;
3745 int count2 = 0, count3 = 0;
3746
3747 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3748 PyErr_BadArgument();
3749 return NULL;
3750 }
3751 decode = PyUnicode_AS_UNICODE(string);
3752 memset(level1, 0xFF, sizeof level1);
3753 memset(level2, 0xFF, sizeof level2);
3754
3755 /* If there isn't a one-to-one mapping of NULL to \0,
3756 or if there are non-BMP characters, we need to use
3757 a mapping dictionary. */
3758 if (decode[0] != 0)
3759 need_dict = 1;
3760 for (i = 1; i < 256; i++) {
3761 int l1, l2;
3762 if (decode[i] == 0
3763 #ifdef Py_UNICODE_WIDE
3764 || decode[i] > 0xFFFF
3765 #endif
3766 ) {
3767 need_dict = 1;
3768 break;
3769 }
3770 if (decode[i] == 0xFFFE)
3771 /* unmapped character */
3772 continue;
3773 l1 = decode[i] >> 11;
3774 l2 = decode[i] >> 7;
3775 if (level1[l1] == 0xFF)
3776 level1[l1] = count2++;
3777 if (level2[l2] == 0xFF)
3778 level2[l2] = count3++;
3779 }
3780
3781 if (count2 >= 0xFF || count3 >= 0xFF)
3782 need_dict = 1;
3783
3784 if (need_dict) {
3785 PyObject *result = PyDict_New();
3786 PyObject *key, *value;
3787 if (!result)
3788 return NULL;
3789 for (i = 0; i < 256; i++) {
3790 key = value = NULL;
3791 key = PyInt_FromLong(decode[i]);
3792 value = PyInt_FromLong(i);
3793 if (!key || !value)
3794 goto failed1;
3795 if (PyDict_SetItem(result, key, value) == -1)
3796 goto failed1;
3797 Py_DECREF(key);
3798 Py_DECREF(value);
3799 }
3800 return result;
3801 failed1:
3802 Py_XDECREF(key);
3803 Py_XDECREF(value);
3804 Py_DECREF(result);
3805 return NULL;
3806 }
3807
3808 /* Create a three-level trie */
3809 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3810 16*count2 + 128*count3 - 1);
3811 if (!result)
3812 return PyErr_NoMemory();
3813 PyObject_Init(result, &EncodingMapType);
3814 mresult = (struct encoding_map*)result;
3815 mresult->count2 = count2;
3816 mresult->count3 = count3;
3817 mlevel1 = mresult->level1;
3818 mlevel2 = mresult->level23;
3819 mlevel3 = mresult->level23 + 16*count2;
3820 memcpy(mlevel1, level1, 32);
3821 memset(mlevel2, 0xFF, 16*count2);
3822 memset(mlevel3, 0, 128*count3);
3823 count3 = 0;
3824 for (i = 1; i < 256; i++) {
3825 int o1, o2, o3, i2, i3;
3826 if (decode[i] == 0xFFFE)
3827 /* unmapped character */
3828 continue;
3829 o1 = decode[i]>>11;
3830 o2 = (decode[i]>>7) & 0xF;
3831 i2 = 16*mlevel1[o1] + o2;
3832 if (mlevel2[i2] == 0xFF)
3833 mlevel2[i2] = count3++;
3834 o3 = decode[i] & 0x7F;
3835 i3 = 128*mlevel2[i2] + o3;
3836 mlevel3[i3] = i;
3837 }
3838 return result;
3839}
3840
3841static int
3842encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3843{
3844 struct encoding_map *map = (struct encoding_map*)mapping;
3845 int l1 = c>>11;
3846 int l2 = (c>>7) & 0xF;
3847 int l3 = c & 0x7F;
3848 int i;
3849
3850#ifdef Py_UNICODE_WIDE
3851 if (c > 0xFFFF) {
3852 return -1;
3853 }
3854#endif
3855 if (c == 0)
3856 return 0;
3857 /* level 1*/
3858 i = map->level1[l1];
3859 if (i == 0xFF) {
3860 return -1;
3861 }
3862 /* level 2*/
3863 i = map->level23[16*i+l2];
3864 if (i == 0xFF) {
3865 return -1;
3866 }
3867 /* level 3 */
3868 i = map->level23[16*map->count2 + 128*i + l3];
3869 if (i == 0) {
3870 return -1;
3871 }
3872 return i;
3873}
3874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875/* Lookup the character ch in the mapping. If the character
3876 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003877 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003878static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 PyObject *w = PyInt_FromLong((long)c);
3881 PyObject *x;
3882
3883 if (w == NULL)
3884 return NULL;
3885 x = PyObject_GetItem(mapping, w);
3886 Py_DECREF(w);
3887 if (x == NULL) {
3888 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3889 /* No mapping found means: mapping is undefined. */
3890 PyErr_Clear();
3891 x = Py_None;
3892 Py_INCREF(x);
3893 return x;
3894 } else
3895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003897 else if (x == Py_None)
3898 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 else if (PyInt_Check(x)) {
3900 long value = PyInt_AS_LONG(x);
3901 if (value < 0 || value > 255) {
3902 PyErr_SetString(PyExc_TypeError,
3903 "character mapping must be in range(256)");
3904 Py_DECREF(x);
3905 return NULL;
3906 }
3907 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909 else if (PyString_Check(x))
3910 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003913 PyErr_Format(PyExc_TypeError,
3914 "character mapping must return integer, None or str8, not %.400s",
3915 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 Py_DECREF(x);
3917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 }
3919}
3920
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003921static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003922charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003923{
Walter Dörwald827b0552007-05-12 13:23:53 +00003924 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003925 /* exponentially overallocate to minimize reallocations */
3926 if (requiredsize < 2*outsize)
3927 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003928 if (PyBytes_Resize(outobj, requiredsize)) {
3929 Py_DECREF(outobj);
3930 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003931 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003932 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003933}
3934
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a Python exception occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003939 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940 space is available. Return a new reference to the object that
3941 was put in the output buffer, or Py_None, if the mapping was undefined
3942 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003943 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003945charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003946 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003948 PyObject *rep;
3949 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003950 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003952 if (mapping->ob_type == &EncodingMapType) {
3953 int res = encoding_map_lookup(c, mapping);
3954 Py_ssize_t requiredsize = *outpos+1;
3955 if (res == -1)
3956 return enc_FAILED;
3957 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003958 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003959 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003960 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003961 outstart[(*outpos)++] = (char)res;
3962 return enc_SUCCESS;
3963 }
3964
3965 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003967 return enc_EXCEPTION;
3968 else if (rep==Py_None) {
3969 Py_DECREF(rep);
3970 return enc_FAILED;
3971 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003973 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003974 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003975 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003977 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003979 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3981 }
3982 else {
3983 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3985 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003986 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003987 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003989 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003991 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 memcpy(outstart + *outpos, repchars, repsize);
3993 *outpos += repsize;
3994 }
3995 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003996 Py_DECREF(rep);
3997 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998}
3999
4000/* handle an error in PyUnicode_EncodeCharmap
4001 Return 0 on success, -1 on error */
4002static
4003int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004004 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004006 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004007 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008{
4009 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004010 Py_ssize_t repsize;
4011 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 Py_UNICODE *uni2;
4013 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004014 Py_ssize_t collstartpos = *inpos;
4015 Py_ssize_t collendpos = *inpos+1;
4016 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 char *encoding = "charmap";
4018 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004019 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004020
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 /* find all unencodable characters */
4022 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004023 PyObject *rep;
4024 if (mapping->ob_type == &EncodingMapType) {
4025 int res = encoding_map_lookup(p[collendpos], mapping);
4026 if (res != -1)
4027 break;
4028 ++collendpos;
4029 continue;
4030 }
4031
4032 rep = charmapencode_lookup(p[collendpos], mapping);
4033 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004035 else if (rep!=Py_None) {
4036 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 break;
4038 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004039 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040 ++collendpos;
4041 }
4042 /* cache callback name lookup
4043 * (if not done yet, i.e. it's the first error) */
4044 if (*known_errorHandler==-1) {
4045 if ((errors==NULL) || (!strcmp(errors, "strict")))
4046 *known_errorHandler = 1;
4047 else if (!strcmp(errors, "replace"))
4048 *known_errorHandler = 2;
4049 else if (!strcmp(errors, "ignore"))
4050 *known_errorHandler = 3;
4051 else if (!strcmp(errors, "xmlcharrefreplace"))
4052 *known_errorHandler = 4;
4053 else
4054 *known_errorHandler = 0;
4055 }
4056 switch (*known_errorHandler) {
4057 case 1: /* strict */
4058 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4059 return -1;
4060 case 2: /* replace */
4061 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4062 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004063 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 return -1;
4065 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004066 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4068 return -1;
4069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 }
4071 /* fall through */
4072 case 3: /* ignore */
4073 *inpos = collendpos;
4074 break;
4075 case 4: /* xmlcharrefreplace */
4076 /* generate replacement (temporarily (mis)uses p) */
4077 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4078 char buffer[2+29+1+1];
4079 char *cp;
4080 sprintf(buffer, "&#%d;", (int)p[collpos]);
4081 for (cp = buffer; *cp; ++cp) {
4082 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004083 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004085 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4087 return -1;
4088 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 }
4090 }
4091 *inpos = collendpos;
4092 break;
4093 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004094 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095 encoding, reason, p, size, exceptionObject,
4096 collstartpos, collendpos, &newpos);
4097 if (repunicode == NULL)
4098 return -1;
4099 /* generate replacement */
4100 repsize = PyUnicode_GET_SIZE(repunicode);
4101 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4102 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004103 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 return -1;
4105 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004106 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4109 return -1;
4110 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004111 }
4112 *inpos = newpos;
4113 Py_DECREF(repunicode);
4114 }
4115 return 0;
4116}
4117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004119 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 PyObject *mapping,
4121 const char *errors)
4122{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 /* output object */
4124 PyObject *res = NULL;
4125 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004126 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004128 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 PyObject *errorHandler = NULL;
4130 PyObject *exc = NULL;
4131 /* the following variable is used for caching string comparisons
4132 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4133 * 3=ignore, 4=xmlcharrefreplace */
4134 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135
4136 /* Default to Latin-1 */
4137 if (mapping == NULL)
4138 return PyUnicode_EncodeLatin1(p, size, errors);
4139
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004140 /* allocate enough for a simple encoding without
4141 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004142 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 if (res == NULL)
4144 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004145 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 while (inpos<size) {
4149 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004150 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004151 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004153 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 if (charmap_encoding_error(p, size, &inpos, mapping,
4155 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004156 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004157 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004158 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 else
4162 /* done with this character => adjust input position */
4163 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004167 if (respos<PyBytes_GET_SIZE(res)) {
4168 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 goto onError;
4170 }
4171 Py_XDECREF(exc);
4172 Py_XDECREF(errorHandler);
4173 return res;
4174
4175 onError:
4176 Py_XDECREF(res);
4177 Py_XDECREF(exc);
4178 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 return NULL;
4180}
4181
4182PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4183 PyObject *mapping)
4184{
4185 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4186 PyErr_BadArgument();
4187 return NULL;
4188 }
4189 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4190 PyUnicode_GET_SIZE(unicode),
4191 mapping,
4192 NULL);
4193}
4194
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195/* create or adjust a UnicodeTranslateError */
4196static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004197 const Py_UNICODE *unicode, Py_ssize_t size,
4198 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 if (*exceptionObject == NULL) {
4202 *exceptionObject = PyUnicodeTranslateError_Create(
4203 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 }
4205 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4207 goto onError;
4208 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4209 goto onError;
4210 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4211 goto onError;
4212 return;
4213 onError:
4214 Py_DECREF(*exceptionObject);
4215 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 }
4217}
4218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219/* raises a UnicodeTranslateError */
4220static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004221 const Py_UNICODE *unicode, Py_ssize_t size,
4222 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 const char *reason)
4224{
4225 make_translate_exception(exceptionObject,
4226 unicode, size, startpos, endpos, reason);
4227 if (*exceptionObject != NULL)
4228 PyCodec_StrictErrors(*exceptionObject);
4229}
4230
4231/* error handling callback helper:
4232 build arguments, call the callback and check the arguments,
4233 put the result into newpos and return the replacement string, which
4234 has to be freed by the caller */
4235static PyObject *unicode_translate_call_errorhandler(const char *errors,
4236 PyObject **errorHandler,
4237 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004238 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4239 Py_ssize_t startpos, Py_ssize_t endpos,
4240 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004242 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004244 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 PyObject *restuple;
4246 PyObject *resunicode;
4247
4248 if (*errorHandler == NULL) {
4249 *errorHandler = PyCodec_LookupError(errors);
4250 if (*errorHandler == NULL)
4251 return NULL;
4252 }
4253
4254 make_translate_exception(exceptionObject,
4255 unicode, size, startpos, endpos, reason);
4256 if (*exceptionObject == NULL)
4257 return NULL;
4258
4259 restuple = PyObject_CallFunctionObjArgs(
4260 *errorHandler, *exceptionObject, NULL);
4261 if (restuple == NULL)
4262 return NULL;
4263 if (!PyTuple_Check(restuple)) {
4264 PyErr_Format(PyExc_TypeError, &argparse[4]);
4265 Py_DECREF(restuple);
4266 return NULL;
4267 }
4268 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004269 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 Py_DECREF(restuple);
4271 return NULL;
4272 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004273 if (i_newpos<0)
4274 *newpos = size+i_newpos;
4275 else
4276 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004277 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004278 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004279 Py_DECREF(restuple);
4280 return NULL;
4281 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 Py_INCREF(resunicode);
4283 Py_DECREF(restuple);
4284 return resunicode;
4285}
4286
4287/* Lookup the character ch in the mapping and put the result in result,
4288 which must be decrefed by the caller.
4289 Return 0 on success, -1 on error */
4290static
4291int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4292{
4293 PyObject *w = PyInt_FromLong((long)c);
4294 PyObject *x;
4295
4296 if (w == NULL)
4297 return -1;
4298 x = PyObject_GetItem(mapping, w);
4299 Py_DECREF(w);
4300 if (x == NULL) {
4301 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4302 /* No mapping found means: use 1:1 mapping. */
4303 PyErr_Clear();
4304 *result = NULL;
4305 return 0;
4306 } else
4307 return -1;
4308 }
4309 else if (x == Py_None) {
4310 *result = x;
4311 return 0;
4312 }
4313 else if (PyInt_Check(x)) {
4314 long value = PyInt_AS_LONG(x);
4315 long max = PyUnicode_GetMax();
4316 if (value < 0 || value > max) {
4317 PyErr_Format(PyExc_TypeError,
4318 "character mapping must be in range(0x%lx)", max+1);
4319 Py_DECREF(x);
4320 return -1;
4321 }
4322 *result = x;
4323 return 0;
4324 }
4325 else if (PyUnicode_Check(x)) {
4326 *result = x;
4327 return 0;
4328 }
4329 else {
4330 /* wrong return value */
4331 PyErr_SetString(PyExc_TypeError,
4332 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004333 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 return -1;
4335 }
4336}
4337/* ensure that *outobj is at least requiredsize characters long,
4338if not reallocate and adjust various state variables.
4339Return 0 on success, -1 on error */
4340static
Walter Dörwald4894c302003-10-24 14:25:28 +00004341int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004342 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004344 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004345 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004347 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004349 if (requiredsize < 2 * oldsize)
4350 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004351 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352 return -1;
4353 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 }
4355 return 0;
4356}
4357/* lookup the character, put the result in the output string and adjust
4358 various state variables. Return a new reference to the object that
4359 was put in the output buffer in *result, or Py_None, if the mapping was
4360 undefined (in which case no character was written).
4361 The called must decref result.
4362 Return 0 on success, -1 on error. */
4363static
Walter Dörwald4894c302003-10-24 14:25:28 +00004364int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004365 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004366 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367{
Walter Dörwald4894c302003-10-24 14:25:28 +00004368 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 return -1;
4370 if (*res==NULL) {
4371 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004372 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373 }
4374 else if (*res==Py_None)
4375 ;
4376 else if (PyInt_Check(*res)) {
4377 /* no overflow check, because we know that the space is enough */
4378 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4379 }
4380 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 if (repsize==1) {
4383 /* no overflow check, because we know that the space is enough */
4384 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4385 }
4386 else if (repsize!=0) {
4387 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004388 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004389 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004390 repsize - 1;
4391 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 return -1;
4393 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4394 *outp += repsize;
4395 }
4396 }
4397 else
4398 return -1;
4399 return 0;
4400}
4401
4402PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004403 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 PyObject *mapping,
4405 const char *errors)
4406{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 /* output object */
4408 PyObject *res = NULL;
4409 /* pointers to the beginning and end+1 of input */
4410 const Py_UNICODE *startp = p;
4411 const Py_UNICODE *endp = p + size;
4412 /* pointer into the output */
4413 Py_UNICODE *str;
4414 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 char *reason = "character maps to <undefined>";
4417 PyObject *errorHandler = NULL;
4418 PyObject *exc = NULL;
4419 /* the following variable is used for caching string comparisons
4420 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4421 * 3=ignore, 4=xmlcharrefreplace */
4422 int known_errorHandler = -1;
4423
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 if (mapping == NULL) {
4425 PyErr_BadArgument();
4426 return NULL;
4427 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428
4429 /* allocate enough for a simple 1:1 translation without
4430 replacements, if we need more, we'll resize */
4431 res = PyUnicode_FromUnicode(NULL, size);
4432 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 return res;
4436 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 while (p<endp) {
4439 /* try to encode it */
4440 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004441 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 goto onError;
4444 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004445 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 if (x!=Py_None) /* it worked => adjust input pointer */
4447 ++p;
4448 else { /* untranslatable character */
4449 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004450 Py_ssize_t repsize;
4451 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 Py_UNICODE *uni2;
4453 /* startpos for collecting untranslatable chars */
4454 const Py_UNICODE *collstart = p;
4455 const Py_UNICODE *collend = p+1;
4456 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 /* find all untranslatable characters */
4459 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004460 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 goto onError;
4462 Py_XDECREF(x);
4463 if (x!=Py_None)
4464 break;
4465 ++collend;
4466 }
4467 /* cache callback name lookup
4468 * (if not done yet, i.e. it's the first error) */
4469 if (known_errorHandler==-1) {
4470 if ((errors==NULL) || (!strcmp(errors, "strict")))
4471 known_errorHandler = 1;
4472 else if (!strcmp(errors, "replace"))
4473 known_errorHandler = 2;
4474 else if (!strcmp(errors, "ignore"))
4475 known_errorHandler = 3;
4476 else if (!strcmp(errors, "xmlcharrefreplace"))
4477 known_errorHandler = 4;
4478 else
4479 known_errorHandler = 0;
4480 }
4481 switch (known_errorHandler) {
4482 case 1: /* strict */
4483 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4484 goto onError;
4485 case 2: /* replace */
4486 /* No need to check for space, this is a 1:1 replacement */
4487 for (coll = collstart; coll<collend; ++coll)
4488 *str++ = '?';
4489 /* fall through */
4490 case 3: /* ignore */
4491 p = collend;
4492 break;
4493 case 4: /* xmlcharrefreplace */
4494 /* generate replacement (temporarily (mis)uses p) */
4495 for (p = collstart; p < collend; ++p) {
4496 char buffer[2+29+1+1];
4497 char *cp;
4498 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004499 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4501 goto onError;
4502 for (cp = buffer; *cp; ++cp)
4503 *str++ = *cp;
4504 }
4505 p = collend;
4506 break;
4507 default:
4508 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4509 reason, startp, size, &exc,
4510 collstart-startp, collend-startp, &newpos);
4511 if (repunicode == NULL)
4512 goto onError;
4513 /* generate replacement */
4514 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004515 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4517 Py_DECREF(repunicode);
4518 goto onError;
4519 }
4520 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4521 *str++ = *uni2;
4522 p = startp + newpos;
4523 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
4525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 /* Resize if we allocated to much */
4528 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004529 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004530 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004531 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 }
4533 Py_XDECREF(exc);
4534 Py_XDECREF(errorHandler);
4535 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 onError:
4538 Py_XDECREF(res);
4539 Py_XDECREF(exc);
4540 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 return NULL;
4542}
4543
4544PyObject *PyUnicode_Translate(PyObject *str,
4545 PyObject *mapping,
4546 const char *errors)
4547{
4548 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004549
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550 str = PyUnicode_FromObject(str);
4551 if (str == NULL)
4552 goto onError;
4553 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4554 PyUnicode_GET_SIZE(str),
4555 mapping,
4556 errors);
4557 Py_DECREF(str);
4558 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004559
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 onError:
4561 Py_XDECREF(str);
4562 return NULL;
4563}
Tim Petersced69f82003-09-16 20:30:58 +00004564
Guido van Rossum9e896b32000-04-05 20:11:21 +00004565/* --- Decimal Encoder ---------------------------------------------------- */
4566
4567int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004568 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004569 char *output,
4570 const char *errors)
4571{
4572 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 PyObject *errorHandler = NULL;
4574 PyObject *exc = NULL;
4575 const char *encoding = "decimal";
4576 const char *reason = "invalid decimal Unicode string";
4577 /* the following variable is used for caching string comparisons
4578 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4579 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004580
4581 if (output == NULL) {
4582 PyErr_BadArgument();
4583 return -1;
4584 }
4585
4586 p = s;
4587 end = s + length;
4588 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004590 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004592 Py_ssize_t repsize;
4593 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 Py_UNICODE *uni2;
4595 Py_UNICODE *collstart;
4596 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004597
Guido van Rossum9e896b32000-04-05 20:11:21 +00004598 if (Py_UNICODE_ISSPACE(ch)) {
4599 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004601 continue;
4602 }
4603 decimal = Py_UNICODE_TODECIMAL(ch);
4604 if (decimal >= 0) {
4605 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004607 continue;
4608 }
Guido van Rossumba477042000-04-06 18:18:10 +00004609 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004610 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004612 continue;
4613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 /* All other characters are considered unencodable */
4615 collstart = p;
4616 collend = p+1;
4617 while (collend < end) {
4618 if ((0 < *collend && *collend < 256) ||
4619 !Py_UNICODE_ISSPACE(*collend) ||
4620 Py_UNICODE_TODECIMAL(*collend))
4621 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 /* cache callback name lookup
4624 * (if not done yet, i.e. it's the first error) */
4625 if (known_errorHandler==-1) {
4626 if ((errors==NULL) || (!strcmp(errors, "strict")))
4627 known_errorHandler = 1;
4628 else if (!strcmp(errors, "replace"))
4629 known_errorHandler = 2;
4630 else if (!strcmp(errors, "ignore"))
4631 known_errorHandler = 3;
4632 else if (!strcmp(errors, "xmlcharrefreplace"))
4633 known_errorHandler = 4;
4634 else
4635 known_errorHandler = 0;
4636 }
4637 switch (known_errorHandler) {
4638 case 1: /* strict */
4639 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4640 goto onError;
4641 case 2: /* replace */
4642 for (p = collstart; p < collend; ++p)
4643 *output++ = '?';
4644 /* fall through */
4645 case 3: /* ignore */
4646 p = collend;
4647 break;
4648 case 4: /* xmlcharrefreplace */
4649 /* generate replacement (temporarily (mis)uses p) */
4650 for (p = collstart; p < collend; ++p)
4651 output += sprintf(output, "&#%d;", (int)*p);
4652 p = collend;
4653 break;
4654 default:
4655 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4656 encoding, reason, s, length, &exc,
4657 collstart-s, collend-s, &newpos);
4658 if (repunicode == NULL)
4659 goto onError;
4660 /* generate replacement */
4661 repsize = PyUnicode_GET_SIZE(repunicode);
4662 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4663 Py_UNICODE ch = *uni2;
4664 if (Py_UNICODE_ISSPACE(ch))
4665 *output++ = ' ';
4666 else {
4667 decimal = Py_UNICODE_TODECIMAL(ch);
4668 if (decimal >= 0)
4669 *output++ = '0' + decimal;
4670 else if (0 < ch && ch < 256)
4671 *output++ = (char)ch;
4672 else {
4673 Py_DECREF(repunicode);
4674 raise_encode_exception(&exc, encoding,
4675 s, length, collstart-s, collend-s, reason);
4676 goto onError;
4677 }
4678 }
4679 }
4680 p = s + newpos;
4681 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004682 }
4683 }
4684 /* 0-terminate the output string */
4685 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 Py_XDECREF(exc);
4687 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004688 return 0;
4689
4690 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 Py_XDECREF(exc);
4692 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004693 return -1;
4694}
4695
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696/* --- Helpers ------------------------------------------------------------ */
4697
Thomas Wouters477c8d52006-05-27 19:21:47 +00004698#define STRINGLIB_CHAR Py_UNICODE
4699
4700#define STRINGLIB_LEN PyUnicode_GET_SIZE
4701#define STRINGLIB_NEW PyUnicode_FromUnicode
4702#define STRINGLIB_STR PyUnicode_AS_UNICODE
4703
4704Py_LOCAL_INLINE(int)
4705STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004707 if (str[0] != other[0])
4708 return 1;
4709 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710}
4711
Thomas Wouters477c8d52006-05-27 19:21:47 +00004712#define STRINGLIB_EMPTY unicode_empty
4713
4714#include "stringlib/fastsearch.h"
4715
4716#include "stringlib/count.h"
4717#include "stringlib/find.h"
4718#include "stringlib/partition.h"
4719
/* Helper macro to fix up start/end slice values.
 *
 * Operates on the variables `start` and `end` that must be in scope at the
 * point of use, using (obj)->length as the sequence length:
 *   - a negative start is offset by the length, then clamped to 0;
 *   - an end greater than the length is clamped to the length;
 *   - a negative end is offset by the length, then clamped to 0.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and stays correct inside an unbraced if/else.
 */
#define FIX_START_END(obj)              \
    do {                                \
        if (start < 0)                  \
            start += (obj)->length;     \
        if (start < 0)                  \
            start = 0;                  \
        if (end > (obj)->length)        \
            end = (obj)->length;        \
        if (end < 0)                    \
            end += (obj)->length;       \
        if (end < 0)                    \
            end = 0;                    \
    } while (0)
4732
Martin v. Löwis18e16552006-02-15 17:27:45 +00004733Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004734 PyObject *substr,
4735 Py_ssize_t start,
4736 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004738 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004739 PyUnicodeObject* str_obj;
4740 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004741
Thomas Wouters477c8d52006-05-27 19:21:47 +00004742 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4743 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004745 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4746 if (!sub_obj) {
4747 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 return -1;
4749 }
Tim Petersced69f82003-09-16 20:30:58 +00004750
Thomas Wouters477c8d52006-05-27 19:21:47 +00004751 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004752
Thomas Wouters477c8d52006-05-27 19:21:47 +00004753 result = stringlib_count(
4754 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4755 );
4756
4757 Py_DECREF(sub_obj);
4758 Py_DECREF(str_obj);
4759
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 return result;
4761}
4762
Martin v. Löwis18e16552006-02-15 17:27:45 +00004763Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004764 PyObject *sub,
4765 Py_ssize_t start,
4766 Py_ssize_t end,
4767 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004770
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004772 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004773 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004774 sub = PyUnicode_FromObject(sub);
4775 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004776 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004777 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 }
Tim Petersced69f82003-09-16 20:30:58 +00004779
Thomas Wouters477c8d52006-05-27 19:21:47 +00004780 if (direction > 0)
4781 result = stringlib_find_slice(
4782 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4783 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4784 start, end
4785 );
4786 else
4787 result = stringlib_rfind_slice(
4788 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4789 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4790 start, end
4791 );
4792
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004794 Py_DECREF(sub);
4795
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 return result;
4797}
4798
Tim Petersced69f82003-09-16 20:30:58 +00004799static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800int tailmatch(PyUnicodeObject *self,
4801 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004802 Py_ssize_t start,
4803 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 int direction)
4805{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 if (substring->length == 0)
4807 return 1;
4808
Thomas Wouters477c8d52006-05-27 19:21:47 +00004809 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810
4811 end -= substring->length;
4812 if (end < start)
4813 return 0;
4814
4815 if (direction > 0) {
4816 if (Py_UNICODE_MATCH(self, end, substring))
4817 return 1;
4818 } else {
4819 if (Py_UNICODE_MATCH(self, start, substring))
4820 return 1;
4821 }
4822
4823 return 0;
4824}
4825
Martin v. Löwis18e16552006-02-15 17:27:45 +00004826Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004828 Py_ssize_t start,
4829 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 int direction)
4831{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004832 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004833
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 str = PyUnicode_FromObject(str);
4835 if (str == NULL)
4836 return -1;
4837 substr = PyUnicode_FromObject(substr);
4838 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004839 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 return -1;
4841 }
Tim Petersced69f82003-09-16 20:30:58 +00004842
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 result = tailmatch((PyUnicodeObject *)str,
4844 (PyUnicodeObject *)substr,
4845 start, end, direction);
4846 Py_DECREF(str);
4847 Py_DECREF(substr);
4848 return result;
4849}
4850
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851/* Apply fixfct filter to the Unicode object self and return a
4852 reference to the modified object */
4853
Tim Petersced69f82003-09-16 20:30:58 +00004854static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855PyObject *fixup(PyUnicodeObject *self,
4856 int (*fixfct)(PyUnicodeObject *s))
4857{
4858
4859 PyUnicodeObject *u;
4860
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004861 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 if (u == NULL)
4863 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004864
4865 Py_UNICODE_COPY(u->str, self->str, self->length);
4866
Tim Peters7a29bd52001-09-12 03:03:31 +00004867 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 /* fixfct should return TRUE if it modified the buffer. If
4869 FALSE, return a reference to the original buffer instead
4870 (to save space, not time) */
4871 Py_INCREF(self);
4872 Py_DECREF(u);
4873 return (PyObject*) self;
4874 }
4875 return (PyObject*) u;
4876}
4877
Tim Petersced69f82003-09-16 20:30:58 +00004878static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879int fixupper(PyUnicodeObject *self)
4880{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004881 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 Py_UNICODE *s = self->str;
4883 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004884
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 while (len-- > 0) {
4886 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004887
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 ch = Py_UNICODE_TOUPPER(*s);
4889 if (ch != *s) {
4890 status = 1;
4891 *s = ch;
4892 }
4893 s++;
4894 }
4895
4896 return status;
4897}
4898
Tim Petersced69f82003-09-16 20:30:58 +00004899static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900int fixlower(PyUnicodeObject *self)
4901{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004902 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 Py_UNICODE *s = self->str;
4904 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004905
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906 while (len-- > 0) {
4907 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004908
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909 ch = Py_UNICODE_TOLOWER(*s);
4910 if (ch != *s) {
4911 status = 1;
4912 *s = ch;
4913 }
4914 s++;
4915 }
4916
4917 return status;
4918}
4919
Tim Petersced69f82003-09-16 20:30:58 +00004920static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921int fixswapcase(PyUnicodeObject *self)
4922{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004923 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 Py_UNICODE *s = self->str;
4925 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004926
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 while (len-- > 0) {
4928 if (Py_UNICODE_ISUPPER(*s)) {
4929 *s = Py_UNICODE_TOLOWER(*s);
4930 status = 1;
4931 } else if (Py_UNICODE_ISLOWER(*s)) {
4932 *s = Py_UNICODE_TOUPPER(*s);
4933 status = 1;
4934 }
4935 s++;
4936 }
4937
4938 return status;
4939}
4940
Tim Petersced69f82003-09-16 20:30:58 +00004941static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942int fixcapitalize(PyUnicodeObject *self)
4943{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004944 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004945 Py_UNICODE *s = self->str;
4946 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004947
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004948 if (len == 0)
4949 return 0;
4950 if (Py_UNICODE_ISLOWER(*s)) {
4951 *s = Py_UNICODE_TOUPPER(*s);
4952 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004954 s++;
4955 while (--len > 0) {
4956 if (Py_UNICODE_ISUPPER(*s)) {
4957 *s = Py_UNICODE_TOLOWER(*s);
4958 status = 1;
4959 }
4960 s++;
4961 }
4962 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963}
4964
4965static
4966int fixtitle(PyUnicodeObject *self)
4967{
4968 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4969 register Py_UNICODE *e;
4970 int previous_is_cased;
4971
4972 /* Shortcut for single character strings */
4973 if (PyUnicode_GET_SIZE(self) == 1) {
4974 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4975 if (*p != ch) {
4976 *p = ch;
4977 return 1;
4978 }
4979 else
4980 return 0;
4981 }
Tim Petersced69f82003-09-16 20:30:58 +00004982
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 e = p + PyUnicode_GET_SIZE(self);
4984 previous_is_cased = 0;
4985 for (; p < e; p++) {
4986 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004987
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 if (previous_is_cased)
4989 *p = Py_UNICODE_TOLOWER(ch);
4990 else
4991 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004992
4993 if (Py_UNICODE_ISLOWER(ch) ||
4994 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995 Py_UNICODE_ISTITLE(ch))
4996 previous_is_cased = 1;
4997 else
4998 previous_is_cased = 0;
4999 }
5000 return 1;
5001}
5002
Tim Peters8ce9f162004-08-27 01:49:32 +00005003PyObject *
5004PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005{
Tim Peters8ce9f162004-08-27 01:49:32 +00005006 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005007 const Py_UNICODE blank = ' ';
5008 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005009 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005010 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005011 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5012 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005013 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5014 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005015 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005016 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005017 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018
Tim Peters05eba1f2004-08-27 21:32:02 +00005019 fseq = PySequence_Fast(seq, "");
5020 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005021 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005022 }
5023
Tim Peters91879ab2004-08-27 22:35:44 +00005024 /* Grrrr. A codec may be invoked to convert str objects to
5025 * Unicode, and so it's possible to call back into Python code
5026 * during PyUnicode_FromObject(), and so it's possible for a sick
5027 * codec to change the size of fseq (if seq is a list). Therefore
5028 * we have to keep refetching the size -- can't assume seqlen
5029 * is invariant.
5030 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005031 seqlen = PySequence_Fast_GET_SIZE(fseq);
5032 /* If empty sequence, return u"". */
5033 if (seqlen == 0) {
5034 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5035 goto Done;
5036 }
5037 /* If singleton sequence with an exact Unicode, return that. */
5038 if (seqlen == 1) {
5039 item = PySequence_Fast_GET_ITEM(fseq, 0);
5040 if (PyUnicode_CheckExact(item)) {
5041 Py_INCREF(item);
5042 res = (PyUnicodeObject *)item;
5043 goto Done;
5044 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005045 }
5046
Tim Peters05eba1f2004-08-27 21:32:02 +00005047 /* At least two items to join, or one that isn't exact Unicode. */
5048 if (seqlen > 1) {
5049 /* Set up sep and seplen -- they're needed. */
5050 if (separator == NULL) {
5051 sep = &blank;
5052 seplen = 1;
5053 }
5054 else {
5055 internal_separator = PyUnicode_FromObject(separator);
5056 if (internal_separator == NULL)
5057 goto onError;
5058 sep = PyUnicode_AS_UNICODE(internal_separator);
5059 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005060 /* In case PyUnicode_FromObject() mutated seq. */
5061 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005062 }
5063 }
5064
5065 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005066 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005067 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005068 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005069 res_p = PyUnicode_AS_UNICODE(res);
5070 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005071
Tim Peters05eba1f2004-08-27 21:32:02 +00005072 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005073 Py_ssize_t itemlen;
5074 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005075
5076 item = PySequence_Fast_GET_ITEM(fseq, i);
5077 /* Convert item to Unicode. */
5078 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5079 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005080 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005081 " %.80s found",
5082 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005083 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005084 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005085 item = PyUnicode_FromObject(item);
5086 if (item == NULL)
5087 goto onError;
5088 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005089
Tim Peters91879ab2004-08-27 22:35:44 +00005090 /* In case PyUnicode_FromObject() mutated seq. */
5091 seqlen = PySequence_Fast_GET_SIZE(fseq);
5092
Tim Peters8ce9f162004-08-27 01:49:32 +00005093 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005095 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005096 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005097 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005098 if (i < seqlen - 1) {
5099 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005100 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005101 goto Overflow;
5102 }
5103 if (new_res_used > res_alloc) {
5104 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005105 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005106 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005107 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005108 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005109 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005110 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005111 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005113 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005114 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005116
5117 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005118 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005119 res_p += itemlen;
5120 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005121 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005122 res_p += seplen;
5123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005125 res_used = new_res_used;
5126 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005127
Tim Peters05eba1f2004-08-27 21:32:02 +00005128 /* Shrink res to match the used area; this probably can't fail,
5129 * but it's cheap to check.
5130 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005131 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005132 goto onError;
5133
5134 Done:
5135 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005136 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 return (PyObject *)res;
5138
Tim Peters8ce9f162004-08-27 01:49:32 +00005139 Overflow:
5140 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005141 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005142 Py_DECREF(item);
5143 /* fall through */
5144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005146 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005147 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005148 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 return NULL;
5150}
5151
Tim Petersced69f82003-09-16 20:30:58 +00005152static
5153PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005154 Py_ssize_t left,
5155 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 Py_UNICODE fill)
5157{
5158 PyUnicodeObject *u;
5159
5160 if (left < 0)
5161 left = 0;
5162 if (right < 0)
5163 right = 0;
5164
Tim Peters7a29bd52001-09-12 03:03:31 +00005165 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 Py_INCREF(self);
5167 return self;
5168 }
5169
5170 u = _PyUnicode_New(left + self->length + right);
5171 if (u) {
5172 if (left)
5173 Py_UNICODE_FILL(u->str, fill, left);
5174 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5175 if (right)
5176 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5177 }
5178
5179 return u;
5180}
5181
/* Append the slice data[left:right] to `list` as a new Unicode object.
 *
 * Relies on the expansion site providing a `PyObject *str` scratch
 * variable, a `list` variable and an `onError` label, which is jumped to
 * if either the object creation or the append fails.  The temporary
 * reference held in `str` is released on every path.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * cannot capture a following `else`.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5192
5193static
5194PyObject *split_whitespace(PyUnicodeObject *self,
5195 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005196 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005198 register Py_ssize_t i;
5199 register Py_ssize_t j;
5200 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 PyObject *str;
5202
5203 for (i = j = 0; i < len; ) {
5204 /* find a token */
5205 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5206 i++;
5207 j = i;
5208 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5209 i++;
5210 if (j < i) {
5211 if (maxcount-- <= 0)
5212 break;
5213 SPLIT_APPEND(self->str, j, i);
5214 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5215 i++;
5216 j = i;
5217 }
5218 }
5219 if (j < len) {
5220 SPLIT_APPEND(self->str, j, len);
5221 }
5222 return list;
5223
5224 onError:
5225 Py_DECREF(list);
5226 return NULL;
5227}
5228
5229PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005230 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 register Py_ssize_t i;
5233 register Py_ssize_t j;
5234 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 PyObject *list;
5236 PyObject *str;
5237 Py_UNICODE *data;
5238
5239 string = PyUnicode_FromObject(string);
5240 if (string == NULL)
5241 return NULL;
5242 data = PyUnicode_AS_UNICODE(string);
5243 len = PyUnicode_GET_SIZE(string);
5244
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 list = PyList_New(0);
5246 if (!list)
5247 goto onError;
5248
5249 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005250 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005251
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005253 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
5256 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005257 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 if (i < len) {
5259 if (data[i] == '\r' && i + 1 < len &&
5260 data[i+1] == '\n')
5261 i += 2;
5262 else
5263 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005264 if (keepends)
5265 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 }
Guido van Rossum86662912000-04-11 15:38:46 +00005267 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 j = i;
5269 }
5270 if (j < len) {
5271 SPLIT_APPEND(data, j, len);
5272 }
5273
5274 Py_DECREF(string);
5275 return list;
5276
5277 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005278 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 Py_DECREF(string);
5280 return NULL;
5281}
5282
Tim Petersced69f82003-09-16 20:30:58 +00005283static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284PyObject *split_char(PyUnicodeObject *self,
5285 PyObject *list,
5286 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005287 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005289 register Py_ssize_t i;
5290 register Py_ssize_t j;
5291 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 PyObject *str;
5293
5294 for (i = j = 0; i < len; ) {
5295 if (self->str[i] == ch) {
5296 if (maxcount-- <= 0)
5297 break;
5298 SPLIT_APPEND(self->str, j, i);
5299 i = j = i + 1;
5300 } else
5301 i++;
5302 }
5303 if (j <= len) {
5304 SPLIT_APPEND(self->str, j, len);
5305 }
5306 return list;
5307
5308 onError:
5309 Py_DECREF(list);
5310 return NULL;
5311}
5312
Tim Petersced69f82003-09-16 20:30:58 +00005313static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314PyObject *split_substring(PyUnicodeObject *self,
5315 PyObject *list,
5316 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005317 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005319 register Py_ssize_t i;
5320 register Py_ssize_t j;
5321 Py_ssize_t len = self->length;
5322 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 PyObject *str;
5324
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005325 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 if (Py_UNICODE_MATCH(self, i, substring)) {
5327 if (maxcount-- <= 0)
5328 break;
5329 SPLIT_APPEND(self->str, j, i);
5330 i = j = i + sublen;
5331 } else
5332 i++;
5333 }
5334 if (j <= len) {
5335 SPLIT_APPEND(self->str, j, len);
5336 }
5337 return list;
5338
5339 onError:
5340 Py_DECREF(list);
5341 return NULL;
5342}
5343
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005344static
5345PyObject *rsplit_whitespace(PyUnicodeObject *self,
5346 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005347 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005348{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005349 register Py_ssize_t i;
5350 register Py_ssize_t j;
5351 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005352 PyObject *str;
5353
5354 for (i = j = len - 1; i >= 0; ) {
5355 /* find a token */
5356 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5357 i--;
5358 j = i;
5359 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5360 i--;
5361 if (j > i) {
5362 if (maxcount-- <= 0)
5363 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005364 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005365 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5366 i--;
5367 j = i;
5368 }
5369 }
5370 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005371 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005372 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005373 if (PyList_Reverse(list) < 0)
5374 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005375 return list;
5376
5377 onError:
5378 Py_DECREF(list);
5379 return NULL;
5380}
5381
5382static
5383PyObject *rsplit_char(PyUnicodeObject *self,
5384 PyObject *list,
5385 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005387{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005388 register Py_ssize_t i;
5389 register Py_ssize_t j;
5390 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005391 PyObject *str;
5392
5393 for (i = j = len - 1; i >= 0; ) {
5394 if (self->str[i] == ch) {
5395 if (maxcount-- <= 0)
5396 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005397 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005398 j = i = i - 1;
5399 } else
5400 i--;
5401 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005402 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005403 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005404 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005405 if (PyList_Reverse(list) < 0)
5406 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005407 return list;
5408
5409 onError:
5410 Py_DECREF(list);
5411 return NULL;
5412}
5413
5414static
5415PyObject *rsplit_substring(PyUnicodeObject *self,
5416 PyObject *list,
5417 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005418 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005419{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005420 register Py_ssize_t i;
5421 register Py_ssize_t j;
5422 Py_ssize_t len = self->length;
5423 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005424 PyObject *str;
5425
5426 for (i = len - sublen, j = len; i >= 0; ) {
5427 if (Py_UNICODE_MATCH(self, i, substring)) {
5428 if (maxcount-- <= 0)
5429 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005430 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005431 j = i;
5432 i -= sublen;
5433 } else
5434 i--;
5435 }
5436 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005437 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005438 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005439 if (PyList_Reverse(list) < 0)
5440 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005441 return list;
5442
5443 onError:
5444 Py_DECREF(list);
5445 return NULL;
5446}
5447
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448#undef SPLIT_APPEND
5449
5450static
5451PyObject *split(PyUnicodeObject *self,
5452 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005453 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454{
5455 PyObject *list;
5456
5457 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005458 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459
5460 list = PyList_New(0);
5461 if (!list)
5462 return NULL;
5463
5464 if (substring == NULL)
5465 return split_whitespace(self,list,maxcount);
5466
5467 else if (substring->length == 1)
5468 return split_char(self,list,substring->str[0],maxcount);
5469
5470 else if (substring->length == 0) {
5471 Py_DECREF(list);
5472 PyErr_SetString(PyExc_ValueError, "empty separator");
5473 return NULL;
5474 }
5475 else
5476 return split_substring(self,list,substring,maxcount);
5477}
5478
Tim Petersced69f82003-09-16 20:30:58 +00005479static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005480PyObject *rsplit(PyUnicodeObject *self,
5481 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005482 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005483{
5484 PyObject *list;
5485
5486 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005487 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005488
5489 list = PyList_New(0);
5490 if (!list)
5491 return NULL;
5492
5493 if (substring == NULL)
5494 return rsplit_whitespace(self,list,maxcount);
5495
5496 else if (substring->length == 1)
5497 return rsplit_char(self,list,substring->str[0],maxcount);
5498
5499 else if (substring->length == 0) {
5500 Py_DECREF(list);
5501 PyErr_SetString(PyExc_ValueError, "empty separator");
5502 return NULL;
5503 }
5504 else
5505 return rsplit_substring(self,list,substring,maxcount);
5506}
5507
5508static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509PyObject *replace(PyUnicodeObject *self,
5510 PyUnicodeObject *str1,
5511 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005512 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513{
5514 PyUnicodeObject *u;
5515
5516 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005517 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518
Thomas Wouters477c8d52006-05-27 19:21:47 +00005519 if (str1->length == str2->length) {
5520 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005521 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005522 if (str1->length == 1) {
5523 /* replace characters */
5524 Py_UNICODE u1, u2;
5525 if (!findchar(self->str, self->length, str1->str[0]))
5526 goto nothing;
5527 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5528 if (!u)
5529 return NULL;
5530 Py_UNICODE_COPY(u->str, self->str, self->length);
5531 u1 = str1->str[0];
5532 u2 = str2->str[0];
5533 for (i = 0; i < u->length; i++)
5534 if (u->str[i] == u1) {
5535 if (--maxcount < 0)
5536 break;
5537 u->str[i] = u2;
5538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005540 i = fastsearch(
5541 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005543 if (i < 0)
5544 goto nothing;
5545 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5546 if (!u)
5547 return NULL;
5548 Py_UNICODE_COPY(u->str, self->str, self->length);
5549 while (i <= self->length - str1->length)
5550 if (Py_UNICODE_MATCH(self, i, str1)) {
5551 if (--maxcount < 0)
5552 break;
5553 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5554 i += str1->length;
5555 } else
5556 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005559
5560 Py_ssize_t n, i, j, e;
5561 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 Py_UNICODE *p;
5563
5564 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005565 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 if (n > maxcount)
5567 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005568 if (n == 0)
5569 goto nothing;
5570 /* new_size = self->length + n * (str2->length - str1->length)); */
5571 delta = (str2->length - str1->length);
5572 if (delta == 0) {
5573 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005575 product = n * (str2->length - str1->length);
5576 if ((product / (str2->length - str1->length)) != n) {
5577 PyErr_SetString(PyExc_OverflowError,
5578 "replace string is too long");
5579 return NULL;
5580 }
5581 new_size = self->length + product;
5582 if (new_size < 0) {
5583 PyErr_SetString(PyExc_OverflowError,
5584 "replace string is too long");
5585 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 }
5587 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005588 u = _PyUnicode_New(new_size);
5589 if (!u)
5590 return NULL;
5591 i = 0;
5592 p = u->str;
5593 e = self->length - str1->length;
5594 if (str1->length > 0) {
5595 while (n-- > 0) {
5596 /* look for next match */
5597 j = i;
5598 while (j <= e) {
5599 if (Py_UNICODE_MATCH(self, j, str1))
5600 break;
5601 j++;
5602 }
5603 if (j > i) {
5604 if (j > e)
5605 break;
5606 /* copy unchanged part [i:j] */
5607 Py_UNICODE_COPY(p, self->str+i, j-i);
5608 p += j - i;
5609 }
5610 /* copy substitution string */
5611 if (str2->length > 0) {
5612 Py_UNICODE_COPY(p, str2->str, str2->length);
5613 p += str2->length;
5614 }
5615 i = j + str1->length;
5616 }
5617 if (i < self->length)
5618 /* copy tail [i:] */
5619 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5620 } else {
5621 /* interleave */
5622 while (n > 0) {
5623 Py_UNICODE_COPY(p, str2->str, str2->length);
5624 p += str2->length;
5625 if (--n <= 0)
5626 break;
5627 *p++ = self->str[i++];
5628 }
5629 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005633
5634nothing:
5635 /* nothing to replace; return original string (when possible) */
5636 if (PyUnicode_CheckExact(self)) {
5637 Py_INCREF(self);
5638 return (PyObject *) self;
5639 }
5640 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641}
5642
5643/* --- Unicode Object Methods --------------------------------------------- */
5644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005645PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646"S.title() -> unicode\n\
5647\n\
5648Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005649characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
5651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005652unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 return fixup(self, fixtitle);
5655}
5656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005657PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658"S.capitalize() -> unicode\n\
5659\n\
5660Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005661have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662
5663static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005664unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 return fixup(self, fixcapitalize);
5667}
5668
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method S.capwords(): split self with split(self, NULL, -1),
   run fixup(..., fixcapitalize) on every piece and join the pieces with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old entry, so do it here */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5706
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005707/* Argument converter. Coerces to a single unicode character */
5708
5709static int
5710convert_uc(PyObject *obj, void *addr)
5711{
5712 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5713 PyObject *uniobj;
5714 Py_UNICODE *unistr;
5715
5716 uniobj = PyUnicode_FromObject(obj);
5717 if (uniobj == NULL) {
5718 PyErr_SetString(PyExc_TypeError,
5719 "The fill character cannot be converted to Unicode");
5720 return 0;
5721 }
5722 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5723 PyErr_SetString(PyExc_TypeError,
5724 "The fill character must be exactly one character long");
5725 Py_DECREF(uniobj);
5726 return 0;
5727 }
5728 unistr = PyUnicode_AS_UNICODE(uniobj);
5729 *fillcharloc = unistr[0];
5730 Py_DECREF(uniobj);
5731 return 1;
5732}
5733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005734PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005735"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005737Return S centered in a Unicode string of length width. Padding is\n\
5738done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
5740static PyObject *
5741unicode_center(PyUnicodeObject *self, PyObject *args)
5742{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005743 Py_ssize_t marg, left;
5744 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005745 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
Thomas Woutersde017742006-02-16 19:34:37 +00005747 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 return NULL;
5749
Tim Peters7a29bd52001-09-12 03:03:31 +00005750 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 Py_INCREF(self);
5752 return (PyObject*) self;
5753 }
5754
5755 marg = width - self->length;
5756 left = marg / 2 + (marg & width & 1);
5757
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005758 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759}
5760
Marc-André Lemburge5034372000-08-08 08:04:29 +00005761#if 0
5762
5763/* This code should go into some future Unicode collation support
5764 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005765 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005766
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005767/* speedy UTF-16 code point order comparison */
5768/* gleaned from: */
5769/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5770
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005771static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005772{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005773 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005774 0, 0, 0, 0, 0, 0, 0, 0,
5775 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005776 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005777};
5778
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779static int
5780unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5781{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005782 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005783
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 Py_UNICODE *s1 = str1->str;
5785 Py_UNICODE *s2 = str2->str;
5786
5787 len1 = str1->length;
5788 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005789
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005791 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005792
5793 c1 = *s1++;
5794 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005795
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005796 if (c1 > (1<<11) * 26)
5797 c1 += utf16Fixup[c1>>11];
5798 if (c2 > (1<<11) * 26)
5799 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005800 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005801
5802 if (c1 != c2)
5803 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005804
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005805 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 }
5807
5808 return (len1 < len2) ? -1 : (len1 != len2);
5809}
5810
Marc-André Lemburge5034372000-08-08 08:04:29 +00005811#else
5812
5813static int
5814unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5815{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005816 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005817
5818 Py_UNICODE *s1 = str1->str;
5819 Py_UNICODE *s2 = str2->str;
5820
5821 len1 = str1->length;
5822 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005823
Marc-André Lemburge5034372000-08-08 08:04:29 +00005824 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005825 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005826
Fredrik Lundh45714e92001-06-26 16:39:36 +00005827 c1 = *s1++;
5828 c2 = *s2++;
5829
5830 if (c1 != c2)
5831 return (c1 < c2) ? -1 : 1;
5832
Marc-André Lemburge5034372000-08-08 08:04:29 +00005833 len1--; len2--;
5834 }
5835
5836 return (len1 < len2) ? -1 : (len1 != len2);
5837}
5838
5839#endif
5840
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841int PyUnicode_Compare(PyObject *left,
5842 PyObject *right)
5843{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005844 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5845 return unicode_compare((PyUnicodeObject *)left,
5846 (PyUnicodeObject *)right);
5847 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5848 (PyUnicode_Check(left) && PyString_Check(right))) {
5849 if (PyUnicode_Check(left))
5850 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5851 if (PyUnicode_Check(right))
5852 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5853 assert(PyString_Check(left));
5854 assert(PyString_Check(right));
5855 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005857 PyErr_Format(PyExc_TypeError,
5858 "Can't compare %.100s and %.100s",
5859 left->ob_type->tp_name,
5860 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 return -1;
5862}
5863
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005864PyObject *PyUnicode_RichCompare(PyObject *left,
5865 PyObject *right,
5866 int op)
5867{
5868 int result;
5869
5870 result = PyUnicode_Compare(left, right);
5871 if (result == -1 && PyErr_Occurred())
5872 goto onError;
5873
5874 /* Convert the return value to a Boolean */
5875 switch (op) {
5876 case Py_EQ:
5877 result = (result == 0);
5878 break;
5879 case Py_NE:
5880 result = (result != 0);
5881 break;
5882 case Py_LE:
5883 result = (result <= 0);
5884 break;
5885 case Py_GE:
5886 result = (result >= 0);
5887 break;
5888 case Py_LT:
5889 result = (result == -1);
5890 break;
5891 case Py_GT:
5892 result = (result == 1);
5893 break;
5894 }
5895 return PyBool_FromLong(result);
5896
5897 onError:
5898
5899 /* Standard case
5900
5901 Type errors mean that PyUnicode_FromObject() could not convert
5902 one of the arguments (usually the right hand side) to Unicode,
5903 ie. we can't handle the comparison request. However, it is
5904 possible that the other object knows a comparison method, which
5905 is why we return Py_NotImplemented to give the other object a
5906 chance.
5907
5908 */
5909 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5910 PyErr_Clear();
5911 Py_INCREF(Py_NotImplemented);
5912 return Py_NotImplemented;
5913 }
5914 if (op != Py_EQ && op != Py_NE)
5915 return NULL;
5916
5917 /* Equality comparison.
5918
5919 This is a special case: we silence any PyExc_UnicodeDecodeError
5920 and instead turn it into a PyErr_UnicodeWarning.
5921
5922 */
5923 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5924 return NULL;
5925 PyErr_Clear();
5926 if (PyErr_Warn(PyExc_UnicodeWarning,
5927 (op == Py_EQ) ?
5928 "Unicode equal comparison "
5929 "failed to convert both arguments to Unicode - "
5930 "interpreting them as being unequal" :
5931 "Unicode unequal comparison "
5932 "failed to convert both arguments to Unicode - "
5933 "interpreting them as being unequal"
5934 ) < 0)
5935 return NULL;
5936 result = (op == Py_NE);
5937 return PyBool_FromLong(result);
5938}
5939
Guido van Rossum403d68b2000-03-13 15:55:09 +00005940int PyUnicode_Contains(PyObject *container,
5941 PyObject *element)
5942{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005943 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005944 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005945
5946 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005947 sub = PyUnicode_FromObject(element);
5948 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005949 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005950 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005951 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005952 }
5953
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954 str = PyUnicode_FromObject(container);
5955 if (!str) {
5956 Py_DECREF(sub);
5957 return -1;
5958 }
5959
5960 result = stringlib_contains_obj(str, sub);
5961
5962 Py_DECREF(str);
5963 Py_DECREF(sub);
5964
Guido van Rossum403d68b2000-03-13 15:55:09 +00005965 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005966}
5967
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968/* Concat to string or Unicode object giving a new Unicode object. */
5969
5970PyObject *PyUnicode_Concat(PyObject *left,
5971 PyObject *right)
5972{
5973 PyUnicodeObject *u = NULL, *v = NULL, *w;
5974
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005975 if (PyBytes_Check(left) || PyBytes_Check(right))
5976 return PyBytes_Concat(left, right);
5977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 /* Coerce the two arguments */
5979 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5980 if (u == NULL)
5981 goto onError;
5982 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5983 if (v == NULL)
5984 goto onError;
5985
5986 /* Shortcuts */
5987 if (v == unicode_empty) {
5988 Py_DECREF(v);
5989 return (PyObject *)u;
5990 }
5991 if (u == unicode_empty) {
5992 Py_DECREF(u);
5993 return (PyObject *)v;
5994 }
5995
5996 /* Concat the two Unicode strings */
5997 w = _PyUnicode_New(u->length + v->length);
5998 if (w == NULL)
5999 goto onError;
6000 Py_UNICODE_COPY(w->str, u->str, u->length);
6001 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6002
6003 Py_DECREF(u);
6004 Py_DECREF(v);
6005 return (PyObject *)w;
6006
6007onError:
6008 Py_XDECREF(u);
6009 Py_XDECREF(v);
6010 return NULL;
6011}
6012
Walter Dörwald1ab83302007-05-18 17:15:44 +00006013void
6014PyUnicode_Append(PyObject **pleft, PyObject *right)
6015{
6016 PyObject *new;
6017 if (*pleft == NULL)
6018 return;
6019 if (right == NULL || !PyUnicode_Check(*pleft)) {
6020 Py_DECREF(*pleft);
6021 *pleft = NULL;
6022 return;
6023 }
6024 new = PyUnicode_Concat(*pleft, right);
6025 Py_DECREF(*pleft);
6026 *pleft = new;
6027}
6028
6029void
6030PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6031{
6032 PyUnicode_Append(pleft, right);
6033 Py_XDECREF(right);
6034}
6035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006036PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037"S.count(sub[, start[, end]]) -> int\n\
6038\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006039Return the number of non-overlapping occurrences of substring sub in\n\
6040Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006041interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
6043static PyObject *
6044unicode_count(PyUnicodeObject *self, PyObject *args)
6045{
6046 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006047 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006048 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 PyObject *result;
6050
Guido van Rossumb8872e62000-05-09 14:14:27 +00006051 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6052 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 return NULL;
6054
6055 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006056 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 if (substring == NULL)
6058 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006059
Thomas Wouters477c8d52006-05-27 19:21:47 +00006060 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
Thomas Wouters477c8d52006-05-27 19:21:47 +00006062 result = PyInt_FromSsize_t(
6063 stringlib_count(self->str + start, end - start,
6064 substring->str, substring->length)
6065 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066
6067 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 return result;
6070}
6071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006072PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006073"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006075Encodes S using the codec registered for encoding. encoding defaults\n\
6076to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006077handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6079'xmlcharrefreplace' as well as any other name registered with\n\
6080codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
6082static PyObject *
6083unicode_encode(PyUnicodeObject *self, PyObject *args)
6084{
6085 char *encoding = NULL;
6086 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006087 PyObject *v;
6088
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6090 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006091 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006092 if (v == NULL)
6093 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006094 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006095 if (PyString_Check(v)) {
6096 /* Old codec, turn it into bytes */
6097 PyObject *b = PyBytes_FromObject(v);
6098 Py_DECREF(v);
6099 return b;
6100 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006101 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006102 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006103 "(type=%.400s)",
6104 v->ob_type->tp_name);
6105 Py_DECREF(v);
6106 return NULL;
6107 }
6108 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006109
6110 onError:
6111 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006112}
6113
6114PyDoc_STRVAR(decode__doc__,
6115"S.decode([encoding[,errors]]) -> string or unicode\n\
6116\n\
6117Decodes S using the codec registered for encoding. encoding defaults\n\
6118to the default encoding. errors may be given to set a different error\n\
6119handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6120a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6121as well as any other name registerd with codecs.register_error that is\n\
6122able to handle UnicodeDecodeErrors.");
6123
6124static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006125unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006126{
6127 char *encoding = NULL;
6128 char *errors = NULL;
6129 PyObject *v;
6130
6131 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6132 return NULL;
6133 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006134 if (v == NULL)
6135 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006136 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6137 PyErr_Format(PyExc_TypeError,
6138 "decoder did not return a string/unicode object "
6139 "(type=%.400s)",
6140 v->ob_type->tp_name);
6141 Py_DECREF(v);
6142 return NULL;
6143 }
6144 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006145
6146 onError:
6147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148}
6149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006150PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151"S.expandtabs([tabsize]) -> unicode\n\
6152\n\
6153Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006154If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
6156static PyObject*
6157unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6158{
6159 Py_UNICODE *e;
6160 Py_UNICODE *p;
6161 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006162 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 PyUnicodeObject *u;
6164 int tabsize = 8;
6165
6166 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6167 return NULL;
6168
Thomas Wouters7e474022000-07-16 12:04:32 +00006169 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 i = j = 0;
6171 e = self->str + self->length;
6172 for (p = self->str; p < e; p++)
6173 if (*p == '\t') {
6174 if (tabsize > 0)
6175 j += tabsize - (j % tabsize);
6176 }
6177 else {
6178 j++;
6179 if (*p == '\n' || *p == '\r') {
6180 i += j;
6181 j = 0;
6182 }
6183 }
6184
6185 /* Second pass: create output string and fill it */
6186 u = _PyUnicode_New(i + j);
6187 if (!u)
6188 return NULL;
6189
6190 j = 0;
6191 q = u->str;
6192
6193 for (p = self->str; p < e; p++)
6194 if (*p == '\t') {
6195 if (tabsize > 0) {
6196 i = tabsize - (j % tabsize);
6197 j += i;
6198 while (i--)
6199 *q++ = ' ';
6200 }
6201 }
6202 else {
6203 j++;
6204 *q++ = *p;
6205 if (*p == '\n' || *p == '\r')
6206 j = 0;
6207 }
6208
6209 return (PyObject*) u;
6210}
6211
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006212PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213"S.find(sub [,start [,end]]) -> int\n\
6214\n\
6215Return the lowest index in S where substring sub is found,\n\
6216such that sub is contained within s[start,end]. Optional\n\
6217arguments start and end are interpreted as in slice notation.\n\
6218\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006219Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
6221static PyObject *
6222unicode_find(PyUnicodeObject *self, PyObject *args)
6223{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006224 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006225 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006226 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006227 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
Guido van Rossumb8872e62000-05-09 14:14:27 +00006229 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6230 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006232 substring = PyUnicode_FromObject(substring);
6233 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 return NULL;
6235
Thomas Wouters477c8d52006-05-27 19:21:47 +00006236 result = stringlib_find_slice(
6237 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6238 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6239 start, end
6240 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241
6242 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006243
6244 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245}
6246
6247static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249{
6250 if (index < 0 || index >= self->length) {
6251 PyErr_SetString(PyExc_IndexError, "string index out of range");
6252 return NULL;
6253 }
6254
6255 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6256}
6257
6258static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006259unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006261 /* Since Unicode objects compare equal to their UTF-8 string
6262 counterparts, we hash the UTF-8 string. */
6263 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6264 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265}
6266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006267PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268"S.index(sub [,start [,end]]) -> int\n\
6269\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006270Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
6272static PyObject *
6273unicode_index(PyUnicodeObject *self, PyObject *args)
6274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006275 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006276 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006277 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006278 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279
Guido van Rossumb8872e62000-05-09 14:14:27 +00006280 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6281 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006283 substring = PyUnicode_FromObject(substring);
6284 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 return NULL;
6286
Thomas Wouters477c8d52006-05-27 19:21:47 +00006287 result = stringlib_find_slice(
6288 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6289 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6290 start, end
6291 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292
6293 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006294
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 if (result < 0) {
6296 PyErr_SetString(PyExc_ValueError, "substring not found");
6297 return NULL;
6298 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006299
Martin v. Löwis18e16552006-02-15 17:27:45 +00006300 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301}
6302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006303PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006304"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006306Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006307at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
6309static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006310unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311{
6312 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6313 register const Py_UNICODE *e;
6314 int cased;
6315
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 /* Shortcut for single character strings */
6317 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006318 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006320 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006321 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006322 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006323
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 e = p + PyUnicode_GET_SIZE(self);
6325 cased = 0;
6326 for (; p < e; p++) {
6327 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006328
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006330 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 else if (!cased && Py_UNICODE_ISLOWER(ch))
6332 cased = 1;
6333 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006334 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006337PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006338"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006340Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006341at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
6343static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006344unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345{
6346 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6347 register const Py_UNICODE *e;
6348 int cased;
6349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 /* Shortcut for single character strings */
6351 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006352 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006354 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006355 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006356 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006357
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 e = p + PyUnicode_GET_SIZE(self);
6359 cased = 0;
6360 for (; p < e; p++) {
6361 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006362
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006364 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 else if (!cased && Py_UNICODE_ISUPPER(ch))
6366 cased = 1;
6367 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006368 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369}
6370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006371PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006372"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006374Return True if S is a titlecased string and there is at least one\n\
6375character in S, i.e. upper- and titlecase characters may only\n\
6376follow uncased characters and lowercase characters only cased ones.\n\
6377Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
6379static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006380unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381{
6382 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6383 register const Py_UNICODE *e;
6384 int cased, previous_is_cased;
6385
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 /* Shortcut for single character strings */
6387 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006388 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6389 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006391 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006392 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006393 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006394
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 e = p + PyUnicode_GET_SIZE(self);
6396 cased = 0;
6397 previous_is_cased = 0;
6398 for (; p < e; p++) {
6399 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006400
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6402 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006403 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 previous_is_cased = 1;
6405 cased = 1;
6406 }
6407 else if (Py_UNICODE_ISLOWER(ch)) {
6408 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006409 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 previous_is_cased = 1;
6411 cased = 1;
6412 }
6413 else
6414 previous_is_cased = 0;
6415 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006416 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417}
6418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006419PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006422Return True if all characters in S are whitespace\n\
6423and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
6425static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006426unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
6428 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6429 register const Py_UNICODE *e;
6430
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 /* Shortcut for single character strings */
6432 if (PyUnicode_GET_SIZE(self) == 1 &&
6433 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006434 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006436 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006437 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006438 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 e = p + PyUnicode_GET_SIZE(self);
6441 for (; p < e; p++) {
6442 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006443 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006445 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446}
6447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006448PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006449"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006450\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006451Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006452and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006453
6454static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006455unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006456{
6457 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6458 register const Py_UNICODE *e;
6459
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006460 /* Shortcut for single character strings */
6461 if (PyUnicode_GET_SIZE(self) == 1 &&
6462 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006463 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006464
6465 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006466 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006467 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006468
6469 e = p + PyUnicode_GET_SIZE(self);
6470 for (; p < e; p++) {
6471 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006472 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006473 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006474 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006475}
6476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006477PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006478"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006479\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006480Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006481and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006482
6483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006484unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006485{
6486 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6487 register const Py_UNICODE *e;
6488
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006489 /* Shortcut for single character strings */
6490 if (PyUnicode_GET_SIZE(self) == 1 &&
6491 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006492 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006493
6494 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006495 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006496 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006497
6498 e = p + PyUnicode_GET_SIZE(self);
6499 for (; p < e; p++) {
6500 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006501 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006502 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006503 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006504}
6505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006506PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006507"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006509Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006510False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
6512static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006513unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514{
6515 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6516 register const Py_UNICODE *e;
6517
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 /* Shortcut for single character strings */
6519 if (PyUnicode_GET_SIZE(self) == 1 &&
6520 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006521 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006523 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006524 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006525 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006526
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 e = p + PyUnicode_GET_SIZE(self);
6528 for (; p < e; p++) {
6529 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006530 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006532 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533}
6534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006535PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006536"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006538Return True if all characters in S are digits\n\
6539and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540
6541static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006542unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543{
6544 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6545 register const Py_UNICODE *e;
6546
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 /* Shortcut for single character strings */
6548 if (PyUnicode_GET_SIZE(self) == 1 &&
6549 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006550 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006552 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006553 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006554 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006555
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 e = p + PyUnicode_GET_SIZE(self);
6557 for (; p < e; p++) {
6558 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006559 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006561 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562}
6563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006564PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006565"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006567Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006568False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569
6570static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006571unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572{
6573 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6574 register const Py_UNICODE *e;
6575
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 /* Shortcut for single character strings */
6577 if (PyUnicode_GET_SIZE(self) == 1 &&
6578 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006579 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006581 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006582 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006583 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006584
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 e = p + PyUnicode_GET_SIZE(self);
6586 for (; p < e; p++) {
6587 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006588 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006590 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591}
6592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006593PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594"S.join(sequence) -> unicode\n\
6595\n\
6596Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006597sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598
6599static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006600unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006602 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603}
6604
Martin v. Löwis18e16552006-02-15 17:27:45 +00006605static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606unicode_length(PyUnicodeObject *self)
6607{
6608 return self->length;
6609}
6610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006611PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006612"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613\n\
6614Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006615done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616
6617static PyObject *
6618unicode_ljust(PyUnicodeObject *self, PyObject *args)
6619{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006620 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006621 Py_UNICODE fillchar = ' ';
6622
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006623 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 return NULL;
6625
Tim Peters7a29bd52001-09-12 03:03:31 +00006626 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 Py_INCREF(self);
6628 return (PyObject*) self;
6629 }
6630
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006631 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632}
6633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006634PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635"S.lower() -> unicode\n\
6636\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006637Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638
6639static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006640unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 return fixup(self, fixlower);
6643}
6644
/* Strip modes accepted by the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6653
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006654/* externally visible for str.strip(unicode) */
6655PyObject *
6656_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6657{
6658 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006659 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006660 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006661 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6662 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006663
Thomas Wouters477c8d52006-05-27 19:21:47 +00006664 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6665
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006666 i = 0;
6667 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006668 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6669 i++;
6670 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006671 }
6672
6673 j = len;
6674 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006675 do {
6676 j--;
6677 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6678 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006679 }
6680
6681 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006682 Py_INCREF(self);
6683 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006684 }
6685 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006686 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006687}
6688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689
6690static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006691do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006693 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006695
6696 i = 0;
6697 if (striptype != RIGHTSTRIP) {
6698 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6699 i++;
6700 }
6701 }
6702
6703 j = len;
6704 if (striptype != LEFTSTRIP) {
6705 do {
6706 j--;
6707 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6708 j++;
6709 }
6710
6711 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6712 Py_INCREF(self);
6713 return (PyObject*)self;
6714 }
6715 else
6716 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006719
6720static PyObject *
6721do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6722{
6723 PyObject *sep = NULL;
6724
6725 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6726 return NULL;
6727
6728 if (sep != NULL && sep != Py_None) {
6729 if (PyUnicode_Check(sep))
6730 return _PyUnicode_XStrip(self, striptype, sep);
6731 else if (PyString_Check(sep)) {
6732 PyObject *res;
6733 sep = PyUnicode_FromObject(sep);
6734 if (sep==NULL)
6735 return NULL;
6736 res = _PyUnicode_XStrip(self, striptype, sep);
6737 Py_DECREF(sep);
6738 return res;
6739 }
6740 else {
6741 PyErr_Format(PyExc_TypeError,
6742 "%s arg must be None, unicode or str",
6743 STRIPNAME(striptype));
6744 return NULL;
6745 }
6746 }
6747
6748 return do_strip(self, striptype);
6749}
6750
6751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006752PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006753"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006754\n\
6755Return a copy of the string S with leading and trailing\n\
6756whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006757If chars is given and not None, remove characters in chars instead.\n\
6758If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006759
6760static PyObject *
6761unicode_strip(PyUnicodeObject *self, PyObject *args)
6762{
6763 if (PyTuple_GET_SIZE(args) == 0)
6764 return do_strip(self, BOTHSTRIP); /* Common case */
6765 else
6766 return do_argstrip(self, BOTHSTRIP, args);
6767}
6768
6769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006770PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006771"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006772\n\
6773Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006774If chars is given and not None, remove characters in chars instead.\n\
6775If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006776
6777static PyObject *
6778unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6779{
6780 if (PyTuple_GET_SIZE(args) == 0)
6781 return do_strip(self, LEFTSTRIP); /* Common case */
6782 else
6783 return do_argstrip(self, LEFTSTRIP, args);
6784}
6785
6786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006787PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006788"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006789\n\
6790Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006791If chars is given and not None, remove characters in chars instead.\n\
6792If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006793
6794static PyObject *
6795unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6796{
6797 if (PyTuple_GET_SIZE(args) == 0)
6798 return do_strip(self, RIGHTSTRIP); /* Common case */
6799 else
6800 return do_argstrip(self, RIGHTSTRIP, args);
6801}
6802
6803
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006805unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806{
6807 PyUnicodeObject *u;
6808 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006809 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006810 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
6812 if (len < 0)
6813 len = 0;
6814
Tim Peters7a29bd52001-09-12 03:03:31 +00006815 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 /* no repeat, return original string */
6817 Py_INCREF(str);
6818 return (PyObject*) str;
6819 }
Tim Peters8f422462000-09-09 06:13:41 +00006820
6821 /* ensure # of chars needed doesn't overflow int and # of bytes
6822 * needed doesn't overflow size_t
6823 */
6824 nchars = len * str->length;
6825 if (len && nchars / len != str->length) {
6826 PyErr_SetString(PyExc_OverflowError,
6827 "repeated string is too long");
6828 return NULL;
6829 }
6830 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6831 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6832 PyErr_SetString(PyExc_OverflowError,
6833 "repeated string is too long");
6834 return NULL;
6835 }
6836 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 if (!u)
6838 return NULL;
6839
6840 p = u->str;
6841
Thomas Wouters477c8d52006-05-27 19:21:47 +00006842 if (str->length == 1 && len > 0) {
6843 Py_UNICODE_FILL(p, str->str[0], len);
6844 } else {
6845 Py_ssize_t done = 0; /* number of characters copied this far */
6846 if (done < nchars) {
6847 Py_UNICODE_COPY(p, str->str, str->length);
6848 done = str->length;
6849 }
6850 while (done < nchars) {
6851 int n = (done <= nchars-done) ? done : nchars-done;
6852 Py_UNICODE_COPY(p+done, p, n);
6853 done += n;
6854 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 }
6856
6857 return (PyObject*) u;
6858}
6859
6860PyObject *PyUnicode_Replace(PyObject *obj,
6861 PyObject *subobj,
6862 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006863 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864{
6865 PyObject *self;
6866 PyObject *str1;
6867 PyObject *str2;
6868 PyObject *result;
6869
6870 self = PyUnicode_FromObject(obj);
6871 if (self == NULL)
6872 return NULL;
6873 str1 = PyUnicode_FromObject(subobj);
6874 if (str1 == NULL) {
6875 Py_DECREF(self);
6876 return NULL;
6877 }
6878 str2 = PyUnicode_FromObject(replobj);
6879 if (str2 == NULL) {
6880 Py_DECREF(self);
6881 Py_DECREF(str1);
6882 return NULL;
6883 }
Tim Petersced69f82003-09-16 20:30:58 +00006884 result = replace((PyUnicodeObject *)self,
6885 (PyUnicodeObject *)str1,
6886 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 maxcount);
6888 Py_DECREF(self);
6889 Py_DECREF(str1);
6890 Py_DECREF(str2);
6891 return result;
6892}
6893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006894PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895"S.replace (old, new[, maxsplit]) -> unicode\n\
6896\n\
6897Return a copy of S with all occurrences of substring\n\
6898old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006899given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
6901static PyObject*
6902unicode_replace(PyUnicodeObject *self, PyObject *args)
6903{
6904 PyUnicodeObject *str1;
6905 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006906 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 PyObject *result;
6908
Martin v. Löwis18e16552006-02-15 17:27:45 +00006909 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 return NULL;
6911 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6912 if (str1 == NULL)
6913 return NULL;
6914 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006915 if (str2 == NULL) {
6916 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919
6920 result = replace(self, str1, str2, maxcount);
6921
6922 Py_DECREF(str1);
6923 Py_DECREF(str2);
6924 return result;
6925}
6926
6927static
6928PyObject *unicode_repr(PyObject *unicode)
6929{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006930 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006931 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006932 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6933 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6934
6935 /* XXX(nnorwitz): rather than over-allocating, it would be
6936 better to choose a different scheme. Perhaps scan the
6937 first N-chars of the string and allocate based on that size.
6938 */
6939 /* Initial allocation is based on the longest-possible unichr
6940 escape.
6941
6942 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6943 unichr, so in this case it's the longest unichr escape. In
6944 narrow (UTF-16) builds this is five chars per source unichr
6945 since there are two unichrs in the surrogate pair, so in narrow
6946 (UTF-16) builds it's not the longest unichr escape.
6947
6948 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6949 so in the narrow (UTF-16) build case it's the longest unichr
6950 escape.
6951 */
6952
Walter Dörwald1ab83302007-05-18 17:15:44 +00006953 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00006954 2 /* quotes */
6955#ifdef Py_UNICODE_WIDE
6956 + 10*size
6957#else
6958 + 6*size
6959#endif
6960 + 1);
6961 if (repr == NULL)
6962 return NULL;
6963
Walter Dörwald1ab83302007-05-18 17:15:44 +00006964 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00006965
6966 /* Add quote */
6967 *p++ = (findchar(s, size, '\'') &&
6968 !findchar(s, size, '"')) ? '"' : '\'';
6969 while (size-- > 0) {
6970 Py_UNICODE ch = *s++;
6971
6972 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006973 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00006974 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006975 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006976 continue;
6977 }
6978
6979#ifdef Py_UNICODE_WIDE
6980 /* Map 21-bit characters to '\U00xxxxxx' */
6981 else if (ch >= 0x10000) {
6982 *p++ = '\\';
6983 *p++ = 'U';
6984 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6985 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6986 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6987 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6988 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6989 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6990 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6991 *p++ = hexdigits[ch & 0x0000000F];
6992 continue;
6993 }
6994#else
6995 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6996 else if (ch >= 0xD800 && ch < 0xDC00) {
6997 Py_UNICODE ch2;
6998 Py_UCS4 ucs;
6999
7000 ch2 = *s++;
7001 size--;
7002 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7003 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7004 *p++ = '\\';
7005 *p++ = 'U';
7006 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7007 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7008 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7009 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7010 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7011 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7012 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7013 *p++ = hexdigits[ucs & 0x0000000F];
7014 continue;
7015 }
7016 /* Fall through: isolated surrogates are copied as-is */
7017 s--;
7018 size++;
7019 }
7020#endif
7021
7022 /* Map 16-bit characters to '\uxxxx' */
7023 if (ch >= 256) {
7024 *p++ = '\\';
7025 *p++ = 'u';
7026 *p++ = hexdigits[(ch >> 12) & 0x000F];
7027 *p++ = hexdigits[(ch >> 8) & 0x000F];
7028 *p++ = hexdigits[(ch >> 4) & 0x000F];
7029 *p++ = hexdigits[ch & 0x000F];
7030 }
7031
7032 /* Map special whitespace to '\t', \n', '\r' */
7033 else if (ch == '\t') {
7034 *p++ = '\\';
7035 *p++ = 't';
7036 }
7037 else if (ch == '\n') {
7038 *p++ = '\\';
7039 *p++ = 'n';
7040 }
7041 else if (ch == '\r') {
7042 *p++ = '\\';
7043 *p++ = 'r';
7044 }
7045
7046 /* Map non-printable US ASCII to '\xhh' */
7047 else if (ch < ' ' || ch >= 0x7F) {
7048 *p++ = '\\';
7049 *p++ = 'x';
7050 *p++ = hexdigits[(ch >> 4) & 0x000F];
7051 *p++ = hexdigits[ch & 0x000F];
7052 }
7053
7054 /* Copy everything else as-is */
7055 else
7056 *p++ = (char) ch;
7057 }
7058 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007059 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007060
7061 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007062 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007063 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067"S.rfind(sub [,start [,end]]) -> int\n\
7068\n\
7069Return the highest index in S where substring sub is found,\n\
7070such that sub is contained within s[start,end]. Optional\n\
7071arguments start and end are interpreted as in slice notation.\n\
7072\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007073Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074
7075static PyObject *
7076unicode_rfind(PyUnicodeObject *self, PyObject *args)
7077{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007078 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007079 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007080 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007081 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082
Guido van Rossumb8872e62000-05-09 14:14:27 +00007083 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7084 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007086 substring = PyUnicode_FromObject(substring);
7087 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 return NULL;
7089
Thomas Wouters477c8d52006-05-27 19:21:47 +00007090 result = stringlib_rfind_slice(
7091 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7092 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7093 start, end
7094 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
7096 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007097
7098 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099}
7100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007101PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102"S.rindex(sub [,start [,end]]) -> int\n\
7103\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
7106static PyObject *
7107unicode_rindex(PyUnicodeObject *self, PyObject *args)
7108{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007109 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007110 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007111 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007112 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113
Guido van Rossumb8872e62000-05-09 14:14:27 +00007114 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7115 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007117 substring = PyUnicode_FromObject(substring);
7118 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 return NULL;
7120
Thomas Wouters477c8d52006-05-27 19:21:47 +00007121 result = stringlib_rfind_slice(
7122 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7123 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7124 start, end
7125 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126
7127 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007128
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 if (result < 0) {
7130 PyErr_SetString(PyExc_ValueError, "substring not found");
7131 return NULL;
7132 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007133 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134}
7135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007136PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007137"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138\n\
7139Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007140done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
7142static PyObject *
7143unicode_rjust(PyUnicodeObject *self, PyObject *args)
7144{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007145 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007146 Py_UNICODE fillchar = ' ';
7147
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007148 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 return NULL;
7150
Tim Peters7a29bd52001-09-12 03:03:31 +00007151 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 Py_INCREF(self);
7153 return (PyObject*) self;
7154 }
7155
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007156 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157}
7158
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007160unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161{
7162 /* standard clamping */
7163 if (start < 0)
7164 start = 0;
7165 if (end < 0)
7166 end = 0;
7167 if (end > self->length)
7168 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007169 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 /* full slice, return original string */
7171 Py_INCREF(self);
7172 return (PyObject*) self;
7173 }
7174 if (start > end)
7175 start = end;
7176 /* copy slice */
7177 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7178 end - start);
7179}
7180
7181PyObject *PyUnicode_Split(PyObject *s,
7182 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007183 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184{
7185 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007186
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 s = PyUnicode_FromObject(s);
7188 if (s == NULL)
7189 return NULL;
7190 if (sep != NULL) {
7191 sep = PyUnicode_FromObject(sep);
7192 if (sep == NULL) {
7193 Py_DECREF(s);
7194 return NULL;
7195 }
7196 }
7197
7198 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7199
7200 Py_DECREF(s);
7201 Py_XDECREF(sep);
7202 return result;
7203}
7204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007205PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206"S.split([sep [,maxsplit]]) -> list of strings\n\
7207\n\
7208Return a list of the words in S, using sep as the\n\
7209delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007210splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007211any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
7213static PyObject*
7214unicode_split(PyUnicodeObject *self, PyObject *args)
7215{
7216 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007217 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 return NULL;
7221
7222 if (substring == Py_None)
7223 return split(self, NULL, maxcount);
7224 else if (PyUnicode_Check(substring))
7225 return split(self, (PyUnicodeObject *)substring, maxcount);
7226 else
7227 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7228}
7229
Thomas Wouters477c8d52006-05-27 19:21:47 +00007230PyObject *
7231PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7232{
7233 PyObject* str_obj;
7234 PyObject* sep_obj;
7235 PyObject* out;
7236
7237 str_obj = PyUnicode_FromObject(str_in);
7238 if (!str_obj)
7239 return NULL;
7240 sep_obj = PyUnicode_FromObject(sep_in);
7241 if (!sep_obj) {
7242 Py_DECREF(str_obj);
7243 return NULL;
7244 }
7245
7246 out = stringlib_partition(
7247 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7248 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7249 );
7250
7251 Py_DECREF(sep_obj);
7252 Py_DECREF(str_obj);
7253
7254 return out;
7255}
7256
7257
7258PyObject *
7259PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7260{
7261 PyObject* str_obj;
7262 PyObject* sep_obj;
7263 PyObject* out;
7264
7265 str_obj = PyUnicode_FromObject(str_in);
7266 if (!str_obj)
7267 return NULL;
7268 sep_obj = PyUnicode_FromObject(sep_in);
7269 if (!sep_obj) {
7270 Py_DECREF(str_obj);
7271 return NULL;
7272 }
7273
7274 out = stringlib_rpartition(
7275 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7276 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7277 );
7278
7279 Py_DECREF(sep_obj);
7280 Py_DECREF(str_obj);
7281
7282 return out;
7283}
7284
7285PyDoc_STRVAR(partition__doc__,
7286"S.partition(sep) -> (head, sep, tail)\n\
7287\n\
7288Searches for the separator sep in S, and returns the part before it,\n\
7289the separator itself, and the part after it. If the separator is not\n\
7290found, returns S and two empty strings.");
7291
7292static PyObject*
7293unicode_partition(PyUnicodeObject *self, PyObject *separator)
7294{
7295 return PyUnicode_Partition((PyObject *)self, separator);
7296}
7297
7298PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007299"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007300\n\
7301Searches for the separator sep in S, starting at the end of S, and returns\n\
7302the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007303separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007304
7305static PyObject*
7306unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7307{
7308 return PyUnicode_RPartition((PyObject *)self, separator);
7309}
7310
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007311PyObject *PyUnicode_RSplit(PyObject *s,
7312 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007313 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007314{
7315 PyObject *result;
7316
7317 s = PyUnicode_FromObject(s);
7318 if (s == NULL)
7319 return NULL;
7320 if (sep != NULL) {
7321 sep = PyUnicode_FromObject(sep);
7322 if (sep == NULL) {
7323 Py_DECREF(s);
7324 return NULL;
7325 }
7326 }
7327
7328 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7329
7330 Py_DECREF(s);
7331 Py_XDECREF(sep);
7332 return result;
7333}
7334
7335PyDoc_STRVAR(rsplit__doc__,
7336"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7337\n\
7338Return a list of the words in S, using sep as the\n\
7339delimiter string, starting at the end of the string and\n\
7340working to the front. If maxsplit is given, at most maxsplit\n\
7341splits are done. If sep is not specified, any whitespace string\n\
7342is a separator.");
7343
7344static PyObject*
7345unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7346{
7347 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007348 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007349
Martin v. Löwis18e16552006-02-15 17:27:45 +00007350 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007351 return NULL;
7352
7353 if (substring == Py_None)
7354 return rsplit(self, NULL, maxcount);
7355 else if (PyUnicode_Check(substring))
7356 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7357 else
7358 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7359}
7360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007361PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007362"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363\n\
7364Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007365Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007366is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368static PyObject*
7369unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7370{
Guido van Rossum86662912000-04-11 15:38:46 +00007371 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
Guido van Rossum86662912000-04-11 15:38:46 +00007373 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374 return NULL;
7375
Guido van Rossum86662912000-04-11 15:38:46 +00007376 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377}
7378
7379static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007380PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381{
Walter Dörwald346737f2007-05-31 10:44:43 +00007382 if (PyUnicode_CheckExact(self)) {
7383 Py_INCREF(self);
7384 return self;
7385 } else
7386 /* Subtype -- return genuine unicode string with the same value. */
7387 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7388 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389}
7390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007391PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392"S.swapcase() -> unicode\n\
7393\n\
7394Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007395and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396
7397static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007398unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 return fixup(self, fixswapcase);
7401}
7402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007403PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404"S.translate(table) -> unicode\n\
7405\n\
7406Return a copy of the string S, where all characters have been mapped\n\
7407through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007408Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7409Unmapped characters are left untouched. Characters mapped to None\n\
7410are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
7412static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007413unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414{
Tim Petersced69f82003-09-16 20:30:58 +00007415 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007417 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 "ignore");
7419}
7420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007421PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422"S.upper() -> unicode\n\
7423\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007424Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425
7426static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007427unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 return fixup(self, fixupper);
7430}
7431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007432PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433"S.zfill(width) -> unicode\n\
7434\n\
7435Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007436of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437
7438static PyObject *
7439unicode_zfill(PyUnicodeObject *self, PyObject *args)
7440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442 PyUnicodeObject *u;
7443
Martin v. Löwis18e16552006-02-15 17:27:45 +00007444 Py_ssize_t width;
7445 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 return NULL;
7447
7448 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007449 if (PyUnicode_CheckExact(self)) {
7450 Py_INCREF(self);
7451 return (PyObject*) self;
7452 }
7453 else
7454 return PyUnicode_FromUnicode(
7455 PyUnicode_AS_UNICODE(self),
7456 PyUnicode_GET_SIZE(self)
7457 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 }
7459
7460 fill = width - self->length;
7461
7462 u = pad(self, fill, 0, '0');
7463
Walter Dörwald068325e2002-04-15 13:36:47 +00007464 if (u == NULL)
7465 return NULL;
7466
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 if (u->str[fill] == '+' || u->str[fill] == '-') {
7468 /* move sign to beginning of string */
7469 u->str[0] = u->str[fill];
7470 u->str[fill] = '0';
7471 }
7472
7473 return (PyObject*) u;
7474}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as
   a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007484PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007485"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007487Return True if S starts with the specified prefix, False otherwise.\n\
7488With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007489With optional end, stop comparing S at that position.\n\
7490prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491
7492static PyObject *
7493unicode_startswith(PyUnicodeObject *self,
7494 PyObject *args)
7495{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007498 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007499 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007502 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007503 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505 if (PyTuple_Check(subobj)) {
7506 Py_ssize_t i;
7507 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7508 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7509 PyTuple_GET_ITEM(subobj, i));
7510 if (substring == NULL)
7511 return NULL;
7512 result = tailmatch(self, substring, start, end, -1);
7513 Py_DECREF(substring);
7514 if (result) {
7515 Py_RETURN_TRUE;
7516 }
7517 }
7518 /* nothing matched */
7519 Py_RETURN_FALSE;
7520 }
7521 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007523 return NULL;
7524 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527}
7528
7529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007530PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007531"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007533Return True if S ends with the specified suffix, False otherwise.\n\
7534With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535With optional end, stop comparing S at that position.\n\
7536suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
7538static PyObject *
7539unicode_endswith(PyUnicodeObject *self,
7540 PyObject *args)
7541{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007542 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007544 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007545 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007548 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7549 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007551 if (PyTuple_Check(subobj)) {
7552 Py_ssize_t i;
7553 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7554 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7555 PyTuple_GET_ITEM(subobj, i));
7556 if (substring == NULL)
7557 return NULL;
7558 result = tailmatch(self, substring, start, end, +1);
7559 Py_DECREF(substring);
7560 if (result) {
7561 Py_RETURN_TRUE;
7562 }
7563 }
7564 Py_RETURN_FALSE;
7565 }
7566 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007570 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007572 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573}
7574
7575
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007576
7577static PyObject *
7578unicode_getnewargs(PyUnicodeObject *v)
7579{
7580 return Py_BuildValue("(u#)", v->str, v->length);
7581}
7582
7583
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584static PyMethodDef unicode_methods[] = {
7585
7586 /* Order is according to common usage: often used methods should
7587 appear first, since lookup is done sequentially. */
7588
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007589 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7590 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7591 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007592 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007593 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7594 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7595 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7596 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7597 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7598 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7599 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007600 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007601 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7602 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7603 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007604 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007605 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007606/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7607 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7608 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7609 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007610 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007611 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007612 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007613 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007614 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7615 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7616 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7617 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7618 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7619 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7620 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7621 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7622 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7623 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7624 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7625 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7626 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7627 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007628 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007629#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007630 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631#endif
7632
7633#if 0
7634 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007635 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636#endif
7637
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007638 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 {NULL, NULL}
7640};
7641
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007642static PyObject *
7643unicode_mod(PyObject *v, PyObject *w)
7644{
7645 if (!PyUnicode_Check(v)) {
7646 Py_INCREF(Py_NotImplemented);
7647 return Py_NotImplemented;
7648 }
7649 return PyUnicode_Format(v, w);
7650}
7651
7652static PyNumberMethods unicode_as_number = {
7653 0, /*nb_add*/
7654 0, /*nb_subtract*/
7655 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007656 unicode_mod, /*nb_remainder*/
7657};
7658
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007660 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007661 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007662 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7663 (ssizeargfunc) unicode_getitem, /* sq_item */
7664 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665 0, /* sq_ass_item */
7666 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007667 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668};
7669
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007670static PyObject*
7671unicode_subscript(PyUnicodeObject* self, PyObject* item)
7672{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007673 if (PyIndex_Check(item)) {
7674 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007675 if (i == -1 && PyErr_Occurred())
7676 return NULL;
7677 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007678 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007679 return unicode_getitem(self, i);
7680 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007681 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007682 Py_UNICODE* source_buf;
7683 Py_UNICODE* result_buf;
7684 PyObject* result;
7685
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007686 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007687 &start, &stop, &step, &slicelength) < 0) {
7688 return NULL;
7689 }
7690
7691 if (slicelength <= 0) {
7692 return PyUnicode_FromUnicode(NULL, 0);
7693 } else {
7694 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007695 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7696 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007697
7698 if (result_buf == NULL)
7699 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007700
7701 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7702 result_buf[i] = source_buf[cur];
7703 }
Tim Petersced69f82003-09-16 20:30:58 +00007704
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007705 result = PyUnicode_FromUnicode(result_buf, slicelength);
7706 PyMem_FREE(result_buf);
7707 return result;
7708 }
7709 } else {
7710 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7711 return NULL;
7712 }
7713}
7714
7715static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007716 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007717 (binaryfunc)unicode_subscript, /* mp_subscript */
7718 (objobjargproc)0, /* mp_ass_subscript */
7719};
7720
Martin v. Löwis18e16552006-02-15 17:27:45 +00007721static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007723 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 const void **ptr)
7725{
7726 if (index != 0) {
7727 PyErr_SetString(PyExc_SystemError,
7728 "accessing non-existent unicode segment");
7729 return -1;
7730 }
7731 *ptr = (void *) self->str;
7732 return PyUnicode_GET_DATA_SIZE(self);
7733}
7734
Martin v. Löwis18e16552006-02-15 17:27:45 +00007735static Py_ssize_t
7736unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 const void **ptr)
7738{
7739 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007740 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 return -1;
7742}
7743
7744static int
7745unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007746 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747{
7748 if (lenp)
7749 *lenp = PyUnicode_GET_DATA_SIZE(self);
7750 return 1;
7751}
7752
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007753static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007755 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 const void **ptr)
7757{
7758 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007759
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760 if (index != 0) {
7761 PyErr_SetString(PyExc_SystemError,
7762 "accessing non-existent unicode segment");
7763 return -1;
7764 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007765 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 if (str == NULL)
7767 return -1;
7768 *ptr = (void *) PyString_AS_STRING(str);
7769 return PyString_GET_SIZE(str);
7770}
7771
/* Helpers for PyUnicode_Format() */
7773
7774static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007775getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007777 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 if (argidx < arglen) {
7779 (*p_argidx)++;
7780 if (arglen < 0)
7781 return args;
7782 else
7783 return PyTuple_GetItem(args, argidx);
7784 }
7785 PyErr_SetString(PyExc_TypeError,
7786 "not enough arguments for format string");
7787 return NULL;
7788}
7789
/* Bit flags recording which flag characters appeared in a format
   specifier; they are OR-ed together into one int. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7795
Martin v. Löwis18e16552006-02-15 17:27:45 +00007796static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007797strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007799 register Py_ssize_t i;
7800 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 for (i = len - 1; i >= 0; i--)
7802 buffer[i] = (Py_UNICODE) charbuffer[i];
7803
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 return len;
7805}
7806
Neal Norwitzfc76d632006-01-10 06:03:13 +00007807static int
7808doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7809{
Tim Peters15231542006-02-16 01:08:01 +00007810 Py_ssize_t result;
7811
Neal Norwitzfc76d632006-01-10 06:03:13 +00007812 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007813 result = strtounicode(buffer, (char *)buffer);
7814 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007815}
7816
7817static int
7818longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7819{
Tim Peters15231542006-02-16 01:08:01 +00007820 Py_ssize_t result;
7821
Neal Norwitzfc76d632006-01-10 06:03:13 +00007822 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007823 result = strtounicode(buffer, (char *)buffer);
7824 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007825}
7826
/* XXX To save some code duplication, formatfloat/long/int could have been
   shared with stringobject.c, converting from 8-bit to Unicode after the
   formatting is done. */
7830
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831static int
7832formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007833 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 int flags,
7835 int prec,
7836 int type,
7837 PyObject *v)
7838{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007839 /* fmt = '%#.' + `prec` + `type`
7840 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 char fmt[20];
7842 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007843
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 x = PyFloat_AsDouble(v);
7845 if (x == -1.0 && PyErr_Occurred())
7846 return -1;
7847 if (prec < 0)
7848 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7850 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007851 /* Worst case length calc to ensure no buffer overrun:
7852
7853 'g' formats:
7854 fmt = %#.<prec>g
7855 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7856 for any double rep.)
7857 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7858
7859 'f' formats:
7860 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7861 len = 1 + 50 + 1 + prec = 52 + prec
7862
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007863 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007864 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007865
7866 */
7867 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7868 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007869 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007870 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007871 return -1;
7872 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007873 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7874 (flags&F_ALT) ? "#" : "",
7875 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007876 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877}
7878
Tim Peters38fd5b62000-09-21 05:43:11 +00007879static PyObject*
7880formatlong(PyObject *val, int flags, int prec, int type)
7881{
7882 char *buf;
7883 int i, len;
7884 PyObject *str; /* temporary string object. */
7885 PyUnicodeObject *result;
7886
7887 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7888 if (!str)
7889 return NULL;
7890 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007891 if (!result) {
7892 Py_DECREF(str);
7893 return NULL;
7894 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007895 for (i = 0; i < len; i++)
7896 result->str[i] = buf[i];
7897 result->str[len] = 0;
7898 Py_DECREF(str);
7899 return (PyObject*)result;
7900}
7901
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902static int
7903formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007904 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 int flags,
7906 int prec,
7907 int type,
7908 PyObject *v)
7909{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007910 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007911 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7912 * + 1 + 1
7913 * = 24
7914 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007915 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007916 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 long x;
7918
7919 x = PyInt_AsLong(v);
7920 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007921 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007922 if (x < 0 && type == 'u') {
7923 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007924 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007925 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7926 sign = "-";
7927 else
7928 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007930 prec = 1;
7931
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007932 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7933 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007934 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007935 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007936 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007937 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007938 return -1;
7939 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007940
7941 if ((flags & F_ALT) &&
7942 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007943 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007944 * of issues that cause pain:
7945 * - when 0 is being converted, the C standard leaves off
7946 * the '0x' or '0X', which is inconsistent with other
7947 * %#x/%#X conversions and inconsistent with Python's
7948 * hex() function
7949 * - there are platforms that violate the standard and
7950 * convert 0 with the '0x' or '0X'
7951 * (Metrowerks, Compaq Tru64)
7952 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007953 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007954 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007955 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007956 * We can achieve the desired consistency by inserting our
7957 * own '0x' or '0X' prefix, and substituting %x/%X in place
7958 * of %#x/%#X.
7959 *
7960 * Note that this is the same approach as used in
7961 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007962 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007963 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7964 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007965 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007966 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007967 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7968 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007969 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007970 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007971 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007972 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007973 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007974 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975}
7976
7977static int
7978formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007979 size_t buflen,
7980 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007982 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007983 if (PyUnicode_Check(v)) {
7984 if (PyUnicode_GET_SIZE(v) != 1)
7985 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007989 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007990 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007991 goto onError;
7992 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994
7995 else {
7996 /* Integer input truncated to a character */
7997 long x;
7998 x = PyInt_AsLong(v);
7999 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008000 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008001#ifdef Py_UNICODE_WIDE
8002 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008003 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008004 "%c arg not in range(0x110000) "
8005 "(wide Python build)");
8006 return -1;
8007 }
8008#else
8009 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008010 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008011 "%c arg not in range(0x10000) "
8012 "(narrow Python build)");
8013 return -1;
8014 }
8015#endif
8016 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 }
8018 buf[1] = '\0';
8019 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008020
8021 onError:
8022 PyErr_SetString(PyExc_TypeError,
8023 "%c requires int or char");
8024 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025}
8026
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so the macro behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8036
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037PyObject *PyUnicode_Format(PyObject *format,
8038 PyObject *args)
8039{
8040 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008041 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 int args_owned = 0;
8043 PyUnicodeObject *result = NULL;
8044 PyObject *dict = NULL;
8045 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008046
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 if (format == NULL || args == NULL) {
8048 PyErr_BadInternalCall();
8049 return NULL;
8050 }
8051 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008052 if (uformat == NULL)
8053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 fmt = PyUnicode_AS_UNICODE(uformat);
8055 fmtcnt = PyUnicode_GET_SIZE(uformat);
8056
8057 reslen = rescnt = fmtcnt + 100;
8058 result = _PyUnicode_New(reslen);
8059 if (result == NULL)
8060 goto onError;
8061 res = PyUnicode_AS_UNICODE(result);
8062
8063 if (PyTuple_Check(args)) {
8064 arglen = PyTuple_Size(args);
8065 argidx = 0;
8066 }
8067 else {
8068 arglen = -1;
8069 argidx = -2;
8070 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008071 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8072 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 dict = args;
8074
8075 while (--fmtcnt >= 0) {
8076 if (*fmt != '%') {
8077 if (--rescnt < 0) {
8078 rescnt = fmtcnt + 100;
8079 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008080 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008081 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8083 --rescnt;
8084 }
8085 *res++ = *fmt++;
8086 }
8087 else {
8088 /* Got a format specifier */
8089 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008090 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 Py_UNICODE c = '\0';
8093 Py_UNICODE fill;
8094 PyObject *v = NULL;
8095 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008096 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008098 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008099 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100
8101 fmt++;
8102 if (*fmt == '(') {
8103 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008104 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 PyObject *key;
8106 int pcount = 1;
8107
8108 if (dict == NULL) {
8109 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008110 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 goto onError;
8112 }
8113 ++fmt;
8114 --fmtcnt;
8115 keystart = fmt;
8116 /* Skip over balanced parentheses */
8117 while (pcount > 0 && --fmtcnt >= 0) {
8118 if (*fmt == ')')
8119 --pcount;
8120 else if (*fmt == '(')
8121 ++pcount;
8122 fmt++;
8123 }
8124 keylen = fmt - keystart - 1;
8125 if (fmtcnt < 0 || pcount > 0) {
8126 PyErr_SetString(PyExc_ValueError,
8127 "incomplete format key");
8128 goto onError;
8129 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008130#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008131 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 then looked up since Python uses strings to hold
8133 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008134 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 key = PyUnicode_EncodeUTF8(keystart,
8136 keylen,
8137 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008138#else
8139 key = PyUnicode_FromUnicode(keystart, keylen);
8140#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141 if (key == NULL)
8142 goto onError;
8143 if (args_owned) {
8144 Py_DECREF(args);
8145 args_owned = 0;
8146 }
8147 args = PyObject_GetItem(dict, key);
8148 Py_DECREF(key);
8149 if (args == NULL) {
8150 goto onError;
8151 }
8152 args_owned = 1;
8153 arglen = -1;
8154 argidx = -2;
8155 }
8156 while (--fmtcnt >= 0) {
8157 switch (c = *fmt++) {
8158 case '-': flags |= F_LJUST; continue;
8159 case '+': flags |= F_SIGN; continue;
8160 case ' ': flags |= F_BLANK; continue;
8161 case '#': flags |= F_ALT; continue;
8162 case '0': flags |= F_ZERO; continue;
8163 }
8164 break;
8165 }
8166 if (c == '*') {
8167 v = getnextarg(args, arglen, &argidx);
8168 if (v == NULL)
8169 goto onError;
8170 if (!PyInt_Check(v)) {
8171 PyErr_SetString(PyExc_TypeError,
8172 "* wants int");
8173 goto onError;
8174 }
8175 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008176 if (width == -1 && PyErr_Occurred())
8177 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 if (width < 0) {
8179 flags |= F_LJUST;
8180 width = -width;
8181 }
8182 if (--fmtcnt >= 0)
8183 c = *fmt++;
8184 }
8185 else if (c >= '0' && c <= '9') {
8186 width = c - '0';
8187 while (--fmtcnt >= 0) {
8188 c = *fmt++;
8189 if (c < '0' || c > '9')
8190 break;
8191 if ((width*10) / 10 != width) {
8192 PyErr_SetString(PyExc_ValueError,
8193 "width too big");
8194 goto onError;
8195 }
8196 width = width*10 + (c - '0');
8197 }
8198 }
8199 if (c == '.') {
8200 prec = 0;
8201 if (--fmtcnt >= 0)
8202 c = *fmt++;
8203 if (c == '*') {
8204 v = getnextarg(args, arglen, &argidx);
8205 if (v == NULL)
8206 goto onError;
8207 if (!PyInt_Check(v)) {
8208 PyErr_SetString(PyExc_TypeError,
8209 "* wants int");
8210 goto onError;
8211 }
8212 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008213 if (prec == -1 && PyErr_Occurred())
8214 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 if (prec < 0)
8216 prec = 0;
8217 if (--fmtcnt >= 0)
8218 c = *fmt++;
8219 }
8220 else if (c >= '0' && c <= '9') {
8221 prec = c - '0';
8222 while (--fmtcnt >= 0) {
8223 c = Py_CHARMASK(*fmt++);
8224 if (c < '0' || c > '9')
8225 break;
8226 if ((prec*10) / 10 != prec) {
8227 PyErr_SetString(PyExc_ValueError,
8228 "prec too big");
8229 goto onError;
8230 }
8231 prec = prec*10 + (c - '0');
8232 }
8233 }
8234 } /* prec */
8235 if (fmtcnt >= 0) {
8236 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 if (--fmtcnt >= 0)
8238 c = *fmt++;
8239 }
8240 }
8241 if (fmtcnt < 0) {
8242 PyErr_SetString(PyExc_ValueError,
8243 "incomplete format");
8244 goto onError;
8245 }
8246 if (c != '%') {
8247 v = getnextarg(args, arglen, &argidx);
8248 if (v == NULL)
8249 goto onError;
8250 }
8251 sign = 0;
8252 fill = ' ';
8253 switch (c) {
8254
8255 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008256 pbuf = formatbuf;
8257 /* presume that buffer length is at least 1 */
8258 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 len = 1;
8260 break;
8261
8262 case 's':
8263 case 'r':
8264 if (PyUnicode_Check(v) && c == 's') {
8265 temp = v;
8266 Py_INCREF(temp);
8267 }
8268 else {
8269 PyObject *unicode;
8270 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008271 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 else
8273 temp = PyObject_Repr(v);
8274 if (temp == NULL)
8275 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008276 if (PyUnicode_Check(temp))
8277 /* nothing to do */;
8278 else if (PyString_Check(temp)) {
8279 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008280 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008282 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008284 Py_DECREF(temp);
8285 temp = unicode;
8286 if (temp == NULL)
8287 goto onError;
8288 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008289 else {
8290 Py_DECREF(temp);
8291 PyErr_SetString(PyExc_TypeError,
8292 "%s argument has non-string str()");
8293 goto onError;
8294 }
8295 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008296 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 len = PyUnicode_GET_SIZE(temp);
8298 if (prec >= 0 && len > prec)
8299 len = prec;
8300 break;
8301
8302 case 'i':
8303 case 'd':
8304 case 'u':
8305 case 'o':
8306 case 'x':
8307 case 'X':
8308 if (c == 'i')
8309 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008310 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008311 temp = formatlong(v, flags, prec, c);
8312 if (!temp)
8313 goto onError;
8314 pbuf = PyUnicode_AS_UNICODE(temp);
8315 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008316 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008318 else {
8319 pbuf = formatbuf;
8320 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8321 flags, prec, c, v);
8322 if (len < 0)
8323 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008324 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008325 }
8326 if (flags & F_ZERO)
8327 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 break;
8329
8330 case 'e':
8331 case 'E':
8332 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008333 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 case 'g':
8335 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008336 if (c == 'F')
8337 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008338 pbuf = formatbuf;
8339 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8340 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 if (len < 0)
8342 goto onError;
8343 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008344 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 fill = '0';
8346 break;
8347
8348 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008349 pbuf = formatbuf;
8350 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 if (len < 0)
8352 goto onError;
8353 break;
8354
8355 default:
8356 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008357 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008358 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008359 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008360 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008361 (Py_ssize_t)(fmt - 1 -
8362 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 goto onError;
8364 }
8365 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008366 if (*pbuf == '-' || *pbuf == '+') {
8367 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 len--;
8369 }
8370 else if (flags & F_SIGN)
8371 sign = '+';
8372 else if (flags & F_BLANK)
8373 sign = ' ';
8374 else
8375 sign = 0;
8376 }
8377 if (width < len)
8378 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008379 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 reslen -= rescnt;
8381 rescnt = width + fmtcnt + 100;
8382 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008383 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008384 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008385 PyErr_NoMemory();
8386 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008387 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008388 if (_PyUnicode_Resize(&result, reslen) < 0) {
8389 Py_XDECREF(temp);
8390 goto onError;
8391 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 res = PyUnicode_AS_UNICODE(result)
8393 + reslen - rescnt;
8394 }
8395 if (sign) {
8396 if (fill != ' ')
8397 *res++ = sign;
8398 rescnt--;
8399 if (width > len)
8400 width--;
8401 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008402 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8403 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008404 assert(pbuf[1] == c);
8405 if (fill != ' ') {
8406 *res++ = *pbuf++;
8407 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008408 }
Tim Petersfff53252001-04-12 18:38:48 +00008409 rescnt -= 2;
8410 width -= 2;
8411 if (width < 0)
8412 width = 0;
8413 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415 if (width > len && !(flags & F_LJUST)) {
8416 do {
8417 --rescnt;
8418 *res++ = fill;
8419 } while (--width > len);
8420 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008421 if (fill == ' ') {
8422 if (sign)
8423 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008424 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008425 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008426 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008427 *res++ = *pbuf++;
8428 *res++ = *pbuf++;
8429 }
8430 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008431 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 res += len;
8433 rescnt -= len;
8434 while (--width >= len) {
8435 --rescnt;
8436 *res++ = ' ';
8437 }
8438 if (dict && (argidx < arglen) && c != '%') {
8439 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008440 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008441 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 goto onError;
8443 }
8444 Py_XDECREF(temp);
8445 } /* '%' */
8446 } /* until end */
8447 if (argidx < arglen && !dict) {
8448 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008449 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 goto onError;
8451 }
8452
Thomas Woutersa96affe2006-03-12 00:29:36 +00008453 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8454 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 if (args_owned) {
8456 Py_DECREF(args);
8457 }
8458 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 return (PyObject *)result;
8460
8461 onError:
8462 Py_XDECREF(result);
8463 Py_DECREF(uformat);
8464 if (args_owned) {
8465 Py_DECREF(args);
8466 }
8467 return NULL;
8468}
8469
8470static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008471 (readbufferproc) unicode_buffer_getreadbuf,
8472 (writebufferproc) unicode_buffer_getwritebuf,
8473 (segcountproc) unicode_buffer_getsegcount,
8474 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475};
8476
Jeremy Hylton938ace62002-07-17 16:30:39 +00008477static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008478unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8479
Tim Peters6d6c1a32001-08-02 04:15:00 +00008480static PyObject *
8481unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8482{
8483 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008484 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008485 char *encoding = NULL;
8486 char *errors = NULL;
8487
Guido van Rossume023fe02001-08-30 03:12:59 +00008488 if (type != &PyUnicode_Type)
8489 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008490 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8491 kwlist, &x, &encoding, &errors))
8492 return NULL;
8493 if (x == NULL)
8494 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008495 if (encoding == NULL && errors == NULL)
8496 return PyObject_Unicode(x);
8497 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008498 return PyUnicode_FromEncodedObject(x, encoding, errors);
8499}
8500
Guido van Rossume023fe02001-08-30 03:12:59 +00008501static PyObject *
8502unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8503{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008504 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008505 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008506
8507 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8508 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8509 if (tmp == NULL)
8510 return NULL;
8511 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008512 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008513 if (pnew == NULL) {
8514 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008515 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008516 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008517 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8518 if (pnew->str == NULL) {
8519 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008520 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008521 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008522 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008523 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008524 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8525 pnew->length = n;
8526 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008527 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008528 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008529}
8530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008531PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008532"unicode(string [, encoding[, errors]]) -> object\n\
8533\n\
8534Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008535encoding defaults to the current default string encoding.\n\
8536errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008537
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008538static PyObject *unicode_iter(PyObject *seq);
8539
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540PyTypeObject PyUnicode_Type = {
8541 PyObject_HEAD_INIT(&PyType_Type)
8542 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008543 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 sizeof(PyUnicodeObject), /* tp_size */
8545 0, /* tp_itemsize */
8546 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008547 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008549 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008551 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008552 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008553 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008555 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 (hashfunc) unicode_hash, /* tp_hash*/
8557 0, /* tp_call*/
8558 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008559 PyObject_GenericGetAttr, /* tp_getattro */
8560 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008562 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8563 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008564 unicode_doc, /* tp_doc */
8565 0, /* tp_traverse */
8566 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008567 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008568 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008569 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008570 0, /* tp_iternext */
8571 unicode_methods, /* tp_methods */
8572 0, /* tp_members */
8573 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008574 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008575 0, /* tp_dict */
8576 0, /* tp_descr_get */
8577 0, /* tp_descr_set */
8578 0, /* tp_dictoffset */
8579 0, /* tp_init */
8580 0, /* tp_alloc */
8581 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008582 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583};
8584
8585/* Initialize the Unicode implementation */
8586
Thomas Wouters78890102000-07-22 19:25:51 +00008587void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008589 int i;
8590
Thomas Wouters477c8d52006-05-27 19:21:47 +00008591 /* XXX - move this array to unicodectype.c ? */
8592 Py_UNICODE linebreak[] = {
8593 0x000A, /* LINE FEED */
8594 0x000D, /* CARRIAGE RETURN */
8595 0x001C, /* FILE SEPARATOR */
8596 0x001D, /* GROUP SEPARATOR */
8597 0x001E, /* RECORD SEPARATOR */
8598 0x0085, /* NEXT LINE */
8599 0x2028, /* LINE SEPARATOR */
8600 0x2029, /* PARAGRAPH SEPARATOR */
8601 };
8602
Fred Drakee4315f52000-05-09 19:53:39 +00008603 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008604 unicode_freelist = NULL;
8605 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008607 if (!unicode_empty)
8608 return;
8609
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008610 for (i = 0; i < 256; i++)
8611 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008612 if (PyType_Ready(&PyUnicode_Type) < 0)
8613 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008614
8615 /* initialize the linebreak bloom filter */
8616 bloom_linebreak = make_bloom_mask(
8617 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8618 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008619
8620 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621}
8622
8623/* Finalize the Unicode implementation */
8624
8625void
Thomas Wouters78890102000-07-22 19:25:51 +00008626_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008628 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008629 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008631 Py_XDECREF(unicode_empty);
8632 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008633
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008634 for (i = 0; i < 256; i++) {
8635 if (unicode_latin1[i]) {
8636 Py_DECREF(unicode_latin1[i]);
8637 unicode_latin1[i] = NULL;
8638 }
8639 }
8640
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008641 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 PyUnicodeObject *v = u;
8643 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008644 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008645 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008646 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008647 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008649 unicode_freelist = NULL;
8650 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008652
Walter Dörwald16807132007-05-25 13:52:07 +00008653void
8654PyUnicode_InternInPlace(PyObject **p)
8655{
8656 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8657 PyObject *t;
8658 if (s == NULL || !PyUnicode_Check(s))
8659 Py_FatalError(
8660 "PyUnicode_InternInPlace: unicode strings only please!");
8661 /* If it's a subclass, we don't really know what putting
8662 it in the interned dict might do. */
8663 if (!PyUnicode_CheckExact(s))
8664 return;
8665 if (PyUnicode_CHECK_INTERNED(s))
8666 return;
8667 if (interned == NULL) {
8668 interned = PyDict_New();
8669 if (interned == NULL) {
8670 PyErr_Clear(); /* Don't leave an exception */
8671 return;
8672 }
8673 }
8674 t = PyDict_GetItem(interned, (PyObject *)s);
8675 if (t) {
8676 Py_INCREF(t);
8677 Py_DECREF(*p);
8678 *p = t;
8679 return;
8680 }
8681
8682 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8683 PyErr_Clear();
8684 return;
8685 }
8686 /* The two references in interned are not counted by refcnt.
8687 The deallocator will take care of this */
8688 s->ob_refcnt -= 2;
8689 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8690}
8691
8692void
8693PyUnicode_InternImmortal(PyObject **p)
8694{
8695 PyUnicode_InternInPlace(p);
8696 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8697 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8698 Py_INCREF(*p);
8699 }
8700}
8701
8702PyObject *
8703PyUnicode_InternFromString(const char *cp)
8704{
8705 PyObject *s = PyUnicode_FromString(cp);
8706 if (s == NULL)
8707 return NULL;
8708 PyUnicode_InternInPlace(&s);
8709 return s;
8710}
8711
8712void _Py_ReleaseInternedUnicodeStrings(void)
8713{
8714 PyObject *keys;
8715 PyUnicodeObject *s;
8716 Py_ssize_t i, n;
8717 Py_ssize_t immortal_size = 0, mortal_size = 0;
8718
8719 if (interned == NULL || !PyDict_Check(interned))
8720 return;
8721 keys = PyDict_Keys(interned);
8722 if (keys == NULL || !PyList_Check(keys)) {
8723 PyErr_Clear();
8724 return;
8725 }
8726
8727 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8728 detector, interned unicode strings are not forcibly deallocated;
8729 rather, we give them their stolen references back, and then clear
8730 and DECREF the interned dict. */
8731
8732 n = PyList_GET_SIZE(keys);
8733 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8734 n);
8735 for (i = 0; i < n; i++) {
8736 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8737 switch (s->state) {
8738 case SSTATE_NOT_INTERNED:
8739 /* XXX Shouldn't happen */
8740 break;
8741 case SSTATE_INTERNED_IMMORTAL:
8742 s->ob_refcnt += 1;
8743 immortal_size += s->length;
8744 break;
8745 case SSTATE_INTERNED_MORTAL:
8746 s->ob_refcnt += 2;
8747 mortal_size += s->length;
8748 break;
8749 default:
8750 Py_FatalError("Inconsistent interned string state.");
8751 }
8752 s->state = SSTATE_NOT_INTERNED;
8753 }
8754 fprintf(stderr, "total size of all interned strings: "
8755 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8756 "mortal/immortal\n", mortal_size, immortal_size);
8757 Py_DECREF(keys);
8758 PyDict_Clear(interned);
8759 Py_DECREF(interned);
8760 interned = NULL;
8761}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008762
8763
8764/********************* Unicode Iterator **************************/
8765
8766typedef struct {
8767 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008768 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008769 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8770} unicodeiterobject;
8771
8772static void
8773unicodeiter_dealloc(unicodeiterobject *it)
8774{
8775 _PyObject_GC_UNTRACK(it);
8776 Py_XDECREF(it->it_seq);
8777 PyObject_GC_Del(it);
8778}
8779
8780static int
8781unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8782{
8783 Py_VISIT(it->it_seq);
8784 return 0;
8785}
8786
8787static PyObject *
8788unicodeiter_next(unicodeiterobject *it)
8789{
8790 PyUnicodeObject *seq;
8791 PyObject *item;
8792
8793 assert(it != NULL);
8794 seq = it->it_seq;
8795 if (seq == NULL)
8796 return NULL;
8797 assert(PyUnicode_Check(seq));
8798
8799 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008800 item = PyUnicode_FromUnicode(
8801 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008802 if (item != NULL)
8803 ++it->it_index;
8804 return item;
8805 }
8806
8807 Py_DECREF(seq);
8808 it->it_seq = NULL;
8809 return NULL;
8810}
8811
8812static PyObject *
8813unicodeiter_len(unicodeiterobject *it)
8814{
8815 Py_ssize_t len = 0;
8816 if (it->it_seq)
8817 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8818 return PyInt_FromSsize_t(len);
8819}
8820
8821PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8822
8823static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008824 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8825 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008826 {NULL, NULL} /* sentinel */
8827};
8828
8829PyTypeObject PyUnicodeIter_Type = {
8830 PyObject_HEAD_INIT(&PyType_Type)
8831 0, /* ob_size */
8832 "unicodeiterator", /* tp_name */
8833 sizeof(unicodeiterobject), /* tp_basicsize */
8834 0, /* tp_itemsize */
8835 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008836 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008837 0, /* tp_print */
8838 0, /* tp_getattr */
8839 0, /* tp_setattr */
8840 0, /* tp_compare */
8841 0, /* tp_repr */
8842 0, /* tp_as_number */
8843 0, /* tp_as_sequence */
8844 0, /* tp_as_mapping */
8845 0, /* tp_hash */
8846 0, /* tp_call */
8847 0, /* tp_str */
8848 PyObject_GenericGetAttr, /* tp_getattro */
8849 0, /* tp_setattro */
8850 0, /* tp_as_buffer */
8851 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8852 0, /* tp_doc */
8853 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8854 0, /* tp_clear */
8855 0, /* tp_richcompare */
8856 0, /* tp_weaklistoffset */
8857 PyObject_SelfIter, /* tp_iter */
8858 (iternextfunc)unicodeiter_next, /* tp_iternext */
8859 unicodeiter_methods, /* tp_methods */
8860 0,
8861};
8862
8863static PyObject *
8864unicode_iter(PyObject *seq)
8865{
8866 unicodeiterobject *it;
8867
8868 if (!PyUnicode_Check(seq)) {
8869 PyErr_BadInternalCall();
8870 return NULL;
8871 }
8872 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8873 if (it == NULL)
8874 return NULL;
8875 it->it_index = 0;
8876 Py_INCREF(seq);
8877 it->it_seq = (PyUnicodeObject *)seq;
8878 _PyObject_GC_TRACK(it);
8879 return (PyObject *)it;
8880}
8881
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008882#ifdef __cplusplus
8883}
8884#endif
8885
8886
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008887/*
8888Local variables:
8889c-basic-offset: 4
8890indent-tabs-mode: nil
8891End:
8892*/