blob: bb4036aa7acd1ee6fbd5f6e6bc0c493b8479c559 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
*/
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask consulted by BLOOM_LINEBREAK(). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The constant is 1UL rather than 1 so that selecting bit 31 does not
   shift into the sign bit of an int (undefined behaviour, and a source
   of sign extension when widened to unsigned long).  mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-test against bloom_linebreak, followed by the exact
   Py_UNICODE_ISLINEBREAK() check.  Note that ch is evaluated twice. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Non-zero if chr passes the BLOOM() pre-test against mask and is then
   found in set by unicode_member().  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators without precedence surprises.  Note that chr is evaluated
   twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000341 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
380/* Internal API for use in unicodeobject.c only ! */
/* Convenience wrapper around PyUnicode_Resize() that casts unicodevar
   to PyObject ** before forwarding it. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000441 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
446 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000447 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
Martin v. Löwis5b222132007-06-10 09:51:05 +0000461 while (size--)
462 *p++ = *u++;
463 /* Don't need to write trailing 0 because
464 that's already done by _PyUnicode_New */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
467 return (PyObject *)unicode;
468}
469
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470PyObject *PyUnicode_FromString(const char *u)
471{
472 size_t size = strlen(u);
473 if (size > PY_SSIZE_T_MAX) {
474 PyErr_SetString(PyExc_OverflowError, "input too long");
475 return NULL;
476 }
477
478 return PyUnicode_FromStringAndSize(u, size);
479}
480
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485{
486 PyUnicodeObject *unicode;
487
488 if (w == NULL) {
489 PyErr_BadInternalCall();
490 return NULL;
491 }
492
493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000500#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 {
502 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000505 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 *u++ = *w++;
507 }
508#endif
509
510 return (PyObject *)unicode;
511}
512
Walter Dörwald346737f2007-05-31 10:44:43 +0000513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516 *fmt++ = '%';
517 if (width) {
518 if (zeropad)
519 *fmt++ = '0';
520 fmt += sprintf(fmt, "%d", width);
521 }
522 if (precision)
523 fmt += sprintf(fmt, ".%d", precision);
524 if (longflag)
525 *fmt++ = 'l';
526 else if (size_tflag) {
527 char *f = PY_FORMAT_SIZE_T;
528 while (*f)
529 *fmt++ = *f++;
530 }
531 *fmt++ = c;
532 *fmt = '\0';
533}
534
/* Append the 0-terminated char string `string` to the output, one element
   per char (the terminator is not copied).  Relies on two variables in
   the enclosing scope: `copy` (a const char * scratch cursor) and `s`
   (the output cursor, advanced past the appended characters). */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000541 Py_ssize_t callcount = 0;
542 PyObject **callresults = NULL;
543 PyObject **callresult;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000545 int width = 0;
546 int precision = 0;
547 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 const char* f;
549 Py_UNICODE *s;
550 PyObject *string;
551 /* used by sprintf */
552 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000553 /* use abuffer instead of buffer, if we need more space
554 * (which can happen if there's a format specifier with width). */
555 char *abuffer = NULL;
556 char *realbuffer;
557 Py_ssize_t abuffersize = 0;
558 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559 const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562 Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef __va_copy
565 __va_copy(count, vargs);
566#else
567 count = vargs;
568#endif
569#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 /* step 1: count the number of %S/%R format specifications
571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 ++callcount;
576 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 /* step 2: allocate memory for the results of
578 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 if (callcount) {
580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581 if (!callresults) {
582 PyErr_NoMemory();
583 return NULL;
584 }
585 callresult = callresults;
586 }
587 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 for (f = format; *f; f++) {
589 if (*f == '%') {
590 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000591 width = 0;
592 while (isdigit(Py_CHARMASK(*f)))
593 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595 ;
596
597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598 * they don't affect the amount of space we reserve.
599 */
600 if ((*f == 'l' || *f == 'z') &&
601 (f[1] == 'd' || f[1] == 'u'))
602 ++f;
603
604 switch (*f) {
605 case 'c':
606 (void)va_arg(count, int);
607 /* fall through... */
608 case '%':
609 n++;
610 break;
611 case 'd': case 'u': case 'i': case 'x':
612 (void) va_arg(count, int);
613 /* 20 bytes is enough to hold a 64-bit
614 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000615 This isn't enough for octal.
616 If a width is specified we need more
617 (which we allocate later). */
618 if (width < 20)
619 width = 20;
620 n += width;
621 if (abuffersize < width)
622 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000623 break;
624 case 's':
625 n += strlen(va_arg(count, char*));
626 break;
627 case 'U':
628 {
629 PyObject *obj = va_arg(count, PyObject *);
630 assert(obj && PyUnicode_Check(obj));
631 n += PyUnicode_GET_SIZE(obj);
632 break;
633 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000634 case 'V':
635 {
636 PyObject *obj = va_arg(count, PyObject *);
637 const char *str = va_arg(count, const char *);
638 assert(obj || str);
639 assert(!obj || PyUnicode_Check(obj));
640 if (obj)
641 n += PyUnicode_GET_SIZE(obj);
642 else
643 n += strlen(str);
644 break;
645 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000646 case 'S':
647 {
648 PyObject *obj = va_arg(count, PyObject *);
649 PyObject *str;
650 assert(obj);
651 str = PyObject_Unicode(obj);
652 if (!str)
653 goto fail;
654 n += PyUnicode_GET_SIZE(str);
655 /* Remember the str and switch to the next slot */
656 *callresult++ = str;
657 break;
658 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 case 'R':
660 {
661 PyObject *obj = va_arg(count, PyObject *);
662 PyObject *repr;
663 assert(obj);
664 repr = PyObject_Repr(obj);
665 if (!repr)
666 goto fail;
667 n += PyUnicode_GET_SIZE(repr);
668 /* Remember the repr and switch to the next slot */
669 *callresult++ = repr;
670 break;
671 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000672 case 'p':
673 (void) va_arg(count, int);
674 /* maximum 64-bit pointer representation:
675 * 0xffffffffffffffff
676 * so 19 characters is enough.
677 * XXX I count 18 -- what's the extra for?
678 */
679 n += 19;
680 break;
681 default:
682 /* if we stumble upon an unknown
683 formatting code, copy the rest of
684 the format string to the output
685 string. (we cannot just skip the
686 code, since there's no way to know
687 what's in the argument list) */
688 n += strlen(p);
689 goto expand;
690 }
691 } else
692 n++;
693 }
694 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 if (abuffersize > 20) {
696 abuffer = PyMem_Malloc(abuffersize);
697 if (!abuffer) {
698 PyErr_NoMemory();
699 goto fail;
700 }
701 realbuffer = abuffer;
702 }
703 else
704 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000705 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000707 we don't have to resize the string.
708 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 string = PyUnicode_FromUnicode(NULL, n);
710 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000711 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000714 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f++;
719 int longflag = 0;
720 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 zeropad = (*f == '0');
722 /* parse the width.precision part */
723 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 width = (width*10) + *f++ - '0';
726 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 if (*f == '.') {
728 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000732 /* handle the long flag, but only for %ld and %lu.
733 others can be added when necessary. */
734 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
735 longflag = 1;
736 ++f;
737 }
738 /* handle the size_t flag. */
739 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
740 size_tflag = 1;
741 ++f;
742 }
743
744 switch (*f) {
745 case 'c':
746 *s++ = va_arg(vargs, int);
747 break;
748 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000753 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 sprintf(realbuffer, fmt, va_arg(vargs, int));
756 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 break;
758 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000763 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
766 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 break;
768 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000769 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
770 sprintf(realbuffer, fmt, va_arg(vargs, int));
771 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000772 break;
773 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000774 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
775 sprintf(realbuffer, fmt, va_arg(vargs, int));
776 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000777 break;
778 case 's':
779 p = va_arg(vargs, char*);
780 appendstring(p);
781 break;
782 case 'U':
783 {
784 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000785 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
786 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
787 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 break;
789 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000790 case 'V':
791 {
792 PyObject *obj = va_arg(vargs, PyObject *);
793 const char *str = va_arg(vargs, const char *);
794 if (obj) {
795 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
796 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
797 s += size;
798 } else {
799 appendstring(str);
800 }
801 break;
802 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000803 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000804 case 'R':
805 {
806 /* unused, since we already have the result */
807 (void) va_arg(vargs, PyObject *);
808 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(*callresult);
809 Py_ssize_t usize = PyUnicode_GET_SIZE(*callresult);
810 Py_ssize_t upos;
811 for (upos = 0; upos<usize;)
812 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000813 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000814 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000815 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 ++callresult;
817 break;
818 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819 case 'p':
820 sprintf(buffer, "%p", va_arg(vargs, void*));
821 /* %p is ill-defined: ensure leading 0x. */
822 if (buffer[1] == 'X')
823 buffer[1] = 'x';
824 else if (buffer[1] != 'x') {
825 memmove(buffer+2, buffer, strlen(buffer)+1);
826 buffer[0] = '0';
827 buffer[1] = 'x';
828 }
829 appendstring(buffer);
830 break;
831 case '%':
832 *s++ = '%';
833 break;
834 default:
835 appendstring(p);
836 goto end;
837 }
838 } else
839 *s++ = *f;
840 }
841
842 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000843 if (callresults)
844 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000845 if (abuffer)
846 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000847 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
848 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000849 fail:
850 if (callresults) {
851 PyObject **callresult2 = callresults;
852 while (callresult2 <= callresult) {
853 Py_DECREF(*callresult2);
854 ++callresult2;
855 }
856 PyMem_Free(callresults);
857 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000858 if (abuffer)
859 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000860 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861}
862
863#undef appendstring
864
865PyObject *
866PyUnicode_FromFormat(const char *format, ...)
867{
868 PyObject* ret;
869 va_list vargs;
870
871#ifdef HAVE_STDARG_PROTOTYPES
872 va_start(vargs, format);
873#else
874 va_start(vargs);
875#endif
876 ret = PyUnicode_FromFormatV(format, vargs);
877 va_end(vargs);
878 return ret;
879}
880
Martin v. Löwis18e16552006-02-15 17:27:45 +0000881Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
882 wchar_t *w,
883 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884{
885 if (unicode == NULL) {
886 PyErr_BadInternalCall();
887 return -1;
888 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000889
890 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000891 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000892 size = PyUnicode_GET_SIZE(unicode) + 1;
893
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894#ifdef HAVE_USABLE_WCHAR_T
895 memcpy(w, unicode->str, size * sizeof(wchar_t));
896#else
897 {
898 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000899 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000901 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902 *w++ = *u++;
903 }
904#endif
905
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000906 if (size > PyUnicode_GET_SIZE(unicode))
907 return PyUnicode_GET_SIZE(unicode);
908 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000909 return size;
910}
911
912#endif
913
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000914PyObject *PyUnicode_FromOrdinal(int ordinal)
915{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000916 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000917
918#ifdef Py_UNICODE_WIDE
919 if (ordinal < 0 || ordinal > 0x10ffff) {
920 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000921 "chr() arg not in range(0x110000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000922 "(wide Python build)");
923 return NULL;
924 }
925#else
926 if (ordinal < 0 || ordinal > 0xffff) {
927 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000928 "chr() arg not in range(0x10000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000929 "(narrow Python build)");
930 return NULL;
931 }
932#endif
933
Hye-Shik Chang40574832004-04-06 07:24:51 +0000934 s[0] = (Py_UNICODE)ordinal;
935 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000936}
937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938PyObject *PyUnicode_FromObject(register PyObject *obj)
939{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000940 /* XXX Perhaps we should make this API an alias of
941 PyObject_Unicode() instead ?! */
942 if (PyUnicode_CheckExact(obj)) {
943 Py_INCREF(obj);
944 return obj;
945 }
946 if (PyUnicode_Check(obj)) {
947 /* For a Unicode subtype that's not a Unicode object,
948 return a true Unicode object with the same data. */
949 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
950 PyUnicode_GET_SIZE(obj));
951 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000952 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
953}
954
955PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
956 const char *encoding,
957 const char *errors)
958{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000959 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000961 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 if (obj == NULL) {
964 PyErr_BadInternalCall();
965 return NULL;
966 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000967
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000968#if 0
969 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000970 that no encodings is given and then redirect to
971 PyObject_Unicode() which then applies the additional logic for
972 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000973
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000974 NOTE: This API should really only be used for object which
975 represent *encoded* Unicode !
976
977 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 if (PyUnicode_Check(obj)) {
979 if (encoding) {
980 PyErr_SetString(PyExc_TypeError,
981 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000982 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000985 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000986#else
987 if (PyUnicode_Check(obj)) {
988 PyErr_SetString(PyExc_TypeError,
989 "decoding Unicode is not supported");
990 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000991 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000992#endif
993
994 /* Coerce object */
995 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000996 s = PyString_AS_STRING(obj);
997 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000998 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000999 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1000 /* Overwrite the error message with something more useful in
1001 case of a TypeError. */
1002 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001003 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001004 "coercing to Unicode: need string or buffer, "
1005 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001006 obj->ob_type->tp_name);
1007 goto onError;
1008 }
Tim Petersced69f82003-09-16 20:30:58 +00001009
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 if (len == 0) {
1012 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 }
Tim Petersced69f82003-09-16 20:30:58 +00001015 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001017
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 return v;
1019
1020 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022}
1023
1024PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001025 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 const char *encoding,
1027 const char *errors)
1028{
1029 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001030
1031 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001032 encoding = PyUnicode_GetDefaultEncoding();
1033
1034 /* Shortcuts for common default encodings */
1035 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001037 else if (strcmp(encoding, "latin-1") == 0)
1038 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001039#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1040 else if (strcmp(encoding, "mbcs") == 0)
1041 return PyUnicode_DecodeMBCS(s, size, errors);
1042#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001043 else if (strcmp(encoding, "ascii") == 0)
1044 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
1046 /* Decode via the codec registry */
1047 buffer = PyBuffer_FromMemory((void *)s, size);
1048 if (buffer == NULL)
1049 goto onError;
1050 unicode = PyCodec_Decode(buffer, encoding, errors);
1051 if (unicode == NULL)
1052 goto onError;
1053 if (!PyUnicode_Check(unicode)) {
1054 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001055 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 unicode->ob_type->tp_name);
1057 Py_DECREF(unicode);
1058 goto onError;
1059 }
1060 Py_DECREF(buffer);
1061 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063 onError:
1064 Py_XDECREF(buffer);
1065 return NULL;
1066}
1067
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001068PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1069 const char *encoding,
1070 const char *errors)
1071{
1072 PyObject *v;
1073
1074 if (!PyUnicode_Check(unicode)) {
1075 PyErr_BadArgument();
1076 goto onError;
1077 }
1078
1079 if (encoding == NULL)
1080 encoding = PyUnicode_GetDefaultEncoding();
1081
1082 /* Decode via the codec registry */
1083 v = PyCodec_Decode(unicode, encoding, errors);
1084 if (v == NULL)
1085 goto onError;
1086 return v;
1087
1088 onError:
1089 return NULL;
1090}
1091
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 const char *encoding,
1095 const char *errors)
1096{
1097 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001098
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 unicode = PyUnicode_FromUnicode(s, size);
1100 if (unicode == NULL)
1101 return NULL;
1102 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1103 Py_DECREF(unicode);
1104 return v;
1105}
1106
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001107PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1108 const char *encoding,
1109 const char *errors)
1110{
1111 PyObject *v;
1112
1113 if (!PyUnicode_Check(unicode)) {
1114 PyErr_BadArgument();
1115 goto onError;
1116 }
1117
1118 if (encoding == NULL)
1119 encoding = PyUnicode_GetDefaultEncoding();
1120
1121 /* Encode via the codec registry */
1122 v = PyCodec_Encode(unicode, encoding, errors);
1123 if (v == NULL)
1124 goto onError;
1125 return v;
1126
1127 onError:
1128 return NULL;
1129}
1130
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1132 const char *encoding,
1133 const char *errors)
1134{
1135 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 if (!PyUnicode_Check(unicode)) {
1138 PyErr_BadArgument();
1139 goto onError;
1140 }
Fred Drakee4315f52000-05-09 19:53:39 +00001141
Tim Petersced69f82003-09-16 20:30:58 +00001142 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001143 encoding = PyUnicode_GetDefaultEncoding();
1144
1145 /* Shortcuts for common default encodings */
1146 if (errors == NULL) {
1147 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001148 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001149 else if (strcmp(encoding, "latin-1") == 0)
1150 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001151#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1152 else if (strcmp(encoding, "mbcs") == 0)
1153 return PyUnicode_AsMBCSString(unicode);
1154#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001155 else if (strcmp(encoding, "ascii") == 0)
1156 return PyUnicode_AsASCIIString(unicode);
1157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158
1159 /* Encode via the codec registry */
1160 v = PyCodec_Encode(unicode, encoding, errors);
1161 if (v == NULL)
1162 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001163 if (!PyBytes_Check(v)) {
1164 if (PyString_Check(v)) {
1165 /* Old codec, turn it into bytes */
1166 PyObject *b = PyBytes_FromObject(v);
1167 Py_DECREF(v);
1168 return b;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001171 "encoder did not return a bytes object "
1172 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1173 v->ob_type->tp_name,
1174 encoding ? encoding : "NULL",
1175 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 Py_DECREF(v);
1177 goto onError;
1178 }
1179 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001180
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 onError:
1182 return NULL;
1183}
1184
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001185PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1186 const char *errors)
1187{
1188 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001189 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001190 if (v)
1191 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001192 if (errors != NULL)
1193 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1194 if (errors == NULL) {
1195 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1196 PyUnicode_GET_SIZE(unicode),
1197 NULL);
1198 }
1199 else {
1200 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1201 }
1202 if (!b)
1203 return NULL;
1204 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1205 PyBytes_Size(b));
1206 Py_DECREF(b);
1207 if (!errors) {
1208 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001209 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001210 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001211 return v;
1212}
1213
Martin v. Löwis5b222132007-06-10 09:51:05 +00001214char*
1215PyUnicode_AsString(PyObject *unicode)
1216{
1217 assert(PyUnicode_Check(unicode));
1218 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1219 if (!unicode)
1220 return NULL;
1221 return PyString_AsString(unicode);
1222}
1223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1225{
1226 if (!PyUnicode_Check(unicode)) {
1227 PyErr_BadArgument();
1228 goto onError;
1229 }
1230 return PyUnicode_AS_UNICODE(unicode);
1231
1232 onError:
1233 return NULL;
1234}
1235
Martin v. Löwis18e16552006-02-15 17:27:45 +00001236Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237{
1238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 goto onError;
1241 }
1242 return PyUnicode_GET_SIZE(unicode);
1243
1244 onError:
1245 return -1;
1246}
1247
Thomas Wouters78890102000-07-22 19:25:51 +00001248const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001249{
1250 return unicode_default_encoding;
1251}
1252
1253int PyUnicode_SetDefaultEncoding(const char *encoding)
1254{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001255 if (strcmp(encoding, unicode_default_encoding) != 0) {
1256 PyErr_Format(PyExc_ValueError,
1257 "Can only set default encoding to %s",
1258 unicode_default_encoding);
1259 return -1;
1260 }
Fred Drakee4315f52000-05-09 19:53:39 +00001261 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001262}
1263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001264/* error handling callback helper:
1265 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001266 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 and adjust various state variables.
1268 return 0 on success, -1 on error
1269*/
1270
1271static
1272int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1273 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1275 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001277 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278
1279 PyObject *restuple = NULL;
1280 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001281 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1282 Py_ssize_t requiredsize;
1283 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 int res = -1;
1287
1288 if (*errorHandler == NULL) {
1289 *errorHandler = PyCodec_LookupError(errors);
1290 if (*errorHandler == NULL)
1291 goto onError;
1292 }
1293
1294 if (*exceptionObject == NULL) {
1295 *exceptionObject = PyUnicodeDecodeError_Create(
1296 encoding, input, insize, *startinpos, *endinpos, reason);
1297 if (*exceptionObject == NULL)
1298 goto onError;
1299 }
1300 else {
1301 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1302 goto onError;
1303 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1304 goto onError;
1305 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1306 goto onError;
1307 }
1308
1309 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1310 if (restuple == NULL)
1311 goto onError;
1312 if (!PyTuple_Check(restuple)) {
1313 PyErr_Format(PyExc_TypeError, &argparse[4]);
1314 goto onError;
1315 }
1316 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1317 goto onError;
1318 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001319 newpos = insize+newpos;
1320 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001321 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001322 goto onError;
1323 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001324
1325 /* need more space? (at least enough for what we
1326 have+the replacement+the rest of the string (starting
1327 at the new input position), so we won't have to check space
1328 when there are no errors in the rest of the string) */
1329 repptr = PyUnicode_AS_UNICODE(repunicode);
1330 repsize = PyUnicode_GET_SIZE(repunicode);
1331 requiredsize = *outpos + repsize + insize-newpos;
1332 if (requiredsize > outsize) {
1333 if (requiredsize<2*outsize)
1334 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001335 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001336 goto onError;
1337 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1338 }
1339 *endinpos = newpos;
1340 *inptr = input + newpos;
1341 Py_UNICODE_COPY(*outptr, repptr, repsize);
1342 *outptr += repsize;
1343 *outpos += repsize;
1344 /* we made it! */
1345 res = 0;
1346
1347 onError:
1348 Py_XDECREF(restuple);
1349 return res;
1350}
1351
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001352/* --- UTF-7 Codec -------------------------------------------------------- */
1353
1354/* see RFC2152 for details */
1355
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* SPECIAL(c, encodeO, encodeWS): true when character c must not appear
   directly in UTF-7 text.  Anything outside 1..127 is special; class 2
   (whitespace) and class 3 (Set O) only count when the matching flag is
   non-zero.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.
   Explicit ASCII range checks are used instead of isalnum(): c may be a
   Py_UNICODE value far outside the unsigned char range (undefined
   behaviour for <ctype.h> functions), and isalnum() is locale dependent
   whereas the base64 alphabet is not. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c.  Only meaningful when
   B64CHAR(c) is true. */
#define UB64(c)  \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch, emit
   the topmost six as a base64 digit through out and decrease bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the topmost sixteen as one code unit.  Units in
   0xDC00..0xDFFF are reported as an error; this relies on an `errmsg`
   variable and a `utf7Error` label at the expansion site. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001417
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001418PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001419 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420 const char *errors)
1421{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t startinpos;
1424 Py_ssize_t endinpos;
1425 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001426 const char *e;
1427 PyUnicodeObject *unicode;
1428 Py_UNICODE *p;
1429 const char *errmsg = "";
1430 int inShift = 0;
1431 unsigned int bitsleft = 0;
1432 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433 int surrogate = 0;
1434 PyObject *errorHandler = NULL;
1435 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001436
1437 unicode = _PyUnicode_New(size);
1438 if (!unicode)
1439 return NULL;
1440 if (size == 0)
1441 return (PyObject *)unicode;
1442
1443 p = unicode->str;
1444 e = s + size;
1445
1446 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447 Py_UNICODE ch;
1448 restart:
1449 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001450
1451 if (inShift) {
1452 if ((ch == '-') || !B64CHAR(ch)) {
1453 inShift = 0;
1454 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001455
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001456 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1457 if (bitsleft >= 6) {
1458 /* The shift sequence has a partial character in it. If
1459 bitsleft < 6 then we could just classify it as padding
1460 but that is not the case here */
1461
1462 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001463 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001464 }
1465 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001466 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467 here so indicate the potential of a misencoded character. */
1468
1469 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1470 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1471 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001472 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001473 }
1474
1475 if (ch == '-') {
1476 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001477 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478 inShift = 1;
1479 }
1480 } else if (SPECIAL(ch,0,0)) {
1481 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001482 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001483 } else {
1484 *p++ = ch;
1485 }
1486 } else {
1487 charsleft = (charsleft << 6) | UB64(ch);
1488 bitsleft += 6;
1489 s++;
1490 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1491 }
1492 }
1493 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001495 s++;
1496 if (s < e && *s == '-') {
1497 s++;
1498 *p++ = '+';
1499 } else
1500 {
1501 inShift = 1;
1502 bitsleft = 0;
1503 }
1504 }
1505 else if (SPECIAL(ch,0,0)) {
1506 errmsg = "unexpected special character";
1507 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001508 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509 }
1510 else {
1511 *p++ = ch;
1512 s++;
1513 }
1514 continue;
1515 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516 outpos = p-PyUnicode_AS_UNICODE(unicode);
1517 endinpos = s-starts;
1518 if (unicode_decode_call_errorhandler(
1519 errors, &errorHandler,
1520 "utf7", errmsg,
1521 starts, size, &startinpos, &endinpos, &exc, &s,
1522 (PyObject **)&unicode, &outpos, &p))
1523 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001524 }
1525
1526 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527 outpos = p-PyUnicode_AS_UNICODE(unicode);
1528 endinpos = size;
1529 if (unicode_decode_call_errorhandler(
1530 errors, &errorHandler,
1531 "utf7", "unterminated shift sequence",
1532 starts, size, &startinpos, &endinpos, &exc, &s,
1533 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001535 if (s < e)
1536 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001537 }
1538
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001539 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001540 goto onError;
1541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542 Py_XDECREF(errorHandler);
1543 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544 return (PyObject *)unicode;
1545
1546onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_XDECREF(errorHandler);
1548 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 Py_DECREF(unicode);
1550 return NULL;
1551}
1552
1553
1554PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001555 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001556 int encodeSetO,
1557 int encodeWhiteSpace,
1558 const char *errors)
1559{
1560 PyObject *v;
1561 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001564 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 unsigned int bitsleft = 0;
1566 unsigned long charsleft = 0;
1567 char * out;
1568 char * start;
1569
1570 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001571 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572
Walter Dörwald51ab4142007-05-05 14:43:36 +00001573 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001574 if (v == NULL)
1575 return NULL;
1576
Walter Dörwald51ab4142007-05-05 14:43:36 +00001577 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 for (;i < size; ++i) {
1579 Py_UNICODE ch = s[i];
1580
1581 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001582 if (ch == '+') {
1583 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584 *out++ = '-';
1585 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1586 charsleft = ch;
1587 bitsleft = 16;
1588 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001589 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001591 } else {
1592 *out++ = (char) ch;
1593 }
1594 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1596 *out++ = B64(charsleft << (6-bitsleft));
1597 charsleft = 0;
1598 bitsleft = 0;
1599 /* Characters not in the BASE64 set implicitly unshift the sequence
1600 so no '-' is required, except if the character is itself a '-' */
1601 if (B64CHAR(ch) || ch == '-') {
1602 *out++ = '-';
1603 }
1604 inShift = 0;
1605 *out++ = (char) ch;
1606 } else {
1607 bitsleft += 16;
1608 charsleft = (charsleft << 16) | ch;
1609 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1610
1611 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001612 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613 or '-' then the shift sequence will be terminated implicitly and we
1614 don't have to insert a '-'. */
1615
1616 if (bitsleft == 0) {
1617 if (i + 1 < size) {
1618 Py_UNICODE ch2 = s[i+1];
1619
1620 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001621
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 } else if (B64CHAR(ch2) || ch2 == '-') {
1623 *out++ = '-';
1624 inShift = 0;
1625 } else {
1626 inShift = 0;
1627 }
1628
1629 }
1630 else {
1631 *out++ = '-';
1632 inShift = 0;
1633 }
1634 }
Tim Petersced69f82003-09-16 20:30:58 +00001635 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 if (bitsleft) {
1639 *out++= B64(charsleft << (6-bitsleft) );
1640 *out++ = '-';
1641 }
1642
Walter Dörwald51ab4142007-05-05 14:43:36 +00001643 if (PyBytes_Resize(v, out - start)) {
1644 Py_DECREF(v);
1645 return NULL;
1646 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 return v;
1648}
1649
1650#undef SPECIAL
1651#undef B64
1652#undef B64CHAR
1653#undef UB64
1654#undef ENCODE
1655#undef DECODE
1656
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657/* --- UTF-8 Codec -------------------------------------------------------- */
1658
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length of the sequence that byte introduces.  An entry of zero
   marks a byte that is illegal as a prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: never valid as the first byte of a sequence */
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1680
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001682 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 const char *errors)
1684{
Walter Dörwald69652032004-09-07 20:24:22 +00001685 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1686}
1687
1688PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001690 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001692{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001693 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001695 Py_ssize_t startinpos;
1696 Py_ssize_t endinpos;
1697 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 const char *e;
1699 PyUnicodeObject *unicode;
1700 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001701 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 PyObject *errorHandler = NULL;
1703 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704
1705 /* Note: size will always be longer than the resulting Unicode
1706 character count */
1707 unicode = _PyUnicode_New(size);
1708 if (!unicode)
1709 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001710 if (size == 0) {
1711 if (consumed)
1712 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
1716 /* Unpack UTF-8 encoded data */
1717 p = unicode->str;
1718 e = s + size;
1719
1720 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001721 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722
1723 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001724 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 s++;
1726 continue;
1727 }
1728
1729 n = utf8_code_length[ch];
1730
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001731 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001732 if (consumed)
1733 break;
1734 else {
1735 errmsg = "unexpected end of data";
1736 startinpos = s-starts;
1737 endinpos = size;
1738 goto utf8Error;
1739 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
1742 switch (n) {
1743
1744 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001745 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 startinpos = s-starts;
1747 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001748 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749
1750 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001751 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 startinpos = s-starts;
1753 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
1756 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 if ((s[1] & 0xc0) != 0x80) {
1758 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 startinpos = s-starts;
1760 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001761 goto utf8Error;
1762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001764 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001765 startinpos = s-starts;
1766 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 errmsg = "illegal encoding";
1768 goto utf8Error;
1769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 break;
1773
1774 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001775 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001776 (s[2] & 0xc0) != 0x80) {
1777 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 startinpos = s-starts;
1779 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 goto utf8Error;
1781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001783 if (ch < 0x0800) {
1784 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001785 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001786
1787 XXX For wide builds (UCS-4) we should probably try
1788 to recombine the surrogates into a single code
1789 unit.
1790 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001791 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 startinpos = s-starts;
1793 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 goto utf8Error;
1795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001797 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001798 break;
1799
1800 case 4:
1801 if ((s[1] & 0xc0) != 0x80 ||
1802 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001803 (s[3] & 0xc0) != 0x80) {
1804 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 startinpos = s-starts;
1806 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 goto utf8Error;
1808 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001809 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1810 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1811 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001812 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001813 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001815 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001816 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
1819 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 goto utf8Error;
1821 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001822#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001823 *p++ = (Py_UNICODE)ch;
1824#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001825 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001826
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001827 /* translate from 10000..10FFFF to 0..FFFF */
1828 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001829
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001830 /* high surrogate = top 10 bits added to D800 */
1831 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001832
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001833 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001834 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001835#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 break;
1837
1838 default:
1839 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 startinpos = s-starts;
1842 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001843 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
1845 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001847
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 outpos = p-PyUnicode_AS_UNICODE(unicode);
1850 if (unicode_decode_call_errorhandler(
1851 errors, &errorHandler,
1852 "utf8", errmsg,
1853 starts, size, &startinpos, &endinpos, &exc, &s,
1854 (PyObject **)&unicode, &outpos, &p))
1855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 }
Walter Dörwald69652032004-09-07 20:24:22 +00001857 if (consumed)
1858 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859
1860 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001861 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 goto onError;
1863
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 Py_XDECREF(errorHandler);
1865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 return (PyObject *)unicode;
1867
1868onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 Py_XDECREF(errorHandler);
1870 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 Py_DECREF(unicode);
1872 return NULL;
1873}
1874
Tim Peters602f7402002-04-27 18:03:26 +00001875/* Allocation strategy: if the string is short, convert into a stack buffer
1876 and allocate exactly as much space needed at the end. Else allocate the
1877 maximum possible needed (4 result bytes per Unicode character), and return
1878 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001879*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001880PyObject *
1881PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001882 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001883 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884{
Tim Peters602f7402002-04-27 18:03:26 +00001885#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001886
Martin v. Löwis18e16552006-02-15 17:27:45 +00001887 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001888 PyObject *v; /* result string object */
1889 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001890 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001891 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001892 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001893
Tim Peters602f7402002-04-27 18:03:26 +00001894 assert(s != NULL);
1895 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896
Tim Peters602f7402002-04-27 18:03:26 +00001897 if (size <= MAX_SHORT_UNICHARS) {
1898 /* Write into the stack buffer; nallocated can't overflow.
1899 * At the end, we'll allocate exactly as much heap space as it
1900 * turns out we need.
1901 */
1902 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1903 v = NULL; /* will allocate after we're done */
1904 p = stackbuf;
1905 }
1906 else {
1907 /* Overallocate on the heap, and give the excess back at the end. */
1908 nallocated = size * 4;
1909 if (nallocated / 4 != size) /* overflow! */
1910 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001911 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001912 if (v == NULL)
1913 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001914 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001915 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001916
Tim Peters602f7402002-04-27 18:03:26 +00001917 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001918 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001919
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001920 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001921 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001923
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001925 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001926 *p++ = (char)(0xc0 | (ch >> 6));
1927 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001928 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001929 else {
Tim Peters602f7402002-04-27 18:03:26 +00001930 /* Encode UCS2 Unicode ordinals */
1931 if (ch < 0x10000) {
1932 /* Special case: check for high surrogate */
1933 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1934 Py_UCS4 ch2 = s[i];
1935 /* Check for low surrogate and combine the two to
1936 form a UCS4 value */
1937 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001938 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001939 i++;
1940 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001941 }
Tim Peters602f7402002-04-27 18:03:26 +00001942 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001943 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001944 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001945 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1946 *p++ = (char)(0x80 | (ch & 0x3f));
1947 continue;
1948 }
1949encodeUCS4:
1950 /* Encode UCS4 Unicode ordinals */
1951 *p++ = (char)(0xf0 | (ch >> 18));
1952 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1953 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1954 *p++ = (char)(0x80 | (ch & 0x3f));
1955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001957
Tim Peters602f7402002-04-27 18:03:26 +00001958 if (v == NULL) {
1959 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001960 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001961 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001962 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001963 }
1964 else {
1965 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001966 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001967 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001968 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001971
Tim Peters602f7402002-04-27 18:03:26 +00001972#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973}
1974
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1976{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 if (!PyUnicode_Check(unicode)) {
1978 PyErr_BadArgument();
1979 return NULL;
1980 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001981 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1982 PyUnicode_GET_SIZE(unicode),
1983 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984}
1985
1986/* --- UTF-16 Codec ------------------------------------------------------- */
1987
Tim Peters772747b2001-08-09 22:21:55 +00001988PyObject *
1989PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001990 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001991 const char *errors,
1992 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993{
Walter Dörwald69652032004-09-07 20:24:22 +00001994 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1995}
1996
1997PyObject *
1998PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001999 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002000 const char *errors,
2001 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002002 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002003{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002005 Py_ssize_t startinpos;
2006 Py_ssize_t endinpos;
2007 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 PyUnicodeObject *unicode;
2009 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002010 const unsigned char *q, *e;
2011 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002012 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002013 /* Offsets from q for retrieving byte pairs in the right order. */
2014#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2015 int ihi = 1, ilo = 0;
2016#else
2017 int ihi = 0, ilo = 1;
2018#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 PyObject *errorHandler = NULL;
2020 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021
2022 /* Note: size will always be longer than the resulting Unicode
2023 character count */
2024 unicode = _PyUnicode_New(size);
2025 if (!unicode)
2026 return NULL;
2027 if (size == 0)
2028 return (PyObject *)unicode;
2029
2030 /* Unpack UTF-16 encoded data */
2031 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002032 q = (unsigned char *)s;
2033 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
2035 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002036 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002038 /* Check for BOM marks (U+FEFF) in the input and adjust current
2039 byte order setting accordingly. In native mode, the leading BOM
2040 mark is skipped, in all other modes, it is copied to the output
2041 stream as-is (giving a ZWNBSP character). */
2042 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002043 if (size >= 2) {
2044 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002045#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002046 if (bom == 0xFEFF) {
2047 q += 2;
2048 bo = -1;
2049 }
2050 else if (bom == 0xFFFE) {
2051 q += 2;
2052 bo = 1;
2053 }
Tim Petersced69f82003-09-16 20:30:58 +00002054#else
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (bom == 0xFEFF) {
2056 q += 2;
2057 bo = 1;
2058 }
2059 else if (bom == 0xFFFE) {
2060 q += 2;
2061 bo = -1;
2062 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002063#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002064 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066
Tim Peters772747b2001-08-09 22:21:55 +00002067 if (bo == -1) {
2068 /* force LE */
2069 ihi = 1;
2070 ilo = 0;
2071 }
2072 else if (bo == 1) {
2073 /* force BE */
2074 ihi = 0;
2075 ilo = 1;
2076 }
2077
2078 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002080 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002082 if (consumed)
2083 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 errmsg = "truncated data";
2085 startinpos = ((const char *)q)-starts;
2086 endinpos = ((const char *)e)-starts;
2087 goto utf16Error;
2088 /* The remaining input chars are ignored if the callback
2089 chooses to skip the input */
2090 }
2091 ch = (q[ihi] << 8) | q[ilo];
2092
Tim Peters772747b2001-08-09 22:21:55 +00002093 q += 2;
2094
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 if (ch < 0xD800 || ch > 0xDFFF) {
2096 *p++ = ch;
2097 continue;
2098 }
2099
2100 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002101 if (q >= e) {
2102 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 startinpos = (((const char *)q)-2)-starts;
2104 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002105 goto utf16Error;
2106 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002107 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002108 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2109 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002110 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002111#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002112 *p++ = ch;
2113 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002114#else
2115 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002117 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002118 }
2119 else {
2120 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 startinpos = (((const char *)q)-4)-starts;
2122 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002123 goto utf16Error;
2124 }
2125
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002127 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128 startinpos = (((const char *)q)-2)-starts;
2129 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002130 /* Fall through to report the error */
2131
2132 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 outpos = p-PyUnicode_AS_UNICODE(unicode);
2134 if (unicode_decode_call_errorhandler(
2135 errors, &errorHandler,
2136 "utf16", errmsg,
2137 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2138 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002139 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 }
2141
2142 if (byteorder)
2143 *byteorder = bo;
2144
Walter Dörwald69652032004-09-07 20:24:22 +00002145 if (consumed)
2146 *consumed = (const char *)q-starts;
2147
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002149 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 goto onError;
2151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 Py_XDECREF(errorHandler);
2153 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 return (PyObject *)unicode;
2155
2156onError:
2157 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002158 Py_XDECREF(errorHandler);
2159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 return NULL;
2161}
2162
Tim Peters772747b2001-08-09 22:21:55 +00002163PyObject *
2164PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002165 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002166 const char *errors,
2167 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168{
2169 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002170 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002171#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002172 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002173#else
2174 const int pairs = 0;
2175#endif
Tim Peters772747b2001-08-09 22:21:55 +00002176 /* Offsets from p for storing byte pairs in the right order. */
2177#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2178 int ihi = 1, ilo = 0;
2179#else
2180 int ihi = 0, ilo = 1;
2181#endif
2182
2183#define STORECHAR(CH) \
2184 do { \
2185 p[ihi] = ((CH) >> 8) & 0xff; \
2186 p[ilo] = (CH) & 0xff; \
2187 p += 2; \
2188 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002190#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002191 for (i = pairs = 0; i < size; i++)
2192 if (s[i] >= 0x10000)
2193 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002194#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002195 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002196 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 if (v == NULL)
2198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199
Walter Dörwald3cc34522007-05-04 10:48:27 +00002200 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002202 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002203 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002204 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002205
2206 if (byteorder == -1) {
2207 /* force LE */
2208 ihi = 1;
2209 ilo = 0;
2210 }
2211 else if (byteorder == 1) {
2212 /* force BE */
2213 ihi = 0;
2214 ilo = 1;
2215 }
2216
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002217 while (size-- > 0) {
2218 Py_UNICODE ch = *s++;
2219 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002220#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002221 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002222 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2223 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002225#endif
Tim Peters772747b2001-08-09 22:21:55 +00002226 STORECHAR(ch);
2227 if (ch2)
2228 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002231#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232}
2233
2234PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2235{
2236 if (!PyUnicode_Check(unicode)) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
2240 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2241 PyUnicode_GET_SIZE(unicode),
2242 NULL,
2243 0);
2244}
2245
2246/* --- Unicode Escape Codec ----------------------------------------------- */
2247
Fredrik Lundh06d12682001-01-24 07:59:11 +00002248static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002249
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002251 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 const char *errors)
2253{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002254 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002255 Py_ssize_t startinpos;
2256 Py_ssize_t endinpos;
2257 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002258 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002260 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002262 char* message;
2263 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002264 PyObject *errorHandler = NULL;
2265 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002266
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 /* Escaped strings will always be longer than the resulting
2268 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002269 length after conversion to the true value.
2270 (but if the error callback returns a long replacement string
2271 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 v = _PyUnicode_New(size);
2273 if (v == NULL)
2274 goto onError;
2275 if (size == 0)
2276 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002280
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 while (s < end) {
2282 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002283 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002284 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285
2286 /* Non-escape characters are interpreted as Unicode ordinals */
2287 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002288 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 continue;
2290 }
2291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 /* \ - Escapes */
2294 s++;
2295 switch (*s++) {
2296
2297 /* \x escapes */
2298 case '\n': break;
2299 case '\\': *p++ = '\\'; break;
2300 case '\'': *p++ = '\''; break;
2301 case '\"': *p++ = '\"'; break;
2302 case 'b': *p++ = '\b'; break;
2303 case 'f': *p++ = '\014'; break; /* FF */
2304 case 't': *p++ = '\t'; break;
2305 case 'n': *p++ = '\n'; break;
2306 case 'r': *p++ = '\r'; break;
2307 case 'v': *p++ = '\013'; break; /* VT */
2308 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2309
2310 /* \OOO (octal) escapes */
2311 case '0': case '1': case '2': case '3':
2312 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002313 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002315 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002317 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002319 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320 break;
2321
Fredrik Lundhccc74732001-02-18 22:13:49 +00002322 /* hex escapes */
2323 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002325 digits = 2;
2326 message = "truncated \\xXX escape";
2327 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328
Fredrik Lundhccc74732001-02-18 22:13:49 +00002329 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002331 digits = 4;
2332 message = "truncated \\uXXXX escape";
2333 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334
Fredrik Lundhccc74732001-02-18 22:13:49 +00002335 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002336 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002337 digits = 8;
2338 message = "truncated \\UXXXXXXXX escape";
2339 hexescape:
2340 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 outpos = p-PyUnicode_AS_UNICODE(v);
2342 if (s+digits>end) {
2343 endinpos = size;
2344 if (unicode_decode_call_errorhandler(
2345 errors, &errorHandler,
2346 "unicodeescape", "end of string in escape sequence",
2347 starts, size, &startinpos, &endinpos, &exc, &s,
2348 (PyObject **)&v, &outpos, &p))
2349 goto onError;
2350 goto nextByte;
2351 }
2352 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002353 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002354 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002355 endinpos = (s+i+1)-starts;
2356 if (unicode_decode_call_errorhandler(
2357 errors, &errorHandler,
2358 "unicodeescape", message,
2359 starts, size, &startinpos, &endinpos, &exc, &s,
2360 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002361 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002362 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002363 }
2364 chr = (chr<<4) & ~0xF;
2365 if (c >= '0' && c <= '9')
2366 chr += c - '0';
2367 else if (c >= 'a' && c <= 'f')
2368 chr += 10 + c - 'a';
2369 else
2370 chr += 10 + c - 'A';
2371 }
2372 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002373 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002374 /* _decoding_error will have already written into the
2375 target buffer. */
2376 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002377 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002378 /* when we get here, chr is a 32-bit unicode character */
2379 if (chr <= 0xffff)
2380 /* UCS-2 character */
2381 *p++ = (Py_UNICODE) chr;
2382 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002383 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002384 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002385#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002386 *p++ = chr;
2387#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002388 chr -= 0x10000L;
2389 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002390 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002391#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002392 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393 endinpos = s-starts;
2394 outpos = p-PyUnicode_AS_UNICODE(v);
2395 if (unicode_decode_call_errorhandler(
2396 errors, &errorHandler,
2397 "unicodeescape", "illegal Unicode character",
2398 starts, size, &startinpos, &endinpos, &exc, &s,
2399 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002400 goto onError;
2401 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002402 break;
2403
2404 /* \N{name} */
2405 case 'N':
2406 message = "malformed \\N character escape";
2407 if (ucnhash_CAPI == NULL) {
2408 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002409 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002410 m = PyImport_ImportModule("unicodedata");
2411 if (m == NULL)
2412 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002413 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002414 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002415 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002416 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002417 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002418 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002419 if (ucnhash_CAPI == NULL)
2420 goto ucnhashError;
2421 }
2422 if (*s == '{') {
2423 const char *start = s+1;
2424 /* look for the closing brace */
2425 while (*s != '}' && s < end)
2426 s++;
2427 if (s > start && s < end && *s == '}') {
2428 /* found a name. look it up in the unicode database */
2429 message = "unknown Unicode character name";
2430 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002431 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002432 goto store;
2433 }
2434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002435 endinpos = s-starts;
2436 outpos = p-PyUnicode_AS_UNICODE(v);
2437 if (unicode_decode_call_errorhandler(
2438 errors, &errorHandler,
2439 "unicodeescape", message,
2440 starts, size, &startinpos, &endinpos, &exc, &s,
2441 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002442 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002443 break;
2444
2445 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002446 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002447 message = "\\ at end of string";
2448 s--;
2449 endinpos = s-starts;
2450 outpos = p-PyUnicode_AS_UNICODE(v);
2451 if (unicode_decode_call_errorhandler(
2452 errors, &errorHandler,
2453 "unicodeescape", message,
2454 starts, size, &startinpos, &endinpos, &exc, &s,
2455 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002456 goto onError;
2457 }
2458 else {
2459 *p++ = '\\';
2460 *p++ = (unsigned char)s[-1];
2461 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002462 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 nextByte:
2465 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002467 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002469 Py_XDECREF(errorHandler);
2470 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002472
Fredrik Lundhccc74732001-02-18 22:13:49 +00002473ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002474 PyErr_SetString(
2475 PyExc_UnicodeError,
2476 "\\N escapes not supported (can't load unicodedata module)"
2477 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002478 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 Py_XDECREF(errorHandler);
2480 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002481 return NULL;
2482
Fredrik Lundhccc74732001-02-18 22:13:49 +00002483onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 Py_XDECREF(errorHandler);
2486 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 return NULL;
2488}
2489
2490/* Return a Unicode-Escape string version of the Unicode object.
2491
2492 If quotes is true, the string is enclosed in u"" or u'' quotes as
2493 appropriate.
2494
2495*/
2496
Thomas Wouters477c8d52006-05-27 19:21:47 +00002497Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2498 Py_ssize_t size,
2499 Py_UNICODE ch)
2500{
2501 /* like wcschr, but doesn't stop at NULL characters */
2502
2503 while (size-- > 0) {
2504 if (*s == ch)
2505 return s;
2506 s++;
2507 }
2508
2509 return NULL;
2510}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002511
/* Lowercase hexadecimal digit characters, indexed by nibble value 0..15;
   used by the escape encoders in this file to emit \x, \u and \U escapes. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00002512static const char *hexdigits = "0123456789abcdef";
2513
2514PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2515 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516{
2517 PyObject *repr;
2518 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519
Thomas Wouters89f507f2006-12-13 04:49:30 +00002520 /* XXX(nnorwitz): rather than over-allocating, it would be
2521 better to choose a different scheme. Perhaps scan the
2522 first N-chars of the string and allocate based on that size.
2523 */
2524 /* Initial allocation is based on the longest-possible unichr
2525 escape.
2526
2527 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2528 unichr, so in this case it's the longest unichr escape. In
2529 narrow (UTF-16) builds this is five chars per source unichr
2530 since there are two unichrs in the surrogate pair, so in narrow
2531 (UTF-16) builds it's not the longest unichr escape.
2532
2533 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2534 so in the narrow (UTF-16) build case it's the longest unichr
2535 escape.
2536 */
2537
Walter Dörwald79e913e2007-05-12 11:08:06 +00002538 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002539#ifdef Py_UNICODE_WIDE
2540 + 10*size
2541#else
2542 + 6*size
2543#endif
2544 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 if (repr == NULL)
2546 return NULL;
2547
Walter Dörwald79e913e2007-05-12 11:08:06 +00002548 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 while (size-- > 0) {
2551 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002552
Walter Dörwald79e913e2007-05-12 11:08:06 +00002553 /* Escape backslashes */
2554 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 *p++ = '\\';
2556 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002557 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002558 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002559
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002560#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002561 /* Map 21-bit characters to '\U00xxxxxx' */
2562 else if (ch >= 0x10000) {
2563 *p++ = '\\';
2564 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002565 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2566 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2567 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2568 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2569 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2570 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2571 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2572 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002573 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002574 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002575#else
2576 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002577 else if (ch >= 0xD800 && ch < 0xDC00) {
2578 Py_UNICODE ch2;
2579 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002580
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002581 ch2 = *s++;
2582 size--;
2583 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2584 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2585 *p++ = '\\';
2586 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002587 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2588 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2589 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2590 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2591 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2592 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2593 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2594 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002595 continue;
2596 }
2597 /* Fall through: isolated surrogates are copied as-is */
2598 s--;
2599 size++;
2600 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002601#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002602
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002604 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 *p++ = '\\';
2606 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002607 *p++ = hexdigits[(ch >> 12) & 0x000F];
2608 *p++ = hexdigits[(ch >> 8) & 0x000F];
2609 *p++ = hexdigits[(ch >> 4) & 0x000F];
2610 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002612
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002613 /* Map special whitespace to '\t', \n', '\r' */
2614 else if (ch == '\t') {
2615 *p++ = '\\';
2616 *p++ = 't';
2617 }
2618 else if (ch == '\n') {
2619 *p++ = '\\';
2620 *p++ = 'n';
2621 }
2622 else if (ch == '\r') {
2623 *p++ = '\\';
2624 *p++ = 'r';
2625 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002626
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002627 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002628 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002630 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002631 *p++ = hexdigits[(ch >> 4) & 0x000F];
2632 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002633 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002634
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 /* Copy everything else as-is */
2636 else
2637 *p++ = (char) ch;
2638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639
2640 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002641 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2642 Py_DECREF(repr);
2643 return NULL;
2644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 return repr;
2646}
2647
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2649{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002650 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 if (!PyUnicode_Check(unicode)) {
2652 PyErr_BadArgument();
2653 return NULL;
2654 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002655 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2656 PyUnicode_GET_SIZE(unicode));
2657
2658 if (!s)
2659 return NULL;
2660 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2661 PyBytes_GET_SIZE(s));
2662 Py_DECREF(s);
2663 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664}
2665
2666/* --- Raw Unicode Escape Codec ------------------------------------------- */
2667
2668PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002669 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 const char *errors)
2671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002673 Py_ssize_t startinpos;
2674 Py_ssize_t endinpos;
2675 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002677 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 const char *end;
2679 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002680 PyObject *errorHandler = NULL;
2681 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002682
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 /* Escaped strings will always be longer than the resulting
2684 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002685 length after conversion to the true value. (But decoding error
2686 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 v = _PyUnicode_New(size);
2688 if (v == NULL)
2689 goto onError;
2690 if (size == 0)
2691 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002692 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 end = s + size;
2694 while (s < end) {
2695 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002696 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002698 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699
2700 /* Non-escape characters are interpreted as Unicode ordinals */
2701 if (*s != '\\') {
2702 *p++ = (unsigned char)*s++;
2703 continue;
2704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706
2707 /* \u-escapes are only interpreted iff the number of leading
2708 backslashes if odd */
2709 bs = s;
2710 for (;s < end;) {
2711 if (*s != '\\')
2712 break;
2713 *p++ = (unsigned char)*s++;
2714 }
2715 if (((s - bs) & 1) == 0 ||
2716 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002717 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 continue;
2719 }
2720 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002721 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 s++;
2723
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002724 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002726 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 endinpos = s-starts;
2730 if (unicode_decode_call_errorhandler(
2731 errors, &errorHandler,
2732 "rawunicodeescape", "truncated \\uXXXX",
2733 starts, size, &startinpos, &endinpos, &exc, &s,
2734 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 }
2738 x = (x<<4) & ~0xF;
2739 if (c >= '0' && c <= '9')
2740 x += c - '0';
2741 else if (c >= 'a' && c <= 'f')
2742 x += 10 + c - 'a';
2743 else
2744 x += 10 + c - 'A';
2745 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002746#ifndef Py_UNICODE_WIDE
2747 if (x > 0x10000) {
2748 if (unicode_decode_call_errorhandler(
2749 errors, &errorHandler,
2750 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2751 starts, size, &startinpos, &endinpos, &exc, &s,
2752 (PyObject **)&v, &outpos, &p))
2753 goto onError;
2754 }
2755#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 *p++ = x;
2757 nextByte:
2758 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002760 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002761 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 Py_XDECREF(errorHandler);
2763 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002765
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 onError:
2767 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 Py_XDECREF(errorHandler);
2769 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 return NULL;
2771}
2772
2773PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002774 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775{
2776 PyObject *repr;
2777 char *p;
2778 char *q;
2779
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002780#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002781 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002782#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002783 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002784#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 if (repr == NULL)
2786 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002787 if (size == 0)
2788 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789
Walter Dörwald711005d2007-05-12 12:03:26 +00002790 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 while (size-- > 0) {
2792 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002793#ifdef Py_UNICODE_WIDE
2794 /* Map 32-bit characters to '\Uxxxxxxxx' */
2795 if (ch >= 0x10000) {
2796 *p++ = '\\';
2797 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002798 *p++ = hexdigits[(ch >> 28) & 0xf];
2799 *p++ = hexdigits[(ch >> 24) & 0xf];
2800 *p++ = hexdigits[(ch >> 20) & 0xf];
2801 *p++ = hexdigits[(ch >> 16) & 0xf];
2802 *p++ = hexdigits[(ch >> 12) & 0xf];
2803 *p++ = hexdigits[(ch >> 8) & 0xf];
2804 *p++ = hexdigits[(ch >> 4) & 0xf];
2805 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002806 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002807 else
2808#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 /* Map 16-bit characters to '\uxxxx' */
2810 if (ch >= 256) {
2811 *p++ = '\\';
2812 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002813 *p++ = hexdigits[(ch >> 12) & 0xf];
2814 *p++ = hexdigits[(ch >> 8) & 0xf];
2815 *p++ = hexdigits[(ch >> 4) & 0xf];
2816 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 }
2818 /* Copy everything else as-is */
2819 else
2820 *p++ = (char) ch;
2821 }
2822 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002823 if (PyBytes_Resize(repr, p - q)) {
2824 Py_DECREF(repr);
2825 return NULL;
2826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 return repr;
2828}
2829
2830PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2831{
Walter Dörwald711005d2007-05-12 12:03:26 +00002832 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002834 PyErr_BadArgument();
2835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002837 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2838 PyUnicode_GET_SIZE(unicode));
2839
2840 if (!s)
2841 return NULL;
2842 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2843 PyBytes_GET_SIZE(s));
2844 Py_DECREF(s);
2845 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846}
2847
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002848/* --- Unicode Internal Codec ------------------------------------------- */
2849
2850PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002851 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002852 const char *errors)
2853{
2854 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002855 Py_ssize_t startinpos;
2856 Py_ssize_t endinpos;
2857 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002858 PyUnicodeObject *v;
2859 Py_UNICODE *p;
2860 const char *end;
2861 const char *reason;
2862 PyObject *errorHandler = NULL;
2863 PyObject *exc = NULL;
2864
Neal Norwitzd43069c2006-01-08 01:12:10 +00002865#ifdef Py_UNICODE_WIDE
2866 Py_UNICODE unimax = PyUnicode_GetMax();
2867#endif
2868
Thomas Wouters89f507f2006-12-13 04:49:30 +00002869 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002870 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2871 if (v == NULL)
2872 goto onError;
2873 if (PyUnicode_GetSize((PyObject *)v) == 0)
2874 return (PyObject *)v;
2875 p = PyUnicode_AS_UNICODE(v);
2876 end = s + size;
2877
2878 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002879 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002880 /* We have to sanity check the raw data, otherwise doom looms for
2881 some malformed UCS-4 data. */
2882 if (
2883 #ifdef Py_UNICODE_WIDE
2884 *p > unimax || *p < 0 ||
2885 #endif
2886 end-s < Py_UNICODE_SIZE
2887 )
2888 {
2889 startinpos = s - starts;
2890 if (end-s < Py_UNICODE_SIZE) {
2891 endinpos = end-starts;
2892 reason = "truncated input";
2893 }
2894 else {
2895 endinpos = s - starts + Py_UNICODE_SIZE;
2896 reason = "illegal code point (> 0x10FFFF)";
2897 }
2898 outpos = p - PyUnicode_AS_UNICODE(v);
2899 if (unicode_decode_call_errorhandler(
2900 errors, &errorHandler,
2901 "unicode_internal", reason,
2902 starts, size, &startinpos, &endinpos, &exc, &s,
2903 (PyObject **)&v, &outpos, &p)) {
2904 goto onError;
2905 }
2906 }
2907 else {
2908 p++;
2909 s += Py_UNICODE_SIZE;
2910 }
2911 }
2912
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002913 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002914 goto onError;
2915 Py_XDECREF(errorHandler);
2916 Py_XDECREF(exc);
2917 return (PyObject *)v;
2918
2919 onError:
2920 Py_XDECREF(v);
2921 Py_XDECREF(errorHandler);
2922 Py_XDECREF(exc);
2923 return NULL;
2924}
2925
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926/* --- Latin-1 Codec ------------------------------------------------------ */
2927
2928PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002929 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 const char *errors)
2931{
2932 PyUnicodeObject *v;
2933 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002934
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002936 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002937 Py_UNICODE r = *(unsigned char*)s;
2938 return PyUnicode_FromUnicode(&r, 1);
2939 }
2940
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 v = _PyUnicode_New(size);
2942 if (v == NULL)
2943 goto onError;
2944 if (size == 0)
2945 return (PyObject *)v;
2946 p = PyUnicode_AS_UNICODE(v);
2947 while (size-- > 0)
2948 *p++ = (unsigned char)*s++;
2949 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002950
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 onError:
2952 Py_XDECREF(v);
2953 return NULL;
2954}
2955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956/* create or adjust a UnicodeEncodeError */
2957static void make_encode_exception(PyObject **exceptionObject,
2958 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002959 const Py_UNICODE *unicode, Py_ssize_t size,
2960 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 if (*exceptionObject == NULL) {
2964 *exceptionObject = PyUnicodeEncodeError_Create(
2965 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 }
2967 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002968 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2969 goto onError;
2970 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2971 goto onError;
2972 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2973 goto onError;
2974 return;
2975 onError:
2976 Py_DECREF(*exceptionObject);
2977 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 }
2979}
2980
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002981/* raises a UnicodeEncodeError */
2982static void raise_encode_exception(PyObject **exceptionObject,
2983 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002984 const Py_UNICODE *unicode, Py_ssize_t size,
2985 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 const char *reason)
2987{
2988 make_encode_exception(exceptionObject,
2989 encoding, unicode, size, startpos, endpos, reason);
2990 if (*exceptionObject != NULL)
2991 PyCodec_StrictErrors(*exceptionObject);
2992}
2993
2994/* error handling callback helper:
2995 build arguments, call the callback and check the arguments,
2996 put the result into newpos and return the replacement string, which
2997 has to be freed by the caller */
2998static PyObject *unicode_encode_call_errorhandler(const char *errors,
2999 PyObject **errorHandler,
3000 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003001 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3002 Py_ssize_t startpos, Py_ssize_t endpos,
3003 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003005 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006
3007 PyObject *restuple;
3008 PyObject *resunicode;
3009
3010 if (*errorHandler == NULL) {
3011 *errorHandler = PyCodec_LookupError(errors);
3012 if (*errorHandler == NULL)
3013 return NULL;
3014 }
3015
3016 make_encode_exception(exceptionObject,
3017 encoding, unicode, size, startpos, endpos, reason);
3018 if (*exceptionObject == NULL)
3019 return NULL;
3020
3021 restuple = PyObject_CallFunctionObjArgs(
3022 *errorHandler, *exceptionObject, NULL);
3023 if (restuple == NULL)
3024 return NULL;
3025 if (!PyTuple_Check(restuple)) {
3026 PyErr_Format(PyExc_TypeError, &argparse[4]);
3027 Py_DECREF(restuple);
3028 return NULL;
3029 }
3030 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3031 &resunicode, newpos)) {
3032 Py_DECREF(restuple);
3033 return NULL;
3034 }
3035 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003036 *newpos = size+*newpos;
3037 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003038 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003039 Py_DECREF(restuple);
3040 return NULL;
3041 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 Py_INCREF(resunicode);
3043 Py_DECREF(restuple);
3044 return resunicode;
3045}
3046
3047static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003048 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 const char *errors,
3050 int limit)
3051{
3052 /* output object */
3053 PyObject *res;
3054 /* pointers to the beginning and end+1 of input */
3055 const Py_UNICODE *startp = p;
3056 const Py_UNICODE *endp = p + size;
3057 /* pointer to the beginning of the unencodable characters */
3058 /* const Py_UNICODE *badp = NULL; */
3059 /* pointer into the output */
3060 char *str;
3061 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t respos = 0;
3063 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003064 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3065 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 PyObject *errorHandler = NULL;
3067 PyObject *exc = NULL;
3068 /* the following variable is used for caching string comparisons
3069 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3070 int known_errorHandler = -1;
3071
3072 /* allocate enough for a simple encoding without
3073 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003074 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 if (res == NULL)
3076 goto onError;
3077 if (size == 0)
3078 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003079 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 ressize = size;
3081
3082 while (p<endp) {
3083 Py_UNICODE c = *p;
3084
3085 /* can we encode this? */
3086 if (c<limit) {
3087 /* no overflow check, because we know that the space is enough */
3088 *str++ = (char)c;
3089 ++p;
3090 }
3091 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003092 Py_ssize_t unicodepos = p-startp;
3093 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003095 Py_ssize_t repsize;
3096 Py_ssize_t newpos;
3097 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 Py_UNICODE *uni2;
3099 /* startpos for collecting unencodable chars */
3100 const Py_UNICODE *collstart = p;
3101 const Py_UNICODE *collend = p;
3102 /* find all unecodable characters */
3103 while ((collend < endp) && ((*collend)>=limit))
3104 ++collend;
3105 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3106 if (known_errorHandler==-1) {
3107 if ((errors==NULL) || (!strcmp(errors, "strict")))
3108 known_errorHandler = 1;
3109 else if (!strcmp(errors, "replace"))
3110 known_errorHandler = 2;
3111 else if (!strcmp(errors, "ignore"))
3112 known_errorHandler = 3;
3113 else if (!strcmp(errors, "xmlcharrefreplace"))
3114 known_errorHandler = 4;
3115 else
3116 known_errorHandler = 0;
3117 }
3118 switch (known_errorHandler) {
3119 case 1: /* strict */
3120 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3121 goto onError;
3122 case 2: /* replace */
3123 while (collstart++<collend)
3124 *str++ = '?'; /* fall through */
3125 case 3: /* ignore */
3126 p = collend;
3127 break;
3128 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003129 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 /* determine replacement size (temporarily (mis)uses p) */
3131 for (p = collstart, repsize = 0; p < collend; ++p) {
3132 if (*p<10)
3133 repsize += 2+1+1;
3134 else if (*p<100)
3135 repsize += 2+2+1;
3136 else if (*p<1000)
3137 repsize += 2+3+1;
3138 else if (*p<10000)
3139 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003140#ifndef Py_UNICODE_WIDE
3141 else
3142 repsize += 2+5+1;
3143#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003144 else if (*p<100000)
3145 repsize += 2+5+1;
3146 else if (*p<1000000)
3147 repsize += 2+6+1;
3148 else
3149 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003150#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 }
3152 requiredsize = respos+repsize+(endp-collend);
3153 if (requiredsize > ressize) {
3154 if (requiredsize<2*ressize)
3155 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003156 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003158 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 ressize = requiredsize;
3160 }
3161 /* generate replacement (temporarily (mis)uses p) */
3162 for (p = collstart; p < collend; ++p) {
3163 str += sprintf(str, "&#%d;", (int)*p);
3164 }
3165 p = collend;
3166 break;
3167 default:
3168 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3169 encoding, reason, startp, size, &exc,
3170 collstart-startp, collend-startp, &newpos);
3171 if (repunicode == NULL)
3172 goto onError;
3173 /* need more space? (at least enough for what we
3174 have+the replacement+the rest of the string, so
3175 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003176 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 repsize = PyUnicode_GET_SIZE(repunicode);
3178 requiredsize = respos+repsize+(endp-collend);
3179 if (requiredsize > ressize) {
3180 if (requiredsize<2*ressize)
3181 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003182 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003183 Py_DECREF(repunicode);
3184 goto onError;
3185 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003186 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003187 ressize = requiredsize;
3188 }
3189 /* check if there is anything unencodable in the replacement
3190 and copy it to the output */
3191 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3192 c = *uni2;
3193 if (c >= limit) {
3194 raise_encode_exception(&exc, encoding, startp, size,
3195 unicodepos, unicodepos+1, reason);
3196 Py_DECREF(repunicode);
3197 goto onError;
3198 }
3199 *str = (char)c;
3200 }
3201 p = startp + newpos;
3202 Py_DECREF(repunicode);
3203 }
3204 }
3205 }
3206 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003207 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 if (respos<ressize)
3209 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003210 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 Py_XDECREF(errorHandler);
3212 Py_XDECREF(exc);
3213 return res;
3214
3215 onError:
3216 Py_XDECREF(res);
3217 Py_XDECREF(errorHandler);
3218 Py_XDECREF(exc);
3219 return NULL;
3220}
3221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003223 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 const char *errors)
3225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227}
3228
3229PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3230{
3231 if (!PyUnicode_Check(unicode)) {
3232 PyErr_BadArgument();
3233 return NULL;
3234 }
3235 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3236 PyUnicode_GET_SIZE(unicode),
3237 NULL);
3238}
3239
3240/* --- 7-bit ASCII Codec -------------------------------------------------- */
3241
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003243 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 const char *errors)
3245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 PyUnicodeObject *v;
3248 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003249 Py_ssize_t startinpos;
3250 Py_ssize_t endinpos;
3251 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252 const char *e;
3253 PyObject *errorHandler = NULL;
3254 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003257 if (size == 1 && *(unsigned char*)s < 128) {
3258 Py_UNICODE r = *(unsigned char*)s;
3259 return PyUnicode_FromUnicode(&r, 1);
3260 }
Tim Petersced69f82003-09-16 20:30:58 +00003261
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 v = _PyUnicode_New(size);
3263 if (v == NULL)
3264 goto onError;
3265 if (size == 0)
3266 return (PyObject *)v;
3267 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 e = s + size;
3269 while (s < e) {
3270 register unsigned char c = (unsigned char)*s;
3271 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003273 ++s;
3274 }
3275 else {
3276 startinpos = s-starts;
3277 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003278 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003279 if (unicode_decode_call_errorhandler(
3280 errors, &errorHandler,
3281 "ascii", "ordinal not in range(128)",
3282 starts, size, &startinpos, &endinpos, &exc, &s,
3283 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003287 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003288 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 Py_XDECREF(errorHandler);
3291 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003293
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 onError:
3295 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 Py_XDECREF(errorHandler);
3297 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 return NULL;
3299}
3300
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003302 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 const char *errors)
3304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306}
3307
3308PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3309{
3310 if (!PyUnicode_Check(unicode)) {
3311 PyErr_BadArgument();
3312 return NULL;
3313 }
3314 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3315 PyUnicode_GET_SIZE(unicode),
3316 NULL);
3317}
3318
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003319#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003320
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003321/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003322
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003323#if SIZEOF_INT < SIZEOF_SSIZE_T
3324#define NEED_RETRY
3325#endif
3326
3327/* XXX This code is limited to "true" double-byte encodings, as
3328 a) it assumes an incomplete character consists of a single byte, and
3329 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3330 encodings, see IsDBCSLeadByteEx documentation. */
3331
/* Return 1 if the byte at s[offset] is reported as a lead byte by
   IsDBCSLeadByte() and, looking at the character CharPrev() finds before
   it, one of the following holds: there is no previous character, the
   previous character does not start with a lead byte, or the previous
   character is exactly two bytes long.  Return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3342
3343/*
3344 * Decode MBCS string into unicode object. If 'final' is set, converts
3345 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3346 */
3347static int decode_mbcs(PyUnicodeObject **v,
3348 const char *s, /* MBCS string */
3349 int size, /* sizeof MBCS string */
3350 int final)
3351{
3352 Py_UNICODE *p;
3353 Py_ssize_t n = 0;
3354 int usize = 0;
3355
3356 assert(size >= 0);
3357
3358 /* Skip trailing lead-byte unless 'final' is set */
3359 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3360 --size;
3361
3362 /* First get the size of the result */
3363 if (size > 0) {
3364 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3365 if (usize == 0) {
3366 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3367 return -1;
3368 }
3369 }
3370
3371 if (*v == NULL) {
3372 /* Create unicode object */
3373 *v = _PyUnicode_New(usize);
3374 if (*v == NULL)
3375 return -1;
3376 }
3377 else {
3378 /* Extend unicode object */
3379 n = PyUnicode_GET_SIZE(*v);
3380 if (_PyUnicode_Resize(v, n + usize) < 0)
3381 return -1;
3382 }
3383
3384 /* Do the conversion */
3385 if (size > 0) {
3386 p = PyUnicode_AS_UNICODE(*v) + n;
3387 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3388 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3389 return -1;
3390 }
3391 }
3392
3393 return size;
3394}
3395
3396PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3397 Py_ssize_t size,
3398 const char *errors,
3399 Py_ssize_t *consumed)
3400{
3401 PyUnicodeObject *v = NULL;
3402 int done;
3403
3404 if (consumed)
3405 *consumed = 0;
3406
3407#ifdef NEED_RETRY
3408 retry:
3409 if (size > INT_MAX)
3410 done = decode_mbcs(&v, s, INT_MAX, 0);
3411 else
3412#endif
3413 done = decode_mbcs(&v, s, (int)size, !consumed);
3414
3415 if (done < 0) {
3416 Py_XDECREF(v);
3417 return NULL;
3418 }
3419
3420 if (consumed)
3421 *consumed += done;
3422
3423#ifdef NEED_RETRY
3424 if (size > INT_MAX) {
3425 s += done;
3426 size -= done;
3427 goto retry;
3428 }
3429#endif
3430
3431 return (PyObject *)v;
3432}
3433
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003434PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003435 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003436 const char *errors)
3437{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003438 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3439}
3440
3441/*
3442 * Convert unicode into string object (MBCS).
3443 * Returns 0 if succeed, -1 otherwise.
3444 */
3445static int encode_mbcs(PyObject **repr,
3446 const Py_UNICODE *p, /* unicode */
3447 int size) /* size of unicode */
3448{
3449 int mbcssize = 0;
3450 Py_ssize_t n = 0;
3451
3452 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003453
3454 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003455 if (size > 0) {
3456 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3457 if (mbcssize == 0) {
3458 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3459 return -1;
3460 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003461 }
3462
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003463 if (*repr == NULL) {
3464 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003465 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003466 if (*repr == NULL)
3467 return -1;
3468 }
3469 else {
3470 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003471 n = PyBytes_Size(*repr);
3472 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003473 return -1;
3474 }
3475
3476 /* Do the conversion */
3477 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003478 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003479 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3480 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3481 return -1;
3482 }
3483 }
3484
3485 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003486}
3487
3488PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003489 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003490 const char *errors)
3491{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003492 PyObject *repr = NULL;
3493 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003494
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003495#ifdef NEED_RETRY
3496 retry:
3497 if (size > INT_MAX)
3498 ret = encode_mbcs(&repr, p, INT_MAX);
3499 else
3500#endif
3501 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003502
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003503 if (ret < 0) {
3504 Py_XDECREF(repr);
3505 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003506 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003507
3508#ifdef NEED_RETRY
3509 if (size > INT_MAX) {
3510 p += INT_MAX;
3511 size -= INT_MAX;
3512 goto retry;
3513 }
3514#endif
3515
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003516 return repr;
3517}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003518
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003519PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3520{
3521 if (!PyUnicode_Check(unicode)) {
3522 PyErr_BadArgument();
3523 return NULL;
3524 }
3525 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3526 PyUnicode_GET_SIZE(unicode),
3527 NULL);
3528}
3529
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003530#undef NEED_RETRY
3531
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003532#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003533
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534/* --- Character Mapping Codec -------------------------------------------- */
3535
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003537 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 PyObject *mapping,
3539 const char *errors)
3540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003542 Py_ssize_t startinpos;
3543 Py_ssize_t endinpos;
3544 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 PyUnicodeObject *v;
3547 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003548 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 PyObject *errorHandler = NULL;
3550 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003551 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003552 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003553
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 /* Default to Latin-1 */
3555 if (mapping == NULL)
3556 return PyUnicode_DecodeLatin1(s, size, errors);
3557
3558 v = _PyUnicode_New(size);
3559 if (v == NULL)
3560 goto onError;
3561 if (size == 0)
3562 return (PyObject *)v;
3563 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003565 if (PyUnicode_CheckExact(mapping)) {
3566 mapstring = PyUnicode_AS_UNICODE(mapping);
3567 maplen = PyUnicode_GET_SIZE(mapping);
3568 while (s < e) {
3569 unsigned char ch = *s;
3570 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003572 if (ch < maplen)
3573 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003575 if (x == 0xfffe) {
3576 /* undefined mapping */
3577 outpos = p-PyUnicode_AS_UNICODE(v);
3578 startinpos = s-starts;
3579 endinpos = startinpos+1;
3580 if (unicode_decode_call_errorhandler(
3581 errors, &errorHandler,
3582 "charmap", "character maps to <undefined>",
3583 starts, size, &startinpos, &endinpos, &exc, &s,
3584 (PyObject **)&v, &outpos, &p)) {
3585 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003586 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003587 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003588 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003589 *p++ = x;
3590 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003592 }
3593 else {
3594 while (s < e) {
3595 unsigned char ch = *s;
3596 PyObject *w, *x;
3597
3598 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3599 w = PyInt_FromLong((long)ch);
3600 if (w == NULL)
3601 goto onError;
3602 x = PyObject_GetItem(mapping, w);
3603 Py_DECREF(w);
3604 if (x == NULL) {
3605 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3606 /* No mapping found means: mapping is undefined. */
3607 PyErr_Clear();
3608 x = Py_None;
3609 Py_INCREF(x);
3610 } else
3611 goto onError;
3612 }
3613
3614 /* Apply mapping */
3615 if (PyInt_Check(x)) {
3616 long value = PyInt_AS_LONG(x);
3617 if (value < 0 || value > 65535) {
3618 PyErr_SetString(PyExc_TypeError,
3619 "character mapping must be in range(65536)");
3620 Py_DECREF(x);
3621 goto onError;
3622 }
3623 *p++ = (Py_UNICODE)value;
3624 }
3625 else if (x == Py_None) {
3626 /* undefined mapping */
3627 outpos = p-PyUnicode_AS_UNICODE(v);
3628 startinpos = s-starts;
3629 endinpos = startinpos+1;
3630 if (unicode_decode_call_errorhandler(
3631 errors, &errorHandler,
3632 "charmap", "character maps to <undefined>",
3633 starts, size, &startinpos, &endinpos, &exc, &s,
3634 (PyObject **)&v, &outpos, &p)) {
3635 Py_DECREF(x);
3636 goto onError;
3637 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003638 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003639 continue;
3640 }
3641 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003642 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003643
3644 if (targetsize == 1)
3645 /* 1-1 mapping */
3646 *p++ = *PyUnicode_AS_UNICODE(x);
3647
3648 else if (targetsize > 1) {
3649 /* 1-n mapping */
3650 if (targetsize > extrachars) {
3651 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003652 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3653 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003654 (targetsize << 2);
3655 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003656 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003657 if (_PyUnicode_Resize(&v,
3658 PyUnicode_GET_SIZE(v) + needed) < 0) {
3659 Py_DECREF(x);
3660 goto onError;
3661 }
3662 p = PyUnicode_AS_UNICODE(v) + oldpos;
3663 }
3664 Py_UNICODE_COPY(p,
3665 PyUnicode_AS_UNICODE(x),
3666 targetsize);
3667 p += targetsize;
3668 extrachars -= targetsize;
3669 }
3670 /* 1-0 mapping: skip the character */
3671 }
3672 else {
3673 /* wrong return value */
3674 PyErr_SetString(PyExc_TypeError,
3675 "character mapping must return integer, None or unicode");
3676 Py_DECREF(x);
3677 goto onError;
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003680 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 }
3683 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003684 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 Py_XDECREF(errorHandler);
3687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003689
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 Py_XDECREF(errorHandler);
3692 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 Py_XDECREF(v);
3694 return NULL;
3695}
3696
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003697/* Charmap encoding: the lookup table */
3698
3699struct encoding_map{
3700 PyObject_HEAD
3701 unsigned char level1[32];
3702 int count2, count3;
3703 unsigned char level23[1];
3704};
3705
3706static PyObject*
3707encoding_map_size(PyObject *obj, PyObject* args)
3708{
3709 struct encoding_map *map = (struct encoding_map*)obj;
3710 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3711 128*map->count3);
3712}
3713
3714static PyMethodDef encoding_map_methods[] = {
3715 {"size", encoding_map_size, METH_NOARGS,
3716 PyDoc_STR("Return the size (in bytes) of this object") },
3717 { 0 }
3718};
3719
3720static void
3721encoding_map_dealloc(PyObject* o)
3722{
3723 PyObject_FREE(o);
3724}
3725
3726static PyTypeObject EncodingMapType = {
3727 PyObject_HEAD_INIT(NULL)
3728 0, /*ob_size*/
3729 "EncodingMap", /*tp_name*/
3730 sizeof(struct encoding_map), /*tp_basicsize*/
3731 0, /*tp_itemsize*/
3732 /* methods */
3733 encoding_map_dealloc, /*tp_dealloc*/
3734 0, /*tp_print*/
3735 0, /*tp_getattr*/
3736 0, /*tp_setattr*/
3737 0, /*tp_compare*/
3738 0, /*tp_repr*/
3739 0, /*tp_as_number*/
3740 0, /*tp_as_sequence*/
3741 0, /*tp_as_mapping*/
3742 0, /*tp_hash*/
3743 0, /*tp_call*/
3744 0, /*tp_str*/
3745 0, /*tp_getattro*/
3746 0, /*tp_setattro*/
3747 0, /*tp_as_buffer*/
3748 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3749 0, /*tp_doc*/
3750 0, /*tp_traverse*/
3751 0, /*tp_clear*/
3752 0, /*tp_richcompare*/
3753 0, /*tp_weaklistoffset*/
3754 0, /*tp_iter*/
3755 0, /*tp_iternext*/
3756 encoding_map_methods, /*tp_methods*/
3757 0, /*tp_members*/
3758 0, /*tp_getset*/
3759 0, /*tp_base*/
3760 0, /*tp_dict*/
3761 0, /*tp_descr_get*/
3762 0, /*tp_descr_set*/
3763 0, /*tp_dictoffset*/
3764 0, /*tp_init*/
3765 0, /*tp_alloc*/
3766 0, /*tp_new*/
3767 0, /*tp_free*/
3768 0, /*tp_is_gc*/
3769};
3770
3771PyObject*
3772PyUnicode_BuildEncodingMap(PyObject* string)
3773{
3774 Py_UNICODE *decode;
3775 PyObject *result;
3776 struct encoding_map *mresult;
3777 int i;
3778 int need_dict = 0;
3779 unsigned char level1[32];
3780 unsigned char level2[512];
3781 unsigned char *mlevel1, *mlevel2, *mlevel3;
3782 int count2 = 0, count3 = 0;
3783
3784 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3785 PyErr_BadArgument();
3786 return NULL;
3787 }
3788 decode = PyUnicode_AS_UNICODE(string);
3789 memset(level1, 0xFF, sizeof level1);
3790 memset(level2, 0xFF, sizeof level2);
3791
3792 /* If there isn't a one-to-one mapping of NULL to \0,
3793 or if there are non-BMP characters, we need to use
3794 a mapping dictionary. */
3795 if (decode[0] != 0)
3796 need_dict = 1;
3797 for (i = 1; i < 256; i++) {
3798 int l1, l2;
3799 if (decode[i] == 0
3800 #ifdef Py_UNICODE_WIDE
3801 || decode[i] > 0xFFFF
3802 #endif
3803 ) {
3804 need_dict = 1;
3805 break;
3806 }
3807 if (decode[i] == 0xFFFE)
3808 /* unmapped character */
3809 continue;
3810 l1 = decode[i] >> 11;
3811 l2 = decode[i] >> 7;
3812 if (level1[l1] == 0xFF)
3813 level1[l1] = count2++;
3814 if (level2[l2] == 0xFF)
3815 level2[l2] = count3++;
3816 }
3817
3818 if (count2 >= 0xFF || count3 >= 0xFF)
3819 need_dict = 1;
3820
3821 if (need_dict) {
3822 PyObject *result = PyDict_New();
3823 PyObject *key, *value;
3824 if (!result)
3825 return NULL;
3826 for (i = 0; i < 256; i++) {
3827 key = value = NULL;
3828 key = PyInt_FromLong(decode[i]);
3829 value = PyInt_FromLong(i);
3830 if (!key || !value)
3831 goto failed1;
3832 if (PyDict_SetItem(result, key, value) == -1)
3833 goto failed1;
3834 Py_DECREF(key);
3835 Py_DECREF(value);
3836 }
3837 return result;
3838 failed1:
3839 Py_XDECREF(key);
3840 Py_XDECREF(value);
3841 Py_DECREF(result);
3842 return NULL;
3843 }
3844
3845 /* Create a three-level trie */
3846 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3847 16*count2 + 128*count3 - 1);
3848 if (!result)
3849 return PyErr_NoMemory();
3850 PyObject_Init(result, &EncodingMapType);
3851 mresult = (struct encoding_map*)result;
3852 mresult->count2 = count2;
3853 mresult->count3 = count3;
3854 mlevel1 = mresult->level1;
3855 mlevel2 = mresult->level23;
3856 mlevel3 = mresult->level23 + 16*count2;
3857 memcpy(mlevel1, level1, 32);
3858 memset(mlevel2, 0xFF, 16*count2);
3859 memset(mlevel3, 0, 128*count3);
3860 count3 = 0;
3861 for (i = 1; i < 256; i++) {
3862 int o1, o2, o3, i2, i3;
3863 if (decode[i] == 0xFFFE)
3864 /* unmapped character */
3865 continue;
3866 o1 = decode[i]>>11;
3867 o2 = (decode[i]>>7) & 0xF;
3868 i2 = 16*mlevel1[o1] + o2;
3869 if (mlevel2[i2] == 0xFF)
3870 mlevel2[i2] = count3++;
3871 o3 = decode[i] & 0x7F;
3872 i3 = 128*mlevel2[i2] + o3;
3873 mlevel3[i3] = i;
3874 }
3875 return result;
3876}
3877
3878static int
3879encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3880{
3881 struct encoding_map *map = (struct encoding_map*)mapping;
3882 int l1 = c>>11;
3883 int l2 = (c>>7) & 0xF;
3884 int l3 = c & 0x7F;
3885 int i;
3886
3887#ifdef Py_UNICODE_WIDE
3888 if (c > 0xFFFF) {
3889 return -1;
3890 }
3891#endif
3892 if (c == 0)
3893 return 0;
3894 /* level 1*/
3895 i = map->level1[l1];
3896 if (i == 0xFF) {
3897 return -1;
3898 }
3899 /* level 2*/
3900 i = map->level23[16*i+l2];
3901 if (i == 0xFF) {
3902 return -1;
3903 }
3904 /* level 3 */
3905 i = map->level23[16*map->count2 + 128*i + l3];
3906 if (i == 0) {
3907 return -1;
3908 }
3909 return i;
3910}
3911
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912/* Lookup the character ch in the mapping. If the character
3913 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003914 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 PyObject *w = PyInt_FromLong((long)c);
3918 PyObject *x;
3919
3920 if (w == NULL)
3921 return NULL;
3922 x = PyObject_GetItem(mapping, w);
3923 Py_DECREF(w);
3924 if (x == NULL) {
3925 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3926 /* No mapping found means: mapping is undefined. */
3927 PyErr_Clear();
3928 x = Py_None;
3929 Py_INCREF(x);
3930 return x;
3931 } else
3932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003934 else if (x == Py_None)
3935 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003936 else if (PyInt_Check(x)) {
3937 long value = PyInt_AS_LONG(x);
3938 if (value < 0 || value > 255) {
3939 PyErr_SetString(PyExc_TypeError,
3940 "character mapping must be in range(256)");
3941 Py_DECREF(x);
3942 return NULL;
3943 }
3944 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 else if (PyString_Check(x))
3947 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003950 PyErr_Format(PyExc_TypeError,
3951 "character mapping must return integer, None or str8, not %.400s",
3952 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 Py_DECREF(x);
3954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 }
3956}
3957
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003958static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003959charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003960{
Walter Dörwald827b0552007-05-12 13:23:53 +00003961 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003962 /* exponentially overallocate to minimize reallocations */
3963 if (requiredsize < 2*outsize)
3964 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003965 if (PyBytes_Resize(outobj, requiredsize)) {
3966 Py_DECREF(outobj);
3967 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003968 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003969 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003970}
3971
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003976 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 space is available. Return a new reference to the object that
3978 was put in the output buffer, or Py_None, if the mapping was undefined
3979 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003980 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003982charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003983 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003985 PyObject *rep;
3986 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003987 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003989 if (mapping->ob_type == &EncodingMapType) {
3990 int res = encoding_map_lookup(c, mapping);
3991 Py_ssize_t requiredsize = *outpos+1;
3992 if (res == -1)
3993 return enc_FAILED;
3994 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003995 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003996 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003997 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003998 outstart[(*outpos)++] = (char)res;
3999 return enc_SUCCESS;
4000 }
4001
4002 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004004 return enc_EXCEPTION;
4005 else if (rep==Py_None) {
4006 Py_DECREF(rep);
4007 return enc_FAILED;
4008 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004010 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004011 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004012 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004014 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004016 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4018 }
4019 else {
4020 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004021 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4022 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004023 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004024 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004026 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004028 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 memcpy(outstart + *outpos, repchars, repsize);
4030 *outpos += repsize;
4031 }
4032 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004033 Py_DECREF(rep);
4034 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004035}
4036
4037/* handle an error in PyUnicode_EncodeCharmap
4038 Return 0 on success, -1 on error */
4039static
4040int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004041 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004043 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004044 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045{
4046 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004047 Py_ssize_t repsize;
4048 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 Py_UNICODE *uni2;
4050 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004051 Py_ssize_t collstartpos = *inpos;
4052 Py_ssize_t collendpos = *inpos+1;
4053 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 char *encoding = "charmap";
4055 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004056 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 /* find all unencodable characters */
4059 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004060 PyObject *rep;
4061 if (mapping->ob_type == &EncodingMapType) {
4062 int res = encoding_map_lookup(p[collendpos], mapping);
4063 if (res != -1)
4064 break;
4065 ++collendpos;
4066 continue;
4067 }
4068
4069 rep = charmapencode_lookup(p[collendpos], mapping);
4070 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004072 else if (rep!=Py_None) {
4073 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 break;
4075 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004076 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 ++collendpos;
4078 }
4079 /* cache callback name lookup
4080 * (if not done yet, i.e. it's the first error) */
4081 if (*known_errorHandler==-1) {
4082 if ((errors==NULL) || (!strcmp(errors, "strict")))
4083 *known_errorHandler = 1;
4084 else if (!strcmp(errors, "replace"))
4085 *known_errorHandler = 2;
4086 else if (!strcmp(errors, "ignore"))
4087 *known_errorHandler = 3;
4088 else if (!strcmp(errors, "xmlcharrefreplace"))
4089 *known_errorHandler = 4;
4090 else
4091 *known_errorHandler = 0;
4092 }
4093 switch (*known_errorHandler) {
4094 case 1: /* strict */
4095 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4096 return -1;
4097 case 2: /* replace */
4098 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4099 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004100 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 return -1;
4102 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004103 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4105 return -1;
4106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 }
4108 /* fall through */
4109 case 3: /* ignore */
4110 *inpos = collendpos;
4111 break;
4112 case 4: /* xmlcharrefreplace */
4113 /* generate replacement (temporarily (mis)uses p) */
4114 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4115 char buffer[2+29+1+1];
4116 char *cp;
4117 sprintf(buffer, "&#%d;", (int)p[collpos]);
4118 for (cp = buffer; *cp; ++cp) {
4119 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004120 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004122 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4124 return -1;
4125 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 }
4127 }
4128 *inpos = collendpos;
4129 break;
4130 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004131 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 encoding, reason, p, size, exceptionObject,
4133 collstartpos, collendpos, &newpos);
4134 if (repunicode == NULL)
4135 return -1;
4136 /* generate replacement */
4137 repsize = PyUnicode_GET_SIZE(repunicode);
4138 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4139 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004140 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 return -1;
4142 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004143 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4146 return -1;
4147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 }
4149 *inpos = newpos;
4150 Py_DECREF(repunicode);
4151 }
4152 return 0;
4153}
4154
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004156 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 PyObject *mapping,
4158 const char *errors)
4159{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 /* output object */
4161 PyObject *res = NULL;
4162 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004163 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004165 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 PyObject *errorHandler = NULL;
4167 PyObject *exc = NULL;
4168 /* the following variable is used for caching string comparisons
4169 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4170 * 3=ignore, 4=xmlcharrefreplace */
4171 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172
4173 /* Default to Latin-1 */
4174 if (mapping == NULL)
4175 return PyUnicode_EncodeLatin1(p, size, errors);
4176
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 /* allocate enough for a simple encoding without
4178 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004179 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 if (res == NULL)
4181 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004182 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 while (inpos<size) {
4186 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004187 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004188 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004190 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 if (charmap_encoding_error(p, size, &inpos, mapping,
4192 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004193 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004194 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004195 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 else
4199 /* done with this character => adjust input position */
4200 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004204 if (respos<PyBytes_GET_SIZE(res)) {
4205 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 goto onError;
4207 }
4208 Py_XDECREF(exc);
4209 Py_XDECREF(errorHandler);
4210 return res;
4211
4212 onError:
4213 Py_XDECREF(res);
4214 Py_XDECREF(exc);
4215 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 return NULL;
4217}
4218
4219PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4220 PyObject *mapping)
4221{
4222 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4223 PyErr_BadArgument();
4224 return NULL;
4225 }
4226 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4227 PyUnicode_GET_SIZE(unicode),
4228 mapping,
4229 NULL);
4230}
4231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232/* create or adjust a UnicodeTranslateError */
4233static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004234 const Py_UNICODE *unicode, Py_ssize_t size,
4235 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 if (*exceptionObject == NULL) {
4239 *exceptionObject = PyUnicodeTranslateError_Create(
4240 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241 }
4242 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4244 goto onError;
4245 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4246 goto onError;
4247 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4248 goto onError;
4249 return;
4250 onError:
4251 Py_DECREF(*exceptionObject);
4252 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 }
4254}
4255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256/* raises a UnicodeTranslateError */
4257static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004258 const Py_UNICODE *unicode, Py_ssize_t size,
4259 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 const char *reason)
4261{
4262 make_translate_exception(exceptionObject,
4263 unicode, size, startpos, endpos, reason);
4264 if (*exceptionObject != NULL)
4265 PyCodec_StrictErrors(*exceptionObject);
4266}
4267
4268/* error handling callback helper:
4269 build arguments, call the callback and check the arguments,
4270 put the result into newpos and return the replacement string, which
4271 has to be freed by the caller */
4272static PyObject *unicode_translate_call_errorhandler(const char *errors,
4273 PyObject **errorHandler,
4274 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004275 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4276 Py_ssize_t startpos, Py_ssize_t endpos,
4277 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004279 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004281 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 PyObject *restuple;
4283 PyObject *resunicode;
4284
4285 if (*errorHandler == NULL) {
4286 *errorHandler = PyCodec_LookupError(errors);
4287 if (*errorHandler == NULL)
4288 return NULL;
4289 }
4290
4291 make_translate_exception(exceptionObject,
4292 unicode, size, startpos, endpos, reason);
4293 if (*exceptionObject == NULL)
4294 return NULL;
4295
4296 restuple = PyObject_CallFunctionObjArgs(
4297 *errorHandler, *exceptionObject, NULL);
4298 if (restuple == NULL)
4299 return NULL;
4300 if (!PyTuple_Check(restuple)) {
4301 PyErr_Format(PyExc_TypeError, &argparse[4]);
4302 Py_DECREF(restuple);
4303 return NULL;
4304 }
4305 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004306 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004307 Py_DECREF(restuple);
4308 return NULL;
4309 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004310 if (i_newpos<0)
4311 *newpos = size+i_newpos;
4312 else
4313 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004314 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004315 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004316 Py_DECREF(restuple);
4317 return NULL;
4318 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_INCREF(resunicode);
4320 Py_DECREF(restuple);
4321 return resunicode;
4322}
4323
4324/* Lookup the character ch in the mapping and put the result in result,
4325 which must be decrefed by the caller.
4326 Return 0 on success, -1 on error */
4327static
4328int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4329{
4330 PyObject *w = PyInt_FromLong((long)c);
4331 PyObject *x;
4332
4333 if (w == NULL)
4334 return -1;
4335 x = PyObject_GetItem(mapping, w);
4336 Py_DECREF(w);
4337 if (x == NULL) {
4338 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4339 /* No mapping found means: use 1:1 mapping. */
4340 PyErr_Clear();
4341 *result = NULL;
4342 return 0;
4343 } else
4344 return -1;
4345 }
4346 else if (x == Py_None) {
4347 *result = x;
4348 return 0;
4349 }
4350 else if (PyInt_Check(x)) {
4351 long value = PyInt_AS_LONG(x);
4352 long max = PyUnicode_GetMax();
4353 if (value < 0 || value > max) {
4354 PyErr_Format(PyExc_TypeError,
4355 "character mapping must be in range(0x%lx)", max+1);
4356 Py_DECREF(x);
4357 return -1;
4358 }
4359 *result = x;
4360 return 0;
4361 }
4362 else if (PyUnicode_Check(x)) {
4363 *result = x;
4364 return 0;
4365 }
4366 else {
4367 /* wrong return value */
4368 PyErr_SetString(PyExc_TypeError,
4369 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004370 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 return -1;
4372 }
4373}
4374/* ensure that *outobj is at least requiredsize characters long,
4375if not reallocate and adjust various state variables.
4376Return 0 on success, -1 on error */
4377static
Walter Dörwald4894c302003-10-24 14:25:28 +00004378int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004382 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004384 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004386 if (requiredsize < 2 * oldsize)
4387 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004388 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 return -1;
4390 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 }
4392 return 0;
4393}
4394/* lookup the character, put the result in the output string and adjust
4395 various state variables. Return a new reference to the object that
4396 was put in the output buffer in *result, or Py_None, if the mapping was
4397 undefined (in which case no character was written).
4398 The called must decref result.
4399 Return 0 on success, -1 on error. */
4400static
Walter Dörwald4894c302003-10-24 14:25:28 +00004401int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004402 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004403 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404{
Walter Dörwald4894c302003-10-24 14:25:28 +00004405 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 return -1;
4407 if (*res==NULL) {
4408 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004409 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 }
4411 else if (*res==Py_None)
4412 ;
4413 else if (PyInt_Check(*res)) {
4414 /* no overflow check, because we know that the space is enough */
4415 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4416 }
4417 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004418 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 if (repsize==1) {
4420 /* no overflow check, because we know that the space is enough */
4421 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4422 }
4423 else if (repsize!=0) {
4424 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004425 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004426 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004427 repsize - 1;
4428 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 return -1;
4430 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4431 *outp += repsize;
4432 }
4433 }
4434 else
4435 return -1;
4436 return 0;
4437}
4438
4439PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 PyObject *mapping,
4442 const char *errors)
4443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 /* output object */
4445 PyObject *res = NULL;
4446 /* pointers to the beginning and end+1 of input */
4447 const Py_UNICODE *startp = p;
4448 const Py_UNICODE *endp = p + size;
4449 /* pointer into the output */
4450 Py_UNICODE *str;
4451 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004452 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 char *reason = "character maps to <undefined>";
4454 PyObject *errorHandler = NULL;
4455 PyObject *exc = NULL;
4456 /* the following variable is used for caching string comparisons
4457 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4458 * 3=ignore, 4=xmlcharrefreplace */
4459 int known_errorHandler = -1;
4460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 if (mapping == NULL) {
4462 PyErr_BadArgument();
4463 return NULL;
4464 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465
4466 /* allocate enough for a simple 1:1 translation without
4467 replacements, if we need more, we'll resize */
4468 res = PyUnicode_FromUnicode(NULL, size);
4469 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004470 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 return res;
4473 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 while (p<endp) {
4476 /* try to encode it */
4477 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004478 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 goto onError;
4481 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004482 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 if (x!=Py_None) /* it worked => adjust input pointer */
4484 ++p;
4485 else { /* untranslatable character */
4486 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004487 Py_ssize_t repsize;
4488 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 Py_UNICODE *uni2;
4490 /* startpos for collecting untranslatable chars */
4491 const Py_UNICODE *collstart = p;
4492 const Py_UNICODE *collend = p+1;
4493 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 /* find all untranslatable characters */
4496 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004497 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 goto onError;
4499 Py_XDECREF(x);
4500 if (x!=Py_None)
4501 break;
4502 ++collend;
4503 }
4504 /* cache callback name lookup
4505 * (if not done yet, i.e. it's the first error) */
4506 if (known_errorHandler==-1) {
4507 if ((errors==NULL) || (!strcmp(errors, "strict")))
4508 known_errorHandler = 1;
4509 else if (!strcmp(errors, "replace"))
4510 known_errorHandler = 2;
4511 else if (!strcmp(errors, "ignore"))
4512 known_errorHandler = 3;
4513 else if (!strcmp(errors, "xmlcharrefreplace"))
4514 known_errorHandler = 4;
4515 else
4516 known_errorHandler = 0;
4517 }
4518 switch (known_errorHandler) {
4519 case 1: /* strict */
4520 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4521 goto onError;
4522 case 2: /* replace */
4523 /* No need to check for space, this is a 1:1 replacement */
4524 for (coll = collstart; coll<collend; ++coll)
4525 *str++ = '?';
4526 /* fall through */
4527 case 3: /* ignore */
4528 p = collend;
4529 break;
4530 case 4: /* xmlcharrefreplace */
4531 /* generate replacement (temporarily (mis)uses p) */
4532 for (p = collstart; p < collend; ++p) {
4533 char buffer[2+29+1+1];
4534 char *cp;
4535 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004536 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4538 goto onError;
4539 for (cp = buffer; *cp; ++cp)
4540 *str++ = *cp;
4541 }
4542 p = collend;
4543 break;
4544 default:
4545 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4546 reason, startp, size, &exc,
4547 collstart-startp, collend-startp, &newpos);
4548 if (repunicode == NULL)
4549 goto onError;
4550 /* generate replacement */
4551 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004552 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4554 Py_DECREF(repunicode);
4555 goto onError;
4556 }
4557 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4558 *str++ = *uni2;
4559 p = startp + newpos;
4560 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 }
4562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 /* Resize if we allocated to much */
4565 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004566 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004567 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004568 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 }
4570 Py_XDECREF(exc);
4571 Py_XDECREF(errorHandler);
4572 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 onError:
4575 Py_XDECREF(res);
4576 Py_XDECREF(exc);
4577 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 return NULL;
4579}
4580
4581PyObject *PyUnicode_Translate(PyObject *str,
4582 PyObject *mapping,
4583 const char *errors)
4584{
4585 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004586
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587 str = PyUnicode_FromObject(str);
4588 if (str == NULL)
4589 goto onError;
4590 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4591 PyUnicode_GET_SIZE(str),
4592 mapping,
4593 errors);
4594 Py_DECREF(str);
4595 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004596
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 onError:
4598 Py_XDECREF(str);
4599 return NULL;
4600}
Tim Petersced69f82003-09-16 20:30:58 +00004601
Guido van Rossum9e896b32000-04-05 20:11:21 +00004602/* --- Decimal Encoder ---------------------------------------------------- */
4603
4604int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004605 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004606 char *output,
4607 const char *errors)
4608{
4609 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 PyObject *errorHandler = NULL;
4611 PyObject *exc = NULL;
4612 const char *encoding = "decimal";
4613 const char *reason = "invalid decimal Unicode string";
4614 /* the following variable is used for caching string comparisons
4615 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4616 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004617
4618 if (output == NULL) {
4619 PyErr_BadArgument();
4620 return -1;
4621 }
4622
4623 p = s;
4624 end = s + length;
4625 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004627 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004629 Py_ssize_t repsize;
4630 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 Py_UNICODE *uni2;
4632 Py_UNICODE *collstart;
4633 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004634
Guido van Rossum9e896b32000-04-05 20:11:21 +00004635 if (Py_UNICODE_ISSPACE(ch)) {
4636 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004638 continue;
4639 }
4640 decimal = Py_UNICODE_TODECIMAL(ch);
4641 if (decimal >= 0) {
4642 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004644 continue;
4645 }
Guido van Rossumba477042000-04-06 18:18:10 +00004646 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004647 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004649 continue;
4650 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 /* All other characters are considered unencodable */
4652 collstart = p;
4653 collend = p+1;
4654 while (collend < end) {
4655 if ((0 < *collend && *collend < 256) ||
4656 !Py_UNICODE_ISSPACE(*collend) ||
4657 Py_UNICODE_TODECIMAL(*collend))
4658 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004659 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 /* cache callback name lookup
4661 * (if not done yet, i.e. it's the first error) */
4662 if (known_errorHandler==-1) {
4663 if ((errors==NULL) || (!strcmp(errors, "strict")))
4664 known_errorHandler = 1;
4665 else if (!strcmp(errors, "replace"))
4666 known_errorHandler = 2;
4667 else if (!strcmp(errors, "ignore"))
4668 known_errorHandler = 3;
4669 else if (!strcmp(errors, "xmlcharrefreplace"))
4670 known_errorHandler = 4;
4671 else
4672 known_errorHandler = 0;
4673 }
4674 switch (known_errorHandler) {
4675 case 1: /* strict */
4676 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4677 goto onError;
4678 case 2: /* replace */
4679 for (p = collstart; p < collend; ++p)
4680 *output++ = '?';
4681 /* fall through */
4682 case 3: /* ignore */
4683 p = collend;
4684 break;
4685 case 4: /* xmlcharrefreplace */
4686 /* generate replacement (temporarily (mis)uses p) */
4687 for (p = collstart; p < collend; ++p)
4688 output += sprintf(output, "&#%d;", (int)*p);
4689 p = collend;
4690 break;
4691 default:
4692 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4693 encoding, reason, s, length, &exc,
4694 collstart-s, collend-s, &newpos);
4695 if (repunicode == NULL)
4696 goto onError;
4697 /* generate replacement */
4698 repsize = PyUnicode_GET_SIZE(repunicode);
4699 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4700 Py_UNICODE ch = *uni2;
4701 if (Py_UNICODE_ISSPACE(ch))
4702 *output++ = ' ';
4703 else {
4704 decimal = Py_UNICODE_TODECIMAL(ch);
4705 if (decimal >= 0)
4706 *output++ = '0' + decimal;
4707 else if (0 < ch && ch < 256)
4708 *output++ = (char)ch;
4709 else {
4710 Py_DECREF(repunicode);
4711 raise_encode_exception(&exc, encoding,
4712 s, length, collstart-s, collend-s, reason);
4713 goto onError;
4714 }
4715 }
4716 }
4717 p = s + newpos;
4718 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004719 }
4720 }
4721 /* 0-terminate the output string */
4722 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 Py_XDECREF(exc);
4724 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004725 return 0;
4726
4727 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 Py_XDECREF(exc);
4729 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004730 return -1;
4731}
4732
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733/* --- Helpers ------------------------------------------------------------ */
4734
Thomas Wouters477c8d52006-05-27 19:21:47 +00004735#define STRINGLIB_CHAR Py_UNICODE
4736
4737#define STRINGLIB_LEN PyUnicode_GET_SIZE
4738#define STRINGLIB_NEW PyUnicode_FromUnicode
4739#define STRINGLIB_STR PyUnicode_AS_UNICODE
4740
4741Py_LOCAL_INLINE(int)
4742STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004744 if (str[0] != other[0])
4745 return 1;
4746 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747}
4748
Thomas Wouters477c8d52006-05-27 19:21:47 +00004749#define STRINGLIB_EMPTY unicode_empty
4750
4751#include "stringlib/fastsearch.h"
4752
4753#include "stringlib/count.h"
4754#include "stringlib/find.h"
4755#include "stringlib/partition.h"
4756
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` that must be in scope at
   the expansion site: a negative value has (obj)->length added to it and
   is then clamped to 0; `end` is additionally capped at (obj)->length.
   `start` is not capped from above.

   The body is wrapped in do { } while (0) so the expansion is a single
   statement (safe inside an unbraced if/else); invoke it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4769
Martin v. Löwis18e16552006-02-15 17:27:45 +00004770Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004771 PyObject *substr,
4772 Py_ssize_t start,
4773 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004775 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004776 PyUnicodeObject* str_obj;
4777 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004778
Thomas Wouters477c8d52006-05-27 19:21:47 +00004779 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4780 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004782 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4783 if (!sub_obj) {
4784 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 return -1;
4786 }
Tim Petersced69f82003-09-16 20:30:58 +00004787
Thomas Wouters477c8d52006-05-27 19:21:47 +00004788 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004789
Thomas Wouters477c8d52006-05-27 19:21:47 +00004790 result = stringlib_count(
4791 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4792 );
4793
4794 Py_DECREF(sub_obj);
4795 Py_DECREF(str_obj);
4796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 return result;
4798}
4799
Martin v. Löwis18e16552006-02-15 17:27:45 +00004800Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004801 PyObject *sub,
4802 Py_ssize_t start,
4803 Py_ssize_t end,
4804 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004809 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004810 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004811 sub = PyUnicode_FromObject(sub);
4812 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004813 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004814 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 }
Tim Petersced69f82003-09-16 20:30:58 +00004816
Thomas Wouters477c8d52006-05-27 19:21:47 +00004817 if (direction > 0)
4818 result = stringlib_find_slice(
4819 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4820 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4821 start, end
4822 );
4823 else
4824 result = stringlib_rfind_slice(
4825 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4826 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4827 start, end
4828 );
4829
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004831 Py_DECREF(sub);
4832
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 return result;
4834}
4835
Tim Petersced69f82003-09-16 20:30:58 +00004836static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837int tailmatch(PyUnicodeObject *self,
4838 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004839 Py_ssize_t start,
4840 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 int direction)
4842{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 if (substring->length == 0)
4844 return 1;
4845
Thomas Wouters477c8d52006-05-27 19:21:47 +00004846 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847
4848 end -= substring->length;
4849 if (end < start)
4850 return 0;
4851
4852 if (direction > 0) {
4853 if (Py_UNICODE_MATCH(self, end, substring))
4854 return 1;
4855 } else {
4856 if (Py_UNICODE_MATCH(self, start, substring))
4857 return 1;
4858 }
4859
4860 return 0;
4861}
4862
Martin v. Löwis18e16552006-02-15 17:27:45 +00004863Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004865 Py_ssize_t start,
4866 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 int direction)
4868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004869 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004870
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 str = PyUnicode_FromObject(str);
4872 if (str == NULL)
4873 return -1;
4874 substr = PyUnicode_FromObject(substr);
4875 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004876 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 return -1;
4878 }
Tim Petersced69f82003-09-16 20:30:58 +00004879
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 result = tailmatch((PyUnicodeObject *)str,
4881 (PyUnicodeObject *)substr,
4882 start, end, direction);
4883 Py_DECREF(str);
4884 Py_DECREF(substr);
4885 return result;
4886}
4887
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888/* Apply fixfct filter to the Unicode object self and return a
4889 reference to the modified object */
4890
Tim Petersced69f82003-09-16 20:30:58 +00004891static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892PyObject *fixup(PyUnicodeObject *self,
4893 int (*fixfct)(PyUnicodeObject *s))
4894{
4895
4896 PyUnicodeObject *u;
4897
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004898 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 if (u == NULL)
4900 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004901
4902 Py_UNICODE_COPY(u->str, self->str, self->length);
4903
Tim Peters7a29bd52001-09-12 03:03:31 +00004904 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 /* fixfct should return TRUE if it modified the buffer. If
4906 FALSE, return a reference to the original buffer instead
4907 (to save space, not time) */
4908 Py_INCREF(self);
4909 Py_DECREF(u);
4910 return (PyObject*) self;
4911 }
4912 return (PyObject*) u;
4913}
4914
Tim Petersced69f82003-09-16 20:30:58 +00004915static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916int fixupper(PyUnicodeObject *self)
4917{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004918 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 Py_UNICODE *s = self->str;
4920 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004921
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 while (len-- > 0) {
4923 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004924
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 ch = Py_UNICODE_TOUPPER(*s);
4926 if (ch != *s) {
4927 status = 1;
4928 *s = ch;
4929 }
4930 s++;
4931 }
4932
4933 return status;
4934}
4935
Tim Petersced69f82003-09-16 20:30:58 +00004936static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937int fixlower(PyUnicodeObject *self)
4938{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004939 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 Py_UNICODE *s = self->str;
4941 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004942
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 while (len-- > 0) {
4944 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004945
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 ch = Py_UNICODE_TOLOWER(*s);
4947 if (ch != *s) {
4948 status = 1;
4949 *s = ch;
4950 }
4951 s++;
4952 }
4953
4954 return status;
4955}
4956
Tim Petersced69f82003-09-16 20:30:58 +00004957static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958int fixswapcase(PyUnicodeObject *self)
4959{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004960 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 Py_UNICODE *s = self->str;
4962 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004963
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 while (len-- > 0) {
4965 if (Py_UNICODE_ISUPPER(*s)) {
4966 *s = Py_UNICODE_TOLOWER(*s);
4967 status = 1;
4968 } else if (Py_UNICODE_ISLOWER(*s)) {
4969 *s = Py_UNICODE_TOUPPER(*s);
4970 status = 1;
4971 }
4972 s++;
4973 }
4974
4975 return status;
4976}
4977
Tim Petersced69f82003-09-16 20:30:58 +00004978static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979int fixcapitalize(PyUnicodeObject *self)
4980{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004981 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004982 Py_UNICODE *s = self->str;
4983 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004984
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004985 if (len == 0)
4986 return 0;
4987 if (Py_UNICODE_ISLOWER(*s)) {
4988 *s = Py_UNICODE_TOUPPER(*s);
4989 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004991 s++;
4992 while (--len > 0) {
4993 if (Py_UNICODE_ISUPPER(*s)) {
4994 *s = Py_UNICODE_TOLOWER(*s);
4995 status = 1;
4996 }
4997 s++;
4998 }
4999 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000}
5001
5002static
5003int fixtitle(PyUnicodeObject *self)
5004{
5005 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5006 register Py_UNICODE *e;
5007 int previous_is_cased;
5008
5009 /* Shortcut for single character strings */
5010 if (PyUnicode_GET_SIZE(self) == 1) {
5011 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5012 if (*p != ch) {
5013 *p = ch;
5014 return 1;
5015 }
5016 else
5017 return 0;
5018 }
Tim Petersced69f82003-09-16 20:30:58 +00005019
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 e = p + PyUnicode_GET_SIZE(self);
5021 previous_is_cased = 0;
5022 for (; p < e; p++) {
5023 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005024
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 if (previous_is_cased)
5026 *p = Py_UNICODE_TOLOWER(ch);
5027 else
5028 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005029
5030 if (Py_UNICODE_ISLOWER(ch) ||
5031 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 Py_UNICODE_ISTITLE(ch))
5033 previous_is_cased = 1;
5034 else
5035 previous_is_cased = 0;
5036 }
5037 return 1;
5038}
5039
Tim Peters8ce9f162004-08-27 01:49:32 +00005040PyObject *
5041PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042{
Tim Peters8ce9f162004-08-27 01:49:32 +00005043 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005044 const Py_UNICODE blank = ' ';
5045 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005046 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005047 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005048 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5049 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005050 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5051 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005052 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005053 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005054 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055
Tim Peters05eba1f2004-08-27 21:32:02 +00005056 fseq = PySequence_Fast(seq, "");
5057 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005058 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005059 }
5060
Tim Peters91879ab2004-08-27 22:35:44 +00005061 /* Grrrr. A codec may be invoked to convert str objects to
5062 * Unicode, and so it's possible to call back into Python code
5063 * during PyUnicode_FromObject(), and so it's possible for a sick
5064 * codec to change the size of fseq (if seq is a list). Therefore
5065 * we have to keep refetching the size -- can't assume seqlen
5066 * is invariant.
5067 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005068 seqlen = PySequence_Fast_GET_SIZE(fseq);
5069 /* If empty sequence, return u"". */
5070 if (seqlen == 0) {
5071 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5072 goto Done;
5073 }
5074 /* If singleton sequence with an exact Unicode, return that. */
5075 if (seqlen == 1) {
5076 item = PySequence_Fast_GET_ITEM(fseq, 0);
5077 if (PyUnicode_CheckExact(item)) {
5078 Py_INCREF(item);
5079 res = (PyUnicodeObject *)item;
5080 goto Done;
5081 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005082 }
5083
Tim Peters05eba1f2004-08-27 21:32:02 +00005084 /* At least two items to join, or one that isn't exact Unicode. */
5085 if (seqlen > 1) {
5086 /* Set up sep and seplen -- they're needed. */
5087 if (separator == NULL) {
5088 sep = &blank;
5089 seplen = 1;
5090 }
5091 else {
5092 internal_separator = PyUnicode_FromObject(separator);
5093 if (internal_separator == NULL)
5094 goto onError;
5095 sep = PyUnicode_AS_UNICODE(internal_separator);
5096 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005097 /* In case PyUnicode_FromObject() mutated seq. */
5098 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005099 }
5100 }
5101
5102 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005103 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005104 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005105 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005106 res_p = PyUnicode_AS_UNICODE(res);
5107 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005108
Tim Peters05eba1f2004-08-27 21:32:02 +00005109 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005110 Py_ssize_t itemlen;
5111 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005112
5113 item = PySequence_Fast_GET_ITEM(fseq, i);
5114 /* Convert item to Unicode. */
5115 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5116 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005117 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005118 " %.80s found",
5119 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005120 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005121 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005122 item = PyUnicode_FromObject(item);
5123 if (item == NULL)
5124 goto onError;
5125 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005126
Tim Peters91879ab2004-08-27 22:35:44 +00005127 /* In case PyUnicode_FromObject() mutated seq. */
5128 seqlen = PySequence_Fast_GET_SIZE(fseq);
5129
Tim Peters8ce9f162004-08-27 01:49:32 +00005130 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005132 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005133 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005134 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005135 if (i < seqlen - 1) {
5136 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005137 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005138 goto Overflow;
5139 }
5140 if (new_res_used > res_alloc) {
5141 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005142 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005143 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005144 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005145 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005146 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005147 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005148 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005150 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005151 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005153
5154 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005155 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005156 res_p += itemlen;
5157 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005158 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005159 res_p += seplen;
5160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005162 res_used = new_res_used;
5163 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005164
Tim Peters05eba1f2004-08-27 21:32:02 +00005165 /* Shrink res to match the used area; this probably can't fail,
5166 * but it's cheap to check.
5167 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005168 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005169 goto onError;
5170
5171 Done:
5172 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005173 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 return (PyObject *)res;
5175
Tim Peters8ce9f162004-08-27 01:49:32 +00005176 Overflow:
5177 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005178 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005179 Py_DECREF(item);
5180 /* fall through */
5181
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005183 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005184 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005185 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 return NULL;
5187}
5188
Tim Petersced69f82003-09-16 20:30:58 +00005189static
5190PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005191 Py_ssize_t left,
5192 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 Py_UNICODE fill)
5194{
5195 PyUnicodeObject *u;
5196
5197 if (left < 0)
5198 left = 0;
5199 if (right < 0)
5200 right = 0;
5201
Tim Peters7a29bd52001-09-12 03:03:31 +00005202 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 Py_INCREF(self);
5204 return self;
5205 }
5206
5207 u = _PyUnicode_New(left + self->length + right);
5208 if (u) {
5209 if (left)
5210 Py_UNICODE_FILL(u->str, fill, left);
5211 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5212 if (right)
5213 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5214 }
5215
5216 return u;
5217}
5218
/* Append the substring data[left:right] to `list` as a new unicode
   object.  Relies on the expansion site providing a `PyObject *str`
   scratch variable, the target `list`, and an `onError` label, which is
   jumped to if creating or appending the substring fails.

   Wrapped in do { } while (0) so the expansion is a single statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5229
5230static
5231PyObject *split_whitespace(PyUnicodeObject *self,
5232 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005233 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 register Py_ssize_t i;
5236 register Py_ssize_t j;
5237 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 PyObject *str;
5239
5240 for (i = j = 0; i < len; ) {
5241 /* find a token */
5242 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5243 i++;
5244 j = i;
5245 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5246 i++;
5247 if (j < i) {
5248 if (maxcount-- <= 0)
5249 break;
5250 SPLIT_APPEND(self->str, j, i);
5251 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5252 i++;
5253 j = i;
5254 }
5255 }
5256 if (j < len) {
5257 SPLIT_APPEND(self->str, j, len);
5258 }
5259 return list;
5260
5261 onError:
5262 Py_DECREF(list);
5263 return NULL;
5264}
5265
5266PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005267 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005269 register Py_ssize_t i;
5270 register Py_ssize_t j;
5271 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 PyObject *list;
5273 PyObject *str;
5274 Py_UNICODE *data;
5275
5276 string = PyUnicode_FromObject(string);
5277 if (string == NULL)
5278 return NULL;
5279 data = PyUnicode_AS_UNICODE(string);
5280 len = PyUnicode_GET_SIZE(string);
5281
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 list = PyList_New(0);
5283 if (!list)
5284 goto onError;
5285
5286 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005287 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005290 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
5293 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005294 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 if (i < len) {
5296 if (data[i] == '\r' && i + 1 < len &&
5297 data[i+1] == '\n')
5298 i += 2;
5299 else
5300 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005301 if (keepends)
5302 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 }
Guido van Rossum86662912000-04-11 15:38:46 +00005304 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 j = i;
5306 }
5307 if (j < len) {
5308 SPLIT_APPEND(data, j, len);
5309 }
5310
5311 Py_DECREF(string);
5312 return list;
5313
5314 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005315 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 Py_DECREF(string);
5317 return NULL;
5318}
5319
Tim Petersced69f82003-09-16 20:30:58 +00005320static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321PyObject *split_char(PyUnicodeObject *self,
5322 PyObject *list,
5323 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005324 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326 register Py_ssize_t i;
5327 register Py_ssize_t j;
5328 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 PyObject *str;
5330
5331 for (i = j = 0; i < len; ) {
5332 if (self->str[i] == ch) {
5333 if (maxcount-- <= 0)
5334 break;
5335 SPLIT_APPEND(self->str, j, i);
5336 i = j = i + 1;
5337 } else
5338 i++;
5339 }
5340 if (j <= len) {
5341 SPLIT_APPEND(self->str, j, len);
5342 }
5343 return list;
5344
5345 onError:
5346 Py_DECREF(list);
5347 return NULL;
5348}
5349
Tim Petersced69f82003-09-16 20:30:58 +00005350static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351PyObject *split_substring(PyUnicodeObject *self,
5352 PyObject *list,
5353 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005356 register Py_ssize_t i;
5357 register Py_ssize_t j;
5358 Py_ssize_t len = self->length;
5359 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 PyObject *str;
5361
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005362 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 if (Py_UNICODE_MATCH(self, i, substring)) {
5364 if (maxcount-- <= 0)
5365 break;
5366 SPLIT_APPEND(self->str, j, i);
5367 i = j = i + sublen;
5368 } else
5369 i++;
5370 }
5371 if (j <= len) {
5372 SPLIT_APPEND(self->str, j, len);
5373 }
5374 return list;
5375
5376 onError:
5377 Py_DECREF(list);
5378 return NULL;
5379}
5380
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005381static
5382PyObject *rsplit_whitespace(PyUnicodeObject *self,
5383 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005384 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005385{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386 register Py_ssize_t i;
5387 register Py_ssize_t j;
5388 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005389 PyObject *str;
5390
5391 for (i = j = len - 1; i >= 0; ) {
5392 /* find a token */
5393 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5394 i--;
5395 j = i;
5396 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5397 i--;
5398 if (j > i) {
5399 if (maxcount-- <= 0)
5400 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005401 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005402 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5403 i--;
5404 j = i;
5405 }
5406 }
5407 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005408 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005409 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 if (PyList_Reverse(list) < 0)
5411 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005412 return list;
5413
5414 onError:
5415 Py_DECREF(list);
5416 return NULL;
5417}
5418
5419static
5420PyObject *rsplit_char(PyUnicodeObject *self,
5421 PyObject *list,
5422 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005423 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005425 register Py_ssize_t i;
5426 register Py_ssize_t j;
5427 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005428 PyObject *str;
5429
5430 for (i = j = len - 1; i >= 0; ) {
5431 if (self->str[i] == ch) {
5432 if (maxcount-- <= 0)
5433 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005434 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005435 j = i = i - 1;
5436 } else
5437 i--;
5438 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005439 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005440 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005441 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005442 if (PyList_Reverse(list) < 0)
5443 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005444 return list;
5445
5446 onError:
5447 Py_DECREF(list);
5448 return NULL;
5449}
5450
5451static
5452PyObject *rsplit_substring(PyUnicodeObject *self,
5453 PyObject *list,
5454 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005455 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005456{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005457 register Py_ssize_t i;
5458 register Py_ssize_t j;
5459 Py_ssize_t len = self->length;
5460 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005461 PyObject *str;
5462
5463 for (i = len - sublen, j = len; i >= 0; ) {
5464 if (Py_UNICODE_MATCH(self, i, substring)) {
5465 if (maxcount-- <= 0)
5466 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005467 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005468 j = i;
5469 i -= sublen;
5470 } else
5471 i--;
5472 }
5473 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005474 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005475 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005476 if (PyList_Reverse(list) < 0)
5477 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005478 return list;
5479
5480 onError:
5481 Py_DECREF(list);
5482 return NULL;
5483}
5484
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485#undef SPLIT_APPEND
5486
5487static
5488PyObject *split(PyUnicodeObject *self,
5489 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005490 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491{
5492 PyObject *list;
5493
5494 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005495 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496
5497 list = PyList_New(0);
5498 if (!list)
5499 return NULL;
5500
5501 if (substring == NULL)
5502 return split_whitespace(self,list,maxcount);
5503
5504 else if (substring->length == 1)
5505 return split_char(self,list,substring->str[0],maxcount);
5506
5507 else if (substring->length == 0) {
5508 Py_DECREF(list);
5509 PyErr_SetString(PyExc_ValueError, "empty separator");
5510 return NULL;
5511 }
5512 else
5513 return split_substring(self,list,substring,maxcount);
5514}
5515
Tim Petersced69f82003-09-16 20:30:58 +00005516static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005517PyObject *rsplit(PyUnicodeObject *self,
5518 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005519 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005520{
5521 PyObject *list;
5522
5523 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005524 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005525
5526 list = PyList_New(0);
5527 if (!list)
5528 return NULL;
5529
5530 if (substring == NULL)
5531 return rsplit_whitespace(self,list,maxcount);
5532
5533 else if (substring->length == 1)
5534 return rsplit_char(self,list,substring->str[0],maxcount);
5535
5536 else if (substring->length == 0) {
5537 Py_DECREF(list);
5538 PyErr_SetString(PyExc_ValueError, "empty separator");
5539 return NULL;
5540 }
5541 else
5542 return rsplit_substring(self,list,substring,maxcount);
5543}
5544
5545static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546PyObject *replace(PyUnicodeObject *self,
5547 PyUnicodeObject *str1,
5548 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005549 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
5551 PyUnicodeObject *u;
5552
5553 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005554 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Thomas Wouters477c8d52006-05-27 19:21:47 +00005556 if (str1->length == str2->length) {
5557 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005558 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005559 if (str1->length == 1) {
5560 /* replace characters */
5561 Py_UNICODE u1, u2;
5562 if (!findchar(self->str, self->length, str1->str[0]))
5563 goto nothing;
5564 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5565 if (!u)
5566 return NULL;
5567 Py_UNICODE_COPY(u->str, self->str, self->length);
5568 u1 = str1->str[0];
5569 u2 = str2->str[0];
5570 for (i = 0; i < u->length; i++)
5571 if (u->str[i] == u1) {
5572 if (--maxcount < 0)
5573 break;
5574 u->str[i] = u2;
5575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005577 i = fastsearch(
5578 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005580 if (i < 0)
5581 goto nothing;
5582 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5583 if (!u)
5584 return NULL;
5585 Py_UNICODE_COPY(u->str, self->str, self->length);
5586 while (i <= self->length - str1->length)
5587 if (Py_UNICODE_MATCH(self, i, str1)) {
5588 if (--maxcount < 0)
5589 break;
5590 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5591 i += str1->length;
5592 } else
5593 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005596
5597 Py_ssize_t n, i, j, e;
5598 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 Py_UNICODE *p;
5600
5601 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005602 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 if (n > maxcount)
5604 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005605 if (n == 0)
5606 goto nothing;
5607 /* new_size = self->length + n * (str2->length - str1->length)); */
5608 delta = (str2->length - str1->length);
5609 if (delta == 0) {
5610 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005612 product = n * (str2->length - str1->length);
5613 if ((product / (str2->length - str1->length)) != n) {
5614 PyErr_SetString(PyExc_OverflowError,
5615 "replace string is too long");
5616 return NULL;
5617 }
5618 new_size = self->length + product;
5619 if (new_size < 0) {
5620 PyErr_SetString(PyExc_OverflowError,
5621 "replace string is too long");
5622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 }
5624 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005625 u = _PyUnicode_New(new_size);
5626 if (!u)
5627 return NULL;
5628 i = 0;
5629 p = u->str;
5630 e = self->length - str1->length;
5631 if (str1->length > 0) {
5632 while (n-- > 0) {
5633 /* look for next match */
5634 j = i;
5635 while (j <= e) {
5636 if (Py_UNICODE_MATCH(self, j, str1))
5637 break;
5638 j++;
5639 }
5640 if (j > i) {
5641 if (j > e)
5642 break;
5643 /* copy unchanged part [i:j] */
5644 Py_UNICODE_COPY(p, self->str+i, j-i);
5645 p += j - i;
5646 }
5647 /* copy substitution string */
5648 if (str2->length > 0) {
5649 Py_UNICODE_COPY(p, str2->str, str2->length);
5650 p += str2->length;
5651 }
5652 i = j + str1->length;
5653 }
5654 if (i < self->length)
5655 /* copy tail [i:] */
5656 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5657 } else {
5658 /* interleave */
5659 while (n > 0) {
5660 Py_UNICODE_COPY(p, str2->str, str2->length);
5661 p += str2->length;
5662 if (--n <= 0)
5663 break;
5664 *p++ = self->str[i++];
5665 }
5666 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005670
5671nothing:
5672 /* nothing to replace; return original string (when possible) */
5673 if (PyUnicode_CheckExact(self)) {
5674 Py_INCREF(self);
5675 return (PyObject *) self;
5676 }
5677 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678}
5679
5680/* --- Unicode Object Methods --------------------------------------------- */
5681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005682PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683"S.title() -> unicode\n\
5684\n\
5685Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005686characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
5688static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005689unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 return fixup(self, fixtitle);
5692}
5693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005694PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695"S.capitalize() -> unicode\n\
5696\n\
5697Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005698have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
5700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005701unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 return fixup(self, fixcapitalize);
5704}
5705
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords(): split on
   whitespace, capitalize every word in place, then join the words.
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx), fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5743
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005744/* Argument converter. Coerces to a single unicode character */
5745
5746static int
5747convert_uc(PyObject *obj, void *addr)
5748{
5749 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5750 PyObject *uniobj;
5751 Py_UNICODE *unistr;
5752
5753 uniobj = PyUnicode_FromObject(obj);
5754 if (uniobj == NULL) {
5755 PyErr_SetString(PyExc_TypeError,
5756 "The fill character cannot be converted to Unicode");
5757 return 0;
5758 }
5759 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5760 PyErr_SetString(PyExc_TypeError,
5761 "The fill character must be exactly one character long");
5762 Py_DECREF(uniobj);
5763 return 0;
5764 }
5765 unistr = PyUnicode_AS_UNICODE(uniobj);
5766 *fillcharloc = unistr[0];
5767 Py_DECREF(uniobj);
5768 return 1;
5769}
5770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005771PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005772"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005774Return S centered in a Unicode string of length width. Padding is\n\
5775done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776
5777static PyObject *
5778unicode_center(PyUnicodeObject *self, PyObject *args)
5779{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005780 Py_ssize_t marg, left;
5781 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005782 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
Thomas Woutersde017742006-02-16 19:34:37 +00005784 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 return NULL;
5786
Tim Peters7a29bd52001-09-12 03:03:31 +00005787 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 Py_INCREF(self);
5789 return (PyObject*) self;
5790 }
5791
5792 marg = width - self->length;
5793 left = marg / 2 + (marg & width & 1);
5794
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005795 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796}
5797
Marc-André Lemburge5034372000-08-08 08:04:29 +00005798#if 0
5799
5800/* This code should go into some future Unicode collation support
5801 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005802 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005803
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005804/* speedy UTF-16 code point order comparison */
5805/* gleaned from: */
5806/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5807
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005808static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005809{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005810 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005811 0, 0, 0, 0, 0, 0, 0, 0,
5812 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005813 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005814};
5815
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816static int
5817unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5818{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005819 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005820
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 Py_UNICODE *s1 = str1->str;
5822 Py_UNICODE *s2 = str2->str;
5823
5824 len1 = str1->length;
5825 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005826
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005828 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005829
5830 c1 = *s1++;
5831 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005832
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005833 if (c1 > (1<<11) * 26)
5834 c1 += utf16Fixup[c1>>11];
5835 if (c2 > (1<<11) * 26)
5836 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005837 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005838
5839 if (c1 != c2)
5840 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005841
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005842 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 }
5844
5845 return (len1 < len2) ? -1 : (len1 != len2);
5846}
5847
Marc-André Lemburge5034372000-08-08 08:04:29 +00005848#else
5849
5850static int
5851unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005853 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005854
5855 Py_UNICODE *s1 = str1->str;
5856 Py_UNICODE *s2 = str2->str;
5857
5858 len1 = str1->length;
5859 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005860
Marc-André Lemburge5034372000-08-08 08:04:29 +00005861 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005862 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005863
Fredrik Lundh45714e92001-06-26 16:39:36 +00005864 c1 = *s1++;
5865 c2 = *s2++;
5866
5867 if (c1 != c2)
5868 return (c1 < c2) ? -1 : 1;
5869
Marc-André Lemburge5034372000-08-08 08:04:29 +00005870 len1--; len2--;
5871 }
5872
5873 return (len1 < len2) ? -1 : (len1 != len2);
5874}
5875
5876#endif
5877
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878int PyUnicode_Compare(PyObject *left,
5879 PyObject *right)
5880{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005881 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5882 return unicode_compare((PyUnicodeObject *)left,
5883 (PyUnicodeObject *)right);
5884 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5885 (PyUnicode_Check(left) && PyString_Check(right))) {
5886 if (PyUnicode_Check(left))
5887 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5888 if (PyUnicode_Check(right))
5889 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5890 assert(PyString_Check(left));
5891 assert(PyString_Check(right));
5892 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005894 PyErr_Format(PyExc_TypeError,
5895 "Can't compare %.100s and %.100s",
5896 left->ob_type->tp_name,
5897 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 return -1;
5899}
5900
Martin v. Löwis5b222132007-06-10 09:51:05 +00005901int
5902PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5903{
5904 int i;
5905 Py_UNICODE *id;
5906 assert(PyUnicode_Check(uni));
5907 id = PyUnicode_AS_UNICODE(uni);
5908 /* Compare Unicode string and source character set string */
5909 for (i = 0; id[i] && str[i]; i++)
5910 if (id[i] != str[i])
5911 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5912 if (id[i])
5913 return 1; /* uni is longer */
5914 if (str[i])
5915 return -1; /* str is longer */
5916 return 0;
5917}
5918
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005919PyObject *PyUnicode_RichCompare(PyObject *left,
5920 PyObject *right,
5921 int op)
5922{
5923 int result;
5924
5925 result = PyUnicode_Compare(left, right);
5926 if (result == -1 && PyErr_Occurred())
5927 goto onError;
5928
5929 /* Convert the return value to a Boolean */
5930 switch (op) {
5931 case Py_EQ:
5932 result = (result == 0);
5933 break;
5934 case Py_NE:
5935 result = (result != 0);
5936 break;
5937 case Py_LE:
5938 result = (result <= 0);
5939 break;
5940 case Py_GE:
5941 result = (result >= 0);
5942 break;
5943 case Py_LT:
5944 result = (result == -1);
5945 break;
5946 case Py_GT:
5947 result = (result == 1);
5948 break;
5949 }
5950 return PyBool_FromLong(result);
5951
5952 onError:
5953
5954 /* Standard case
5955
5956 Type errors mean that PyUnicode_FromObject() could not convert
5957 one of the arguments (usually the right hand side) to Unicode,
5958 ie. we can't handle the comparison request. However, it is
5959 possible that the other object knows a comparison method, which
5960 is why we return Py_NotImplemented to give the other object a
5961 chance.
5962
5963 */
5964 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5965 PyErr_Clear();
5966 Py_INCREF(Py_NotImplemented);
5967 return Py_NotImplemented;
5968 }
5969 if (op != Py_EQ && op != Py_NE)
5970 return NULL;
5971
5972 /* Equality comparison.
5973
5974 This is a special case: we silence any PyExc_UnicodeDecodeError
5975 and instead turn it into a PyErr_UnicodeWarning.
5976
5977 */
5978 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5979 return NULL;
5980 PyErr_Clear();
5981 if (PyErr_Warn(PyExc_UnicodeWarning,
5982 (op == Py_EQ) ?
5983 "Unicode equal comparison "
5984 "failed to convert both arguments to Unicode - "
5985 "interpreting them as being unequal" :
5986 "Unicode unequal comparison "
5987 "failed to convert both arguments to Unicode - "
5988 "interpreting them as being unequal"
5989 ) < 0)
5990 return NULL;
5991 result = (op == Py_NE);
5992 return PyBool_FromLong(result);
5993}
5994
Guido van Rossum403d68b2000-03-13 15:55:09 +00005995int PyUnicode_Contains(PyObject *container,
5996 PyObject *element)
5997{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005998 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005999 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006000
6001 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006002 sub = PyUnicode_FromObject(element);
6003 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006004 PyErr_Format(PyExc_TypeError,
6005 "'in <string>' requires string as left operand, not %s",
6006 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006007 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006008 }
6009
Thomas Wouters477c8d52006-05-27 19:21:47 +00006010 str = PyUnicode_FromObject(container);
6011 if (!str) {
6012 Py_DECREF(sub);
6013 return -1;
6014 }
6015
6016 result = stringlib_contains_obj(str, sub);
6017
6018 Py_DECREF(str);
6019 Py_DECREF(sub);
6020
Guido van Rossum403d68b2000-03-13 15:55:09 +00006021 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006022}
6023
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024/* Concat to string or Unicode object giving a new Unicode object. */
6025
6026PyObject *PyUnicode_Concat(PyObject *left,
6027 PyObject *right)
6028{
6029 PyUnicodeObject *u = NULL, *v = NULL, *w;
6030
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006031 if (PyBytes_Check(left) || PyBytes_Check(right))
6032 return PyBytes_Concat(left, right);
6033
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 /* Coerce the two arguments */
6035 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6036 if (u == NULL)
6037 goto onError;
6038 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6039 if (v == NULL)
6040 goto onError;
6041
6042 /* Shortcuts */
6043 if (v == unicode_empty) {
6044 Py_DECREF(v);
6045 return (PyObject *)u;
6046 }
6047 if (u == unicode_empty) {
6048 Py_DECREF(u);
6049 return (PyObject *)v;
6050 }
6051
6052 /* Concat the two Unicode strings */
6053 w = _PyUnicode_New(u->length + v->length);
6054 if (w == NULL)
6055 goto onError;
6056 Py_UNICODE_COPY(w->str, u->str, u->length);
6057 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6058
6059 Py_DECREF(u);
6060 Py_DECREF(v);
6061 return (PyObject *)w;
6062
6063onError:
6064 Py_XDECREF(u);
6065 Py_XDECREF(v);
6066 return NULL;
6067}
6068
Walter Dörwald1ab83302007-05-18 17:15:44 +00006069void
6070PyUnicode_Append(PyObject **pleft, PyObject *right)
6071{
6072 PyObject *new;
6073 if (*pleft == NULL)
6074 return;
6075 if (right == NULL || !PyUnicode_Check(*pleft)) {
6076 Py_DECREF(*pleft);
6077 *pleft = NULL;
6078 return;
6079 }
6080 new = PyUnicode_Concat(*pleft, right);
6081 Py_DECREF(*pleft);
6082 *pleft = new;
6083}
6084
6085void
6086PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6087{
6088 PyUnicode_Append(pleft, right);
6089 Py_XDECREF(right);
6090}
6091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006092PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093"S.count(sub[, start[, end]]) -> int\n\
6094\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006095Return the number of non-overlapping occurrences of substring sub in\n\
6096Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006097interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
6099static PyObject *
6100unicode_count(PyUnicodeObject *self, PyObject *args)
6101{
6102 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006103 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006104 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 PyObject *result;
6106
Guido van Rossumb8872e62000-05-09 14:14:27 +00006107 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6108 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 return NULL;
6110
6111 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006112 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 if (substring == NULL)
6114 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006115
Thomas Wouters477c8d52006-05-27 19:21:47 +00006116 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117
Thomas Wouters477c8d52006-05-27 19:21:47 +00006118 result = PyInt_FromSsize_t(
6119 stringlib_count(self->str + start, end - start,
6120 substring->str, substring->length)
6121 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122
6123 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006124
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 return result;
6126}
6127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006128PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006129"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006131Encodes S using the codec registered for encoding. encoding defaults\n\
6132to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006133handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6135'xmlcharrefreplace' as well as any other name registered with\n\
6136codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
6138static PyObject *
6139unicode_encode(PyUnicodeObject *self, PyObject *args)
6140{
6141 char *encoding = NULL;
6142 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006143 PyObject *v;
6144
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6146 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006147 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006148 if (v == NULL)
6149 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006150 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006151 if (PyString_Check(v)) {
6152 /* Old codec, turn it into bytes */
6153 PyObject *b = PyBytes_FromObject(v);
6154 Py_DECREF(v);
6155 return b;
6156 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006157 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006158 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006159 "(type=%.400s)",
6160 v->ob_type->tp_name);
6161 Py_DECREF(v);
6162 return NULL;
6163 }
6164 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006165
6166 onError:
6167 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006168}
6169
6170PyDoc_STRVAR(decode__doc__,
6171"S.decode([encoding[,errors]]) -> string or unicode\n\
6172\n\
6173Decodes S using the codec registered for encoding. encoding defaults\n\
6174to the default encoding. errors may be given to set a different error\n\
6175handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6176a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6177as well as any other name registerd with codecs.register_error that is\n\
6178able to handle UnicodeDecodeErrors.");
6179
6180static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006181unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006182{
6183 char *encoding = NULL;
6184 char *errors = NULL;
6185 PyObject *v;
6186
6187 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6188 return NULL;
6189 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006190 if (v == NULL)
6191 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006192 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6193 PyErr_Format(PyExc_TypeError,
6194 "decoder did not return a string/unicode object "
6195 "(type=%.400s)",
6196 v->ob_type->tp_name);
6197 Py_DECREF(v);
6198 return NULL;
6199 }
6200 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006201
6202 onError:
6203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204}
6205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006206PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207"S.expandtabs([tabsize]) -> unicode\n\
6208\n\
6209Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006210If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
6212static PyObject*
6213unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6214{
6215 Py_UNICODE *e;
6216 Py_UNICODE *p;
6217 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006218 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 PyUnicodeObject *u;
6220 int tabsize = 8;
6221
6222 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6223 return NULL;
6224
Thomas Wouters7e474022000-07-16 12:04:32 +00006225 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 i = j = 0;
6227 e = self->str + self->length;
6228 for (p = self->str; p < e; p++)
6229 if (*p == '\t') {
6230 if (tabsize > 0)
6231 j += tabsize - (j % tabsize);
6232 }
6233 else {
6234 j++;
6235 if (*p == '\n' || *p == '\r') {
6236 i += j;
6237 j = 0;
6238 }
6239 }
6240
6241 /* Second pass: create output string and fill it */
6242 u = _PyUnicode_New(i + j);
6243 if (!u)
6244 return NULL;
6245
6246 j = 0;
6247 q = u->str;
6248
6249 for (p = self->str; p < e; p++)
6250 if (*p == '\t') {
6251 if (tabsize > 0) {
6252 i = tabsize - (j % tabsize);
6253 j += i;
6254 while (i--)
6255 *q++ = ' ';
6256 }
6257 }
6258 else {
6259 j++;
6260 *q++ = *p;
6261 if (*p == '\n' || *p == '\r')
6262 j = 0;
6263 }
6264
6265 return (PyObject*) u;
6266}
6267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006268PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269"S.find(sub [,start [,end]]) -> int\n\
6270\n\
6271Return the lowest index in S where substring sub is found,\n\
6272such that sub is contained within s[start,end]. Optional\n\
6273arguments start and end are interpreted as in slice notation.\n\
6274\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006275Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276
6277static PyObject *
6278unicode_find(PyUnicodeObject *self, PyObject *args)
6279{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006280 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006281 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006282 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006283 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284
Guido van Rossumb8872e62000-05-09 14:14:27 +00006285 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6286 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006288 substring = PyUnicode_FromObject(substring);
6289 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 return NULL;
6291
Thomas Wouters477c8d52006-05-27 19:21:47 +00006292 result = stringlib_find_slice(
6293 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6294 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6295 start, end
6296 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297
6298 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006299
6300 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301}
6302
6303static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006304unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305{
6306 if (index < 0 || index >= self->length) {
6307 PyErr_SetString(PyExc_IndexError, "string index out of range");
6308 return NULL;
6309 }
6310
6311 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6312}
6313
6314static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006315unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006317 /* Since Unicode objects compare equal to their UTF-8 string
6318 counterparts, we hash the UTF-8 string. */
6319 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6320 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321}
6322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006323PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324"S.index(sub [,start [,end]]) -> int\n\
6325\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006326Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
6328static PyObject *
6329unicode_index(PyUnicodeObject *self, PyObject *args)
6330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006331 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006332 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006333 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006334 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335
Guido van Rossumb8872e62000-05-09 14:14:27 +00006336 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6337 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006339 substring = PyUnicode_FromObject(substring);
6340 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 return NULL;
6342
Thomas Wouters477c8d52006-05-27 19:21:47 +00006343 result = stringlib_find_slice(
6344 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6345 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6346 start, end
6347 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348
6349 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006350
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 if (result < 0) {
6352 PyErr_SetString(PyExc_ValueError, "substring not found");
6353 return NULL;
6354 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006355
Martin v. Löwis18e16552006-02-15 17:27:45 +00006356 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357}
6358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006359PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006360"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006362Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006363at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364
6365static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006366unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367{
6368 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6369 register const Py_UNICODE *e;
6370 int cased;
6371
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 /* Shortcut for single character strings */
6373 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006374 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006376 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006377 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006378 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006379
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 e = p + PyUnicode_GET_SIZE(self);
6381 cased = 0;
6382 for (; p < e; p++) {
6383 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006384
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006386 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 else if (!cased && Py_UNICODE_ISLOWER(ch))
6388 cased = 1;
6389 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006390 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391}
6392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006393PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006394"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006396Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006397at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398
6399static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006400unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401{
6402 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6403 register const Py_UNICODE *e;
6404 int cased;
6405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 /* Shortcut for single character strings */
6407 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006408 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006410 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006411 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006412 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006413
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 e = p + PyUnicode_GET_SIZE(self);
6415 cased = 0;
6416 for (; p < e; p++) {
6417 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006418
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 else if (!cased && Py_UNICODE_ISUPPER(ch))
6422 cased = 1;
6423 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006424 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425}
6426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006428"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006430Return True if S is a titlecased string and there is at least one\n\
6431character in S, i.e. upper- and titlecase characters may only\n\
6432follow uncased characters and lowercase characters only cased ones.\n\
6433Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434
6435static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006436unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437{
6438 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6439 register const Py_UNICODE *e;
6440 int cased, previous_is_cased;
6441
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 /* Shortcut for single character strings */
6443 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006444 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6445 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006447 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006448 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006449 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006450
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 e = p + PyUnicode_GET_SIZE(self);
6452 cased = 0;
6453 previous_is_cased = 0;
6454 for (; p < e; p++) {
6455 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6458 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006459 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 previous_is_cased = 1;
6461 cased = 1;
6462 }
6463 else if (Py_UNICODE_ISLOWER(ch)) {
6464 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006465 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 previous_is_cased = 1;
6467 cased = 1;
6468 }
6469 else
6470 previous_is_cased = 0;
6471 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006472 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473}
6474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006475PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006476"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006478Return True if all characters in S are whitespace\n\
6479and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480
6481static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006482unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483{
6484 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6485 register const Py_UNICODE *e;
6486
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 /* Shortcut for single character strings */
6488 if (PyUnicode_GET_SIZE(self) == 1 &&
6489 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006490 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006492 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006493 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006494 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006495
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 e = p + PyUnicode_GET_SIZE(self);
6497 for (; p < e; p++) {
6498 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006499 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006501 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502}
6503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006504PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006505"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006506\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006507Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006508and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006509
6510static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006511unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006512{
6513 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6514 register const Py_UNICODE *e;
6515
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006516 /* Shortcut for single character strings */
6517 if (PyUnicode_GET_SIZE(self) == 1 &&
6518 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006519 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006520
6521 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006522 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006523 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006524
6525 e = p + PyUnicode_GET_SIZE(self);
6526 for (; p < e; p++) {
6527 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006528 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006529 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006530 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006531}
6532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006533PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006534"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006535\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006536Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006537and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006538
6539static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006540unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006541{
6542 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6543 register const Py_UNICODE *e;
6544
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006545 /* Shortcut for single character strings */
6546 if (PyUnicode_GET_SIZE(self) == 1 &&
6547 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006548 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006549
6550 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006551 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006552 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006553
6554 e = p + PyUnicode_GET_SIZE(self);
6555 for (; p < e; p++) {
6556 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006557 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006558 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006559 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006560}
6561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006562PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006563"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006565Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006566False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
6568static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006569unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570{
6571 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6572 register const Py_UNICODE *e;
6573
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 /* Shortcut for single character strings */
6575 if (PyUnicode_GET_SIZE(self) == 1 &&
6576 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006577 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006579 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006580 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006581 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006582
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 e = p + PyUnicode_GET_SIZE(self);
6584 for (; p < e; p++) {
6585 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006586 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006588 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589}
6590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006591PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006592"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006594Return True if all characters in S are digits\n\
6595and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
6597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006598unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599{
6600 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6601 register const Py_UNICODE *e;
6602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 /* Shortcut for single character strings */
6604 if (PyUnicode_GET_SIZE(self) == 1 &&
6605 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006606 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006608 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006609 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006610 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006611
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 e = p + PyUnicode_GET_SIZE(self);
6613 for (; p < e; p++) {
6614 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006615 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006617 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618}
6619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006620PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006621"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006623Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006624False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625
6626static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006627unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
6629 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6630 register const Py_UNICODE *e;
6631
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 /* Shortcut for single character strings */
6633 if (PyUnicode_GET_SIZE(self) == 1 &&
6634 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006635 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006637 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006638 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006639 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006640
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 e = p + PyUnicode_GET_SIZE(self);
6642 for (; p < e; p++) {
6643 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006644 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006646 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650"S.join(sequence) -> unicode\n\
6651\n\
6652Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006653sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654
6655static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006656unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006658 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659}
6660
Martin v. Löwis18e16552006-02-15 17:27:45 +00006661static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662unicode_length(PyUnicodeObject *self)
6663{
6664 return self->length;
6665}
6666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006667PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006668"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669\n\
6670Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006671done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject *
6674unicode_ljust(PyUnicodeObject *self, PyObject *args)
6675{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006676 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006677 Py_UNICODE fillchar = ' ';
6678
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006679 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 return NULL;
6681
Tim Peters7a29bd52001-09-12 03:03:31 +00006682 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 Py_INCREF(self);
6684 return (PyObject*) self;
6685 }
6686
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006687 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688}
6689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006690PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691"S.lower() -> unicode\n\
6692\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694
6695static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006696unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 return fixup(self, fixlower);
6699}
6700
/* Strip modes; these values are also the indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string past its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
6709
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006710/* externally visible for str.strip(unicode) */
6711PyObject *
6712_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6713{
6714 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006715 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006716 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006717 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6718 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006719
Thomas Wouters477c8d52006-05-27 19:21:47 +00006720 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6721
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006722 i = 0;
6723 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006724 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6725 i++;
6726 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006727 }
6728
6729 j = len;
6730 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006731 do {
6732 j--;
6733 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6734 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006735 }
6736
6737 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006738 Py_INCREF(self);
6739 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006740 }
6741 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006742 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006743}
6744
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
6746static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006747do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006749 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006750 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006751
6752 i = 0;
6753 if (striptype != RIGHTSTRIP) {
6754 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6755 i++;
6756 }
6757 }
6758
6759 j = len;
6760 if (striptype != LEFTSTRIP) {
6761 do {
6762 j--;
6763 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6764 j++;
6765 }
6766
6767 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6768 Py_INCREF(self);
6769 return (PyObject*)self;
6770 }
6771 else
6772 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773}
6774
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006775
6776static PyObject *
6777do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6778{
6779 PyObject *sep = NULL;
6780
6781 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6782 return NULL;
6783
6784 if (sep != NULL && sep != Py_None) {
6785 if (PyUnicode_Check(sep))
6786 return _PyUnicode_XStrip(self, striptype, sep);
6787 else if (PyString_Check(sep)) {
6788 PyObject *res;
6789 sep = PyUnicode_FromObject(sep);
6790 if (sep==NULL)
6791 return NULL;
6792 res = _PyUnicode_XStrip(self, striptype, sep);
6793 Py_DECREF(sep);
6794 return res;
6795 }
6796 else {
6797 PyErr_Format(PyExc_TypeError,
6798 "%s arg must be None, unicode or str",
6799 STRIPNAME(striptype));
6800 return NULL;
6801 }
6802 }
6803
6804 return do_strip(self, striptype);
6805}
6806
6807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006808PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006809"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006810\n\
6811Return a copy of the string S with leading and trailing\n\
6812whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006813If chars is given and not None, remove characters in chars instead.\n\
6814If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006815
6816static PyObject *
6817unicode_strip(PyUnicodeObject *self, PyObject *args)
6818{
6819 if (PyTuple_GET_SIZE(args) == 0)
6820 return do_strip(self, BOTHSTRIP); /* Common case */
6821 else
6822 return do_argstrip(self, BOTHSTRIP, args);
6823}
6824
6825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006826PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006827"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006828\n\
6829Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006830If chars is given and not None, remove characters in chars instead.\n\
6831If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006832
6833static PyObject *
6834unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6835{
6836 if (PyTuple_GET_SIZE(args) == 0)
6837 return do_strip(self, LEFTSTRIP); /* Common case */
6838 else
6839 return do_argstrip(self, LEFTSTRIP, args);
6840}
6841
6842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006844"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006845\n\
6846Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006847If chars is given and not None, remove characters in chars instead.\n\
6848If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006849
6850static PyObject *
6851unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6852{
6853 if (PyTuple_GET_SIZE(args) == 0)
6854 return do_strip(self, RIGHTSTRIP); /* Common case */
6855 else
6856 return do_argstrip(self, RIGHTSTRIP, args);
6857}
6858
6859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006861unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862{
6863 PyUnicodeObject *u;
6864 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006865 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006866 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867
6868 if (len < 0)
6869 len = 0;
6870
Tim Peters7a29bd52001-09-12 03:03:31 +00006871 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 /* no repeat, return original string */
6873 Py_INCREF(str);
6874 return (PyObject*) str;
6875 }
Tim Peters8f422462000-09-09 06:13:41 +00006876
6877 /* ensure # of chars needed doesn't overflow int and # of bytes
6878 * needed doesn't overflow size_t
6879 */
6880 nchars = len * str->length;
6881 if (len && nchars / len != str->length) {
6882 PyErr_SetString(PyExc_OverflowError,
6883 "repeated string is too long");
6884 return NULL;
6885 }
6886 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6887 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6888 PyErr_SetString(PyExc_OverflowError,
6889 "repeated string is too long");
6890 return NULL;
6891 }
6892 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 if (!u)
6894 return NULL;
6895
6896 p = u->str;
6897
Thomas Wouters477c8d52006-05-27 19:21:47 +00006898 if (str->length == 1 && len > 0) {
6899 Py_UNICODE_FILL(p, str->str[0], len);
6900 } else {
6901 Py_ssize_t done = 0; /* number of characters copied this far */
6902 if (done < nchars) {
6903 Py_UNICODE_COPY(p, str->str, str->length);
6904 done = str->length;
6905 }
6906 while (done < nchars) {
6907 int n = (done <= nchars-done) ? done : nchars-done;
6908 Py_UNICODE_COPY(p+done, p, n);
6909 done += n;
6910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 }
6912
6913 return (PyObject*) u;
6914}
6915
6916PyObject *PyUnicode_Replace(PyObject *obj,
6917 PyObject *subobj,
6918 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006919 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920{
6921 PyObject *self;
6922 PyObject *str1;
6923 PyObject *str2;
6924 PyObject *result;
6925
6926 self = PyUnicode_FromObject(obj);
6927 if (self == NULL)
6928 return NULL;
6929 str1 = PyUnicode_FromObject(subobj);
6930 if (str1 == NULL) {
6931 Py_DECREF(self);
6932 return NULL;
6933 }
6934 str2 = PyUnicode_FromObject(replobj);
6935 if (str2 == NULL) {
6936 Py_DECREF(self);
6937 Py_DECREF(str1);
6938 return NULL;
6939 }
Tim Petersced69f82003-09-16 20:30:58 +00006940 result = replace((PyUnicodeObject *)self,
6941 (PyUnicodeObject *)str1,
6942 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 maxcount);
6944 Py_DECREF(self);
6945 Py_DECREF(str1);
6946 Py_DECREF(str2);
6947 return result;
6948}
6949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006950PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951"S.replace (old, new[, maxsplit]) -> unicode\n\
6952\n\
6953Return a copy of S with all occurrences of substring\n\
6954old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956
6957static PyObject*
6958unicode_replace(PyUnicodeObject *self, PyObject *args)
6959{
6960 PyUnicodeObject *str1;
6961 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006962 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 PyObject *result;
6964
Martin v. Löwis18e16552006-02-15 17:27:45 +00006965 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 return NULL;
6967 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6968 if (str1 == NULL)
6969 return NULL;
6970 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006971 if (str2 == NULL) {
6972 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
6976 result = replace(self, str1, str2, maxcount);
6977
6978 Py_DECREF(str1);
6979 Py_DECREF(str2);
6980 return result;
6981}
6982
6983static
6984PyObject *unicode_repr(PyObject *unicode)
6985{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006986 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006987 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006988 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6989 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6990
6991 /* XXX(nnorwitz): rather than over-allocating, it would be
6992 better to choose a different scheme. Perhaps scan the
6993 first N-chars of the string and allocate based on that size.
6994 */
6995 /* Initial allocation is based on the longest-possible unichr
6996 escape.
6997
6998 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6999 unichr, so in this case it's the longest unichr escape. In
7000 narrow (UTF-16) builds this is five chars per source unichr
7001 since there are two unichrs in the surrogate pair, so in narrow
7002 (UTF-16) builds it's not the longest unichr escape.
7003
7004 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7005 so in the narrow (UTF-16) build case it's the longest unichr
7006 escape.
7007 */
7008
Walter Dörwald1ab83302007-05-18 17:15:44 +00007009 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007010 2 /* quotes */
7011#ifdef Py_UNICODE_WIDE
7012 + 10*size
7013#else
7014 + 6*size
7015#endif
7016 + 1);
7017 if (repr == NULL)
7018 return NULL;
7019
Walter Dörwald1ab83302007-05-18 17:15:44 +00007020 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007021
7022 /* Add quote */
7023 *p++ = (findchar(s, size, '\'') &&
7024 !findchar(s, size, '"')) ? '"' : '\'';
7025 while (size-- > 0) {
7026 Py_UNICODE ch = *s++;
7027
7028 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007029 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007030 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007031 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007032 continue;
7033 }
7034
7035#ifdef Py_UNICODE_WIDE
7036 /* Map 21-bit characters to '\U00xxxxxx' */
7037 else if (ch >= 0x10000) {
7038 *p++ = '\\';
7039 *p++ = 'U';
7040 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7041 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7042 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7043 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7044 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7045 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7046 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7047 *p++ = hexdigits[ch & 0x0000000F];
7048 continue;
7049 }
7050#else
7051 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7052 else if (ch >= 0xD800 && ch < 0xDC00) {
7053 Py_UNICODE ch2;
7054 Py_UCS4 ucs;
7055
7056 ch2 = *s++;
7057 size--;
7058 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7059 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7060 *p++ = '\\';
7061 *p++ = 'U';
7062 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7063 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7064 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7065 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7066 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7067 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7068 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7069 *p++ = hexdigits[ucs & 0x0000000F];
7070 continue;
7071 }
7072 /* Fall through: isolated surrogates are copied as-is */
7073 s--;
7074 size++;
7075 }
7076#endif
7077
7078 /* Map 16-bit characters to '\uxxxx' */
7079 if (ch >= 256) {
7080 *p++ = '\\';
7081 *p++ = 'u';
7082 *p++ = hexdigits[(ch >> 12) & 0x000F];
7083 *p++ = hexdigits[(ch >> 8) & 0x000F];
7084 *p++ = hexdigits[(ch >> 4) & 0x000F];
7085 *p++ = hexdigits[ch & 0x000F];
7086 }
7087
7088 /* Map special whitespace to '\t', \n', '\r' */
7089 else if (ch == '\t') {
7090 *p++ = '\\';
7091 *p++ = 't';
7092 }
7093 else if (ch == '\n') {
7094 *p++ = '\\';
7095 *p++ = 'n';
7096 }
7097 else if (ch == '\r') {
7098 *p++ = '\\';
7099 *p++ = 'r';
7100 }
7101
7102 /* Map non-printable US ASCII to '\xhh' */
7103 else if (ch < ' ' || ch >= 0x7F) {
7104 *p++ = '\\';
7105 *p++ = 'x';
7106 *p++ = hexdigits[(ch >> 4) & 0x000F];
7107 *p++ = hexdigits[ch & 0x000F];
7108 }
7109
7110 /* Copy everything else as-is */
7111 else
7112 *p++ = (char) ch;
7113 }
7114 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007115 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007116
7117 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007118 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007119 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120}
7121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007122PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123"S.rfind(sub [,start [,end]]) -> int\n\
7124\n\
7125Return the highest index in S where substring sub is found,\n\
7126such that sub is contained within s[start,end]. Optional\n\
7127arguments start and end are interpreted as in slice notation.\n\
7128\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007129Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130
7131static PyObject *
7132unicode_rfind(PyUnicodeObject *self, PyObject *args)
7133{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007134 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007135 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007136 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007137 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
Guido van Rossumb8872e62000-05-09 14:14:27 +00007139 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7140 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007142 substring = PyUnicode_FromObject(substring);
7143 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 return NULL;
7145
Thomas Wouters477c8d52006-05-27 19:21:47 +00007146 result = stringlib_rfind_slice(
7147 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7148 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7149 start, end
7150 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151
7152 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007153
7154 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155}
7156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007157PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158"S.rindex(sub [,start [,end]]) -> int\n\
7159\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007160Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161
7162static PyObject *
7163unicode_rindex(PyUnicodeObject *self, PyObject *args)
7164{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007165 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007166 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007167 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007168 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
Guido van Rossumb8872e62000-05-09 14:14:27 +00007170 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7171 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007173 substring = PyUnicode_FromObject(substring);
7174 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 return NULL;
7176
Thomas Wouters477c8d52006-05-27 19:21:47 +00007177 result = stringlib_rfind_slice(
7178 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7179 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7180 start, end
7181 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182
7183 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007184
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 if (result < 0) {
7186 PyErr_SetString(PyExc_ValueError, "substring not found");
7187 return NULL;
7188 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007189 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190}
7191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007192PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007193"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194\n\
7195Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007196done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
7198static PyObject *
7199unicode_rjust(PyUnicodeObject *self, PyObject *args)
7200{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007201 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007202 Py_UNICODE fillchar = ' ';
7203
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007204 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 return NULL;
7206
Tim Peters7a29bd52001-09-12 03:03:31 +00007207 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 Py_INCREF(self);
7209 return (PyObject*) self;
7210 }
7211
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007212 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213}
7214
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007216unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217{
7218 /* standard clamping */
7219 if (start < 0)
7220 start = 0;
7221 if (end < 0)
7222 end = 0;
7223 if (end > self->length)
7224 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007225 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226 /* full slice, return original string */
7227 Py_INCREF(self);
7228 return (PyObject*) self;
7229 }
7230 if (start > end)
7231 start = end;
7232 /* copy slice */
7233 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7234 end - start);
7235}
7236
7237PyObject *PyUnicode_Split(PyObject *s,
7238 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007239 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240{
7241 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007242
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 s = PyUnicode_FromObject(s);
7244 if (s == NULL)
7245 return NULL;
7246 if (sep != NULL) {
7247 sep = PyUnicode_FromObject(sep);
7248 if (sep == NULL) {
7249 Py_DECREF(s);
7250 return NULL;
7251 }
7252 }
7253
7254 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7255
7256 Py_DECREF(s);
7257 Py_XDECREF(sep);
7258 return result;
7259}
7260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007261PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262"S.split([sep [,maxsplit]]) -> list of strings\n\
7263\n\
7264Return a list of the words in S, using sep as the\n\
7265delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007266splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007267any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268
7269static PyObject*
7270unicode_split(PyUnicodeObject *self, PyObject *args)
7271{
7272 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007273 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
Martin v. Löwis18e16552006-02-15 17:27:45 +00007275 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 return NULL;
7277
7278 if (substring == Py_None)
7279 return split(self, NULL, maxcount);
7280 else if (PyUnicode_Check(substring))
7281 return split(self, (PyUnicodeObject *)substring, maxcount);
7282 else
7283 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7284}
7285
Thomas Wouters477c8d52006-05-27 19:21:47 +00007286PyObject *
7287PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7288{
7289 PyObject* str_obj;
7290 PyObject* sep_obj;
7291 PyObject* out;
7292
7293 str_obj = PyUnicode_FromObject(str_in);
7294 if (!str_obj)
7295 return NULL;
7296 sep_obj = PyUnicode_FromObject(sep_in);
7297 if (!sep_obj) {
7298 Py_DECREF(str_obj);
7299 return NULL;
7300 }
7301
7302 out = stringlib_partition(
7303 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7304 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7305 );
7306
7307 Py_DECREF(sep_obj);
7308 Py_DECREF(str_obj);
7309
7310 return out;
7311}
7312
7313
7314PyObject *
7315PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7316{
7317 PyObject* str_obj;
7318 PyObject* sep_obj;
7319 PyObject* out;
7320
7321 str_obj = PyUnicode_FromObject(str_in);
7322 if (!str_obj)
7323 return NULL;
7324 sep_obj = PyUnicode_FromObject(sep_in);
7325 if (!sep_obj) {
7326 Py_DECREF(str_obj);
7327 return NULL;
7328 }
7329
7330 out = stringlib_rpartition(
7331 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7332 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7333 );
7334
7335 Py_DECREF(sep_obj);
7336 Py_DECREF(str_obj);
7337
7338 return out;
7339}
7340
7341PyDoc_STRVAR(partition__doc__,
7342"S.partition(sep) -> (head, sep, tail)\n\
7343\n\
7344Searches for the separator sep in S, and returns the part before it,\n\
7345the separator itself, and the part after it. If the separator is not\n\
7346found, returns S and two empty strings.");
7347
7348static PyObject*
7349unicode_partition(PyUnicodeObject *self, PyObject *separator)
7350{
7351 return PyUnicode_Partition((PyObject *)self, separator);
7352}
7353
7354PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007355"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007356\n\
7357Searches for the separator sep in S, starting at the end of S, and returns\n\
7358the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007359separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007360
7361static PyObject*
7362unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7363{
7364 return PyUnicode_RPartition((PyObject *)self, separator);
7365}
7366
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007367PyObject *PyUnicode_RSplit(PyObject *s,
7368 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007369 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007370{
7371 PyObject *result;
7372
7373 s = PyUnicode_FromObject(s);
7374 if (s == NULL)
7375 return NULL;
7376 if (sep != NULL) {
7377 sep = PyUnicode_FromObject(sep);
7378 if (sep == NULL) {
7379 Py_DECREF(s);
7380 return NULL;
7381 }
7382 }
7383
7384 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7385
7386 Py_DECREF(s);
7387 Py_XDECREF(sep);
7388 return result;
7389}
7390
7391PyDoc_STRVAR(rsplit__doc__,
7392"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7393\n\
7394Return a list of the words in S, using sep as the\n\
7395delimiter string, starting at the end of the string and\n\
7396working to the front. If maxsplit is given, at most maxsplit\n\
7397splits are done. If sep is not specified, any whitespace string\n\
7398is a separator.");
7399
7400static PyObject*
7401unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7402{
7403 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007404 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007405
Martin v. Löwis18e16552006-02-15 17:27:45 +00007406 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007407 return NULL;
7408
7409 if (substring == Py_None)
7410 return rsplit(self, NULL, maxcount);
7411 else if (PyUnicode_Check(substring))
7412 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7413 else
7414 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7415}
7416
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007417PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007418"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419\n\
7420Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007421Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007422is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423
7424static PyObject*
7425unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7426{
Guido van Rossum86662912000-04-11 15:38:46 +00007427 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
Guido van Rossum86662912000-04-11 15:38:46 +00007429 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 return NULL;
7431
Guido van Rossum86662912000-04-11 15:38:46 +00007432 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433}
7434
7435static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007436PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437{
Walter Dörwald346737f2007-05-31 10:44:43 +00007438 if (PyUnicode_CheckExact(self)) {
7439 Py_INCREF(self);
7440 return self;
7441 } else
7442 /* Subtype -- return genuine unicode string with the same value. */
7443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7444 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445}
7446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007447PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448"S.swapcase() -> unicode\n\
7449\n\
7450Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007451and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452
7453static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007454unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 return fixup(self, fixswapcase);
7457}
7458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007459PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460"S.translate(table) -> unicode\n\
7461\n\
7462Return a copy of the string S, where all characters have been mapped\n\
7463through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007464Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7465Unmapped characters are left untouched. Characters mapped to None\n\
7466are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007469unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470{
Tim Petersced69f82003-09-16 20:30:58 +00007471 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007473 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 "ignore");
7475}
7476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007477PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478"S.upper() -> unicode\n\
7479\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007480Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481
7482static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007483unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 return fixup(self, fixupper);
7486}
7487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007488PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489"S.zfill(width) -> unicode\n\
7490\n\
7491Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493
7494static PyObject *
7495unicode_zfill(PyUnicodeObject *self, PyObject *args)
7496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007497 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 PyUnicodeObject *u;
7499
Martin v. Löwis18e16552006-02-15 17:27:45 +00007500 Py_ssize_t width;
7501 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 return NULL;
7503
7504 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007505 if (PyUnicode_CheckExact(self)) {
7506 Py_INCREF(self);
7507 return (PyObject*) self;
7508 }
7509 else
7510 return PyUnicode_FromUnicode(
7511 PyUnicode_AS_UNICODE(self),
7512 PyUnicode_GET_SIZE(self)
7513 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 }
7515
7516 fill = width - self->length;
7517
7518 u = pad(self, fill, 0, '0');
7519
Walter Dörwald068325e2002-04-15 13:36:47 +00007520 if (u == NULL)
7521 return NULL;
7522
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523 if (u->str[fill] == '+' || u->str[fill] == '-') {
7524 /* move sign to beginning of string */
7525 u->str[0] = u->str[fill];
7526 u->str[fill] = '0';
7527 }
7528
7529 return (PyObject*) u;
7530}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531
#if 0
/* Compiled out.  Debugging helper that reports the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007540PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007541"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007543Return True if S starts with the specified prefix, False otherwise.\n\
7544With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007545With optional end, stop comparing S at that position.\n\
7546prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547
7548static PyObject *
7549unicode_startswith(PyUnicodeObject *self,
7550 PyObject *args)
7551{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007552 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007554 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007555 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007556 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007558 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007559 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007561 if (PyTuple_Check(subobj)) {
7562 Py_ssize_t i;
7563 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7564 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7565 PyTuple_GET_ITEM(subobj, i));
7566 if (substring == NULL)
7567 return NULL;
7568 result = tailmatch(self, substring, start, end, -1);
7569 Py_DECREF(substring);
7570 if (result) {
7571 Py_RETURN_TRUE;
7572 }
7573 }
7574 /* nothing matched */
7575 Py_RETURN_FALSE;
7576 }
7577 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007579 return NULL;
7580 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007582 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583}
7584
7585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007586PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007587"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007589Return True if S ends with the specified suffix, False otherwise.\n\
7590With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007591With optional end, stop comparing S at that position.\n\
7592suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593
7594static PyObject *
7595unicode_endswith(PyUnicodeObject *self,
7596 PyObject *args)
7597{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007598 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007600 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007601 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007602 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007604 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7605 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007607 if (PyTuple_Check(subobj)) {
7608 Py_ssize_t i;
7609 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7610 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7611 PyTuple_GET_ITEM(subobj, i));
7612 if (substring == NULL)
7613 return NULL;
7614 result = tailmatch(self, substring, start, end, +1);
7615 Py_DECREF(substring);
7616 if (result) {
7617 Py_RETURN_TRUE;
7618 }
7619 }
7620 Py_RETURN_FALSE;
7621 }
7622 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007626 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629}
7630
7631
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007632
7633static PyObject *
7634unicode_getnewargs(PyUnicodeObject *v)
7635{
7636 return Py_BuildValue("(u#)", v->str, v->length);
7637}
7638
7639
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640static PyMethodDef unicode_methods[] = {
7641
7642 /* Order is according to common usage: often used methods should
7643 appear first, since lookup is done sequentially. */
7644
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007645 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7646 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7647 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007648 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007649 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7650 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7651 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7652 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7653 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7654 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7655 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007656 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007657 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7658 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7659 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007660 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007661 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007662/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7663 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7664 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7665 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007666 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007667 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007668 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007669 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007670 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7671 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7672 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7673 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7674 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7675 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7676 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7677 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7678 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7679 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7680 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7681 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7682 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7683 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007684 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007685#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007686 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687#endif
7688
7689#if 0
7690 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007691 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692#endif
7693
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007694 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 {NULL, NULL}
7696};
7697
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007698static PyObject *
7699unicode_mod(PyObject *v, PyObject *w)
7700{
7701 if (!PyUnicode_Check(v)) {
7702 Py_INCREF(Py_NotImplemented);
7703 return Py_NotImplemented;
7704 }
7705 return PyUnicode_Format(v, w);
7706}
7707
/* Number-protocol slot table for unicode objects.

   Only nb_remainder is filled in, so that the `%` operator reaches
   unicode_mod() (string formatting).  The three slots before it are
   explicitly empty; every slot after it is left out of the initializer
   and therefore zero-initialised by the compiler. */
static PyNumberMethods unicode_as_number = {
    0,                          /*nb_add*/
    0,                          /*nb_subtract*/
    0,                          /*nb_multiply*/
    unicode_mod,                /*nb_remainder*/
};
7714
/* Sequence-protocol slot table for unicode objects.

   Unicode objects are immutable sequences: the two assignment slots
   (sq_ass_item, sq_ass_slice) are empty.  The remaining entries forward
   to the length / concat / repeat / item / slice / contains
   implementations named below. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
7725
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007726static PyObject*
7727unicode_subscript(PyUnicodeObject* self, PyObject* item)
7728{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007729 if (PyIndex_Check(item)) {
7730 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007731 if (i == -1 && PyErr_Occurred())
7732 return NULL;
7733 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007734 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007735 return unicode_getitem(self, i);
7736 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007737 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007738 Py_UNICODE* source_buf;
7739 Py_UNICODE* result_buf;
7740 PyObject* result;
7741
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007742 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007743 &start, &stop, &step, &slicelength) < 0) {
7744 return NULL;
7745 }
7746
7747 if (slicelength <= 0) {
7748 return PyUnicode_FromUnicode(NULL, 0);
7749 } else {
7750 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007751 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7752 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007753
7754 if (result_buf == NULL)
7755 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007756
7757 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7758 result_buf[i] = source_buf[cur];
7759 }
Tim Petersced69f82003-09-16 20:30:58 +00007760
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007761 result = PyUnicode_FromUnicode(result_buf, slicelength);
7762 PyMem_FREE(result_buf);
7763 return result;
7764 }
7765 } else {
7766 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7767 return NULL;
7768 }
7769}
7770
/* Mapping-protocol slot table for unicode objects.

   Provides length and read-only subscripting (integer index or slice,
   see unicode_subscript()).  mp_ass_subscript is empty, so item and
   slice assignment are not supported through this table. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
7776
Martin v. Löwis18e16552006-02-15 17:27:45 +00007777static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007779 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 const void **ptr)
7781{
7782 if (index != 0) {
7783 PyErr_SetString(PyExc_SystemError,
7784 "accessing non-existent unicode segment");
7785 return -1;
7786 }
7787 *ptr = (void *) self->str;
7788 return PyUnicode_GET_DATA_SIZE(self);
7789}
7790
Martin v. Löwis18e16552006-02-15 17:27:45 +00007791static Py_ssize_t
7792unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 const void **ptr)
7794{
7795 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007796 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 return -1;
7798}
7799
7800static int
7801unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007802 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803{
7804 if (lenp)
7805 *lenp = PyUnicode_GET_DATA_SIZE(self);
7806 return 1;
7807}
7808
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007809static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007811 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 const void **ptr)
7813{
7814 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007815
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 if (index != 0) {
7817 PyErr_SetString(PyExc_SystemError,
7818 "accessing non-existent unicode segment");
7819 return -1;
7820 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007821 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 if (str == NULL)
7823 return -1;
7824 *ptr = (void *) PyString_AS_STRING(str);
7825 return PyString_GET_SIZE(str);
7826}
7827
7828/* Helpers for PyUnicode_Format() */
7829
7830static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007831getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 if (argidx < arglen) {
7835 (*p_argidx)++;
7836 if (arglen < 0)
7837 return args;
7838 else
7839 return PyTuple_GetItem(args, argidx);
7840 }
7841 PyErr_SetString(PyExc_TypeError,
7842 "not enough arguments for format string");
7843 return NULL;
7844}
7845
/* Bit flags collected while parsing a format specifier:
   '-' left-justify, '+' always show sign, ' ' blank for positive sign,
   '#' alternate form, '0' zero padding. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
7851
Martin v. Löwis18e16552006-02-15 17:27:45 +00007852static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007853strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007855 register Py_ssize_t i;
7856 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 for (i = len - 1; i >= 0; i--)
7858 buffer[i] = (Py_UNICODE) charbuffer[i];
7859
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 return len;
7861}
7862
Neal Norwitzfc76d632006-01-10 06:03:13 +00007863static int
7864doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7865{
Tim Peters15231542006-02-16 01:08:01 +00007866 Py_ssize_t result;
7867
Neal Norwitzfc76d632006-01-10 06:03:13 +00007868 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007869 result = strtounicode(buffer, (char *)buffer);
7870 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007871}
7872
7873static int
7874longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7875{
Tim Peters15231542006-02-16 01:08:01 +00007876 Py_ssize_t result;
7877
Neal Norwitzfc76d632006-01-10 06:03:13 +00007878 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007879 result = strtounicode(buffer, (char *)buffer);
7880 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007881}
7882
Guido van Rossum078151d2002-08-11 04:24:12 +00007883/* XXX To save some code duplication, formatfloat/long/int could have been
7884 shared with stringobject.c, converting from 8-bit to Unicode after the
7885 formatting is done. */
7886
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887static int
7888formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007889 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 int flags,
7891 int prec,
7892 int type,
7893 PyObject *v)
7894{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007895 /* fmt = '%#.' + `prec` + `type`
7896 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 char fmt[20];
7898 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007899
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 x = PyFloat_AsDouble(v);
7901 if (x == -1.0 && PyErr_Occurred())
7902 return -1;
7903 if (prec < 0)
7904 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7906 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007907 /* Worst case length calc to ensure no buffer overrun:
7908
7909 'g' formats:
7910 fmt = %#.<prec>g
7911 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7912 for any double rep.)
7913 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7914
7915 'f' formats:
7916 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7917 len = 1 + 50 + 1 + prec = 52 + prec
7918
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007919 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007920 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007921
7922 */
7923 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7924 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007925 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007926 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007927 return -1;
7928 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007929 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7930 (flags&F_ALT) ? "#" : "",
7931 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007932 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933}
7934
Tim Peters38fd5b62000-09-21 05:43:11 +00007935static PyObject*
7936formatlong(PyObject *val, int flags, int prec, int type)
7937{
7938 char *buf;
7939 int i, len;
7940 PyObject *str; /* temporary string object. */
7941 PyUnicodeObject *result;
7942
7943 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7944 if (!str)
7945 return NULL;
7946 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007947 if (!result) {
7948 Py_DECREF(str);
7949 return NULL;
7950 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007951 for (i = 0; i < len; i++)
7952 result->str[i] = buf[i];
7953 result->str[len] = 0;
7954 Py_DECREF(str);
7955 return (PyObject*)result;
7956}
7957
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958static int
7959formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007960 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 int flags,
7962 int prec,
7963 int type,
7964 PyObject *v)
7965{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007966 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007967 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7968 * + 1 + 1
7969 * = 24
7970 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007971 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007972 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 long x;
7974
7975 x = PyInt_AsLong(v);
7976 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007977 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007978 if (x < 0 && type == 'u') {
7979 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007980 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007981 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7982 sign = "-";
7983 else
7984 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007986 prec = 1;
7987
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007988 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7989 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007990 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007991 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007992 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007993 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007994 return -1;
7995 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007996
7997 if ((flags & F_ALT) &&
7998 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007999 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008000 * of issues that cause pain:
8001 * - when 0 is being converted, the C standard leaves off
8002 * the '0x' or '0X', which is inconsistent with other
8003 * %#x/%#X conversions and inconsistent with Python's
8004 * hex() function
8005 * - there are platforms that violate the standard and
8006 * convert 0 with the '0x' or '0X'
8007 * (Metrowerks, Compaq Tru64)
8008 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008009 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008010 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008011 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008012 * We can achieve the desired consistency by inserting our
8013 * own '0x' or '0X' prefix, and substituting %x/%X in place
8014 * of %#x/%#X.
8015 *
8016 * Note that this is the same approach as used in
8017 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008018 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008019 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8020 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008021 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008022 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008023 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8024 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008025 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008026 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008027 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008028 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008029 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008030 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031}
8032
8033static int
8034formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008035 size_t buflen,
8036 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008038 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008039 if (PyUnicode_Check(v)) {
8040 if (PyUnicode_GET_SIZE(v) != 1)
8041 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008045 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008046 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008047 goto onError;
8048 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050
8051 else {
8052 /* Integer input truncated to a character */
8053 long x;
8054 x = PyInt_AsLong(v);
8055 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008056 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008057#ifdef Py_UNICODE_WIDE
8058 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008059 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008060 "%c arg not in range(0x110000) "
8061 "(wide Python build)");
8062 return -1;
8063 }
8064#else
8065 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008066 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008067 "%c arg not in range(0x10000) "
8068 "(narrow Python build)");
8069 return -1;
8070 }
8071#endif
8072 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 }
8074 buf[1] = '\0';
8075 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008076
8077 onError:
8078 PyErr_SetString(PyExc_TypeError,
8079 "%c requires int or char");
8080 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081}
8082
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in characters) of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesised so that it behaves as a single
   operand wherever it is used (e.g. directly after sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8092
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093PyObject *PyUnicode_Format(PyObject *format,
8094 PyObject *args)
8095{
8096 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098 int args_owned = 0;
8099 PyUnicodeObject *result = NULL;
8100 PyObject *dict = NULL;
8101 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008102
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 if (format == NULL || args == NULL) {
8104 PyErr_BadInternalCall();
8105 return NULL;
8106 }
8107 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008108 if (uformat == NULL)
8109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 fmt = PyUnicode_AS_UNICODE(uformat);
8111 fmtcnt = PyUnicode_GET_SIZE(uformat);
8112
8113 reslen = rescnt = fmtcnt + 100;
8114 result = _PyUnicode_New(reslen);
8115 if (result == NULL)
8116 goto onError;
8117 res = PyUnicode_AS_UNICODE(result);
8118
8119 if (PyTuple_Check(args)) {
8120 arglen = PyTuple_Size(args);
8121 argidx = 0;
8122 }
8123 else {
8124 arglen = -1;
8125 argidx = -2;
8126 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008127 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8128 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 dict = args;
8130
8131 while (--fmtcnt >= 0) {
8132 if (*fmt != '%') {
8133 if (--rescnt < 0) {
8134 rescnt = fmtcnt + 100;
8135 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008136 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008137 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8139 --rescnt;
8140 }
8141 *res++ = *fmt++;
8142 }
8143 else {
8144 /* Got a format specifier */
8145 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008146 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 Py_UNICODE c = '\0';
8149 Py_UNICODE fill;
8150 PyObject *v = NULL;
8151 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008152 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008154 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008155 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156
8157 fmt++;
8158 if (*fmt == '(') {
8159 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008160 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 PyObject *key;
8162 int pcount = 1;
8163
8164 if (dict == NULL) {
8165 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008166 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 goto onError;
8168 }
8169 ++fmt;
8170 --fmtcnt;
8171 keystart = fmt;
8172 /* Skip over balanced parentheses */
8173 while (pcount > 0 && --fmtcnt >= 0) {
8174 if (*fmt == ')')
8175 --pcount;
8176 else if (*fmt == '(')
8177 ++pcount;
8178 fmt++;
8179 }
8180 keylen = fmt - keystart - 1;
8181 if (fmtcnt < 0 || pcount > 0) {
8182 PyErr_SetString(PyExc_ValueError,
8183 "incomplete format key");
8184 goto onError;
8185 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008186#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008187 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 then looked up since Python uses strings to hold
8189 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008190 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 key = PyUnicode_EncodeUTF8(keystart,
8192 keylen,
8193 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008194#else
8195 key = PyUnicode_FromUnicode(keystart, keylen);
8196#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 if (key == NULL)
8198 goto onError;
8199 if (args_owned) {
8200 Py_DECREF(args);
8201 args_owned = 0;
8202 }
8203 args = PyObject_GetItem(dict, key);
8204 Py_DECREF(key);
8205 if (args == NULL) {
8206 goto onError;
8207 }
8208 args_owned = 1;
8209 arglen = -1;
8210 argidx = -2;
8211 }
8212 while (--fmtcnt >= 0) {
8213 switch (c = *fmt++) {
8214 case '-': flags |= F_LJUST; continue;
8215 case '+': flags |= F_SIGN; continue;
8216 case ' ': flags |= F_BLANK; continue;
8217 case '#': flags |= F_ALT; continue;
8218 case '0': flags |= F_ZERO; continue;
8219 }
8220 break;
8221 }
8222 if (c == '*') {
8223 v = getnextarg(args, arglen, &argidx);
8224 if (v == NULL)
8225 goto onError;
8226 if (!PyInt_Check(v)) {
8227 PyErr_SetString(PyExc_TypeError,
8228 "* wants int");
8229 goto onError;
8230 }
8231 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008232 if (width == -1 && PyErr_Occurred())
8233 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234 if (width < 0) {
8235 flags |= F_LJUST;
8236 width = -width;
8237 }
8238 if (--fmtcnt >= 0)
8239 c = *fmt++;
8240 }
8241 else if (c >= '0' && c <= '9') {
8242 width = c - '0';
8243 while (--fmtcnt >= 0) {
8244 c = *fmt++;
8245 if (c < '0' || c > '9')
8246 break;
8247 if ((width*10) / 10 != width) {
8248 PyErr_SetString(PyExc_ValueError,
8249 "width too big");
8250 goto onError;
8251 }
8252 width = width*10 + (c - '0');
8253 }
8254 }
8255 if (c == '.') {
8256 prec = 0;
8257 if (--fmtcnt >= 0)
8258 c = *fmt++;
8259 if (c == '*') {
8260 v = getnextarg(args, arglen, &argidx);
8261 if (v == NULL)
8262 goto onError;
8263 if (!PyInt_Check(v)) {
8264 PyErr_SetString(PyExc_TypeError,
8265 "* wants int");
8266 goto onError;
8267 }
8268 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008269 if (prec == -1 && PyErr_Occurred())
8270 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 if (prec < 0)
8272 prec = 0;
8273 if (--fmtcnt >= 0)
8274 c = *fmt++;
8275 }
8276 else if (c >= '0' && c <= '9') {
8277 prec = c - '0';
8278 while (--fmtcnt >= 0) {
8279 c = Py_CHARMASK(*fmt++);
8280 if (c < '0' || c > '9')
8281 break;
8282 if ((prec*10) / 10 != prec) {
8283 PyErr_SetString(PyExc_ValueError,
8284 "prec too big");
8285 goto onError;
8286 }
8287 prec = prec*10 + (c - '0');
8288 }
8289 }
8290 } /* prec */
8291 if (fmtcnt >= 0) {
8292 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 if (--fmtcnt >= 0)
8294 c = *fmt++;
8295 }
8296 }
8297 if (fmtcnt < 0) {
8298 PyErr_SetString(PyExc_ValueError,
8299 "incomplete format");
8300 goto onError;
8301 }
8302 if (c != '%') {
8303 v = getnextarg(args, arglen, &argidx);
8304 if (v == NULL)
8305 goto onError;
8306 }
8307 sign = 0;
8308 fill = ' ';
8309 switch (c) {
8310
8311 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008312 pbuf = formatbuf;
8313 /* presume that buffer length is at least 1 */
8314 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 len = 1;
8316 break;
8317
8318 case 's':
8319 case 'r':
8320 if (PyUnicode_Check(v) && c == 's') {
8321 temp = v;
8322 Py_INCREF(temp);
8323 }
8324 else {
8325 PyObject *unicode;
8326 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008327 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 else
8329 temp = PyObject_Repr(v);
8330 if (temp == NULL)
8331 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008332 if (PyUnicode_Check(temp))
8333 /* nothing to do */;
8334 else if (PyString_Check(temp)) {
8335 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008336 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008338 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008340 Py_DECREF(temp);
8341 temp = unicode;
8342 if (temp == NULL)
8343 goto onError;
8344 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008345 else {
8346 Py_DECREF(temp);
8347 PyErr_SetString(PyExc_TypeError,
8348 "%s argument has non-string str()");
8349 goto onError;
8350 }
8351 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008352 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 len = PyUnicode_GET_SIZE(temp);
8354 if (prec >= 0 && len > prec)
8355 len = prec;
8356 break;
8357
8358 case 'i':
8359 case 'd':
8360 case 'u':
8361 case 'o':
8362 case 'x':
8363 case 'X':
8364 if (c == 'i')
8365 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008366 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008367 temp = formatlong(v, flags, prec, c);
8368 if (!temp)
8369 goto onError;
8370 pbuf = PyUnicode_AS_UNICODE(temp);
8371 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008372 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008374 else {
8375 pbuf = formatbuf;
8376 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8377 flags, prec, c, v);
8378 if (len < 0)
8379 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008380 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008381 }
8382 if (flags & F_ZERO)
8383 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384 break;
8385
8386 case 'e':
8387 case 'E':
8388 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008389 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 case 'g':
8391 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008392 if (c == 'F')
8393 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008394 pbuf = formatbuf;
8395 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8396 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 if (len < 0)
8398 goto onError;
8399 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008400 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 fill = '0';
8402 break;
8403
8404 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008405 pbuf = formatbuf;
8406 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 if (len < 0)
8408 goto onError;
8409 break;
8410
8411 default:
8412 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008413 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008414 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008415 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008416 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008417 (Py_ssize_t)(fmt - 1 -
8418 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 goto onError;
8420 }
8421 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008422 if (*pbuf == '-' || *pbuf == '+') {
8423 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 len--;
8425 }
8426 else if (flags & F_SIGN)
8427 sign = '+';
8428 else if (flags & F_BLANK)
8429 sign = ' ';
8430 else
8431 sign = 0;
8432 }
8433 if (width < len)
8434 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008435 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 reslen -= rescnt;
8437 rescnt = width + fmtcnt + 100;
8438 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008439 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008440 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008441 PyErr_NoMemory();
8442 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008443 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008444 if (_PyUnicode_Resize(&result, reslen) < 0) {
8445 Py_XDECREF(temp);
8446 goto onError;
8447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 res = PyUnicode_AS_UNICODE(result)
8449 + reslen - rescnt;
8450 }
8451 if (sign) {
8452 if (fill != ' ')
8453 *res++ = sign;
8454 rescnt--;
8455 if (width > len)
8456 width--;
8457 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008458 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8459 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008460 assert(pbuf[1] == c);
8461 if (fill != ' ') {
8462 *res++ = *pbuf++;
8463 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008464 }
Tim Petersfff53252001-04-12 18:38:48 +00008465 rescnt -= 2;
8466 width -= 2;
8467 if (width < 0)
8468 width = 0;
8469 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471 if (width > len && !(flags & F_LJUST)) {
8472 do {
8473 --rescnt;
8474 *res++ = fill;
8475 } while (--width > len);
8476 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008477 if (fill == ' ') {
8478 if (sign)
8479 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008480 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008481 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008482 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008483 *res++ = *pbuf++;
8484 *res++ = *pbuf++;
8485 }
8486 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008487 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 res += len;
8489 rescnt -= len;
8490 while (--width >= len) {
8491 --rescnt;
8492 *res++ = ' ';
8493 }
8494 if (dict && (argidx < arglen) && c != '%') {
8495 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008496 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008497 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 goto onError;
8499 }
8500 Py_XDECREF(temp);
8501 } /* '%' */
8502 } /* until end */
8503 if (argidx < arglen && !dict) {
8504 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008505 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 goto onError;
8507 }
8508
Thomas Woutersa96affe2006-03-12 00:29:36 +00008509 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8510 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 if (args_owned) {
8512 Py_DECREF(args);
8513 }
8514 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 return (PyObject *)result;
8516
8517 onError:
8518 Py_XDECREF(result);
8519 Py_DECREF(uformat);
8520 if (args_owned) {
8521 Py_DECREF(args);
8522 }
8523 return NULL;
8524}
8525
8526static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008527 (readbufferproc) unicode_buffer_getreadbuf,
8528 (writebufferproc) unicode_buffer_getwritebuf,
8529 (segcountproc) unicode_buffer_getsegcount,
8530 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531};
8532
Jeremy Hylton938ace62002-07-17 16:30:39 +00008533static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008534unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8535
Tim Peters6d6c1a32001-08-02 04:15:00 +00008536static PyObject *
8537unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8538{
8539 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008540 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008541 char *encoding = NULL;
8542 char *errors = NULL;
8543
Guido van Rossume023fe02001-08-30 03:12:59 +00008544 if (type != &PyUnicode_Type)
8545 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008546 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8547 kwlist, &x, &encoding, &errors))
8548 return NULL;
8549 if (x == NULL)
8550 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008551 if (encoding == NULL && errors == NULL)
8552 return PyObject_Unicode(x);
8553 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008554 return PyUnicode_FromEncodedObject(x, encoding, errors);
8555}
8556
Guido van Rossume023fe02001-08-30 03:12:59 +00008557static PyObject *
8558unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8559{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008560 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008561 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008562
8563 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8564 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8565 if (tmp == NULL)
8566 return NULL;
8567 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008568 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008569 if (pnew == NULL) {
8570 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008571 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008572 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008573 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8574 if (pnew->str == NULL) {
8575 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008576 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008577 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008578 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008579 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008580 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8581 pnew->length = n;
8582 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008583 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008584 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008585}
8586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008587PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008588"unicode(string [, encoding[, errors]]) -> object\n\
8589\n\
8590Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008591encoding defaults to the current default string encoding.\n\
8592errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008593
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008594static PyObject *unicode_iter(PyObject *seq);
8595
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596PyTypeObject PyUnicode_Type = {
8597 PyObject_HEAD_INIT(&PyType_Type)
8598 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008599 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 sizeof(PyUnicodeObject), /* tp_size */
8601 0, /* tp_itemsize */
8602 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008603 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008605 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008607 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008608 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008609 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008611 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 (hashfunc) unicode_hash, /* tp_hash*/
8613 0, /* tp_call*/
8614 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008615 PyObject_GenericGetAttr, /* tp_getattro */
8616 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008618 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8619 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008620 unicode_doc, /* tp_doc */
8621 0, /* tp_traverse */
8622 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008623 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008624 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008625 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008626 0, /* tp_iternext */
8627 unicode_methods, /* tp_methods */
8628 0, /* tp_members */
8629 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008630 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008631 0, /* tp_dict */
8632 0, /* tp_descr_get */
8633 0, /* tp_descr_set */
8634 0, /* tp_dictoffset */
8635 0, /* tp_init */
8636 0, /* tp_alloc */
8637 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008638 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639};
8640
8641/* Initialize the Unicode implementation */
8642
Thomas Wouters78890102000-07-22 19:25:51 +00008643void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008645 int i;
8646
Thomas Wouters477c8d52006-05-27 19:21:47 +00008647 /* XXX - move this array to unicodectype.c ? */
8648 Py_UNICODE linebreak[] = {
8649 0x000A, /* LINE FEED */
8650 0x000D, /* CARRIAGE RETURN */
8651 0x001C, /* FILE SEPARATOR */
8652 0x001D, /* GROUP SEPARATOR */
8653 0x001E, /* RECORD SEPARATOR */
8654 0x0085, /* NEXT LINE */
8655 0x2028, /* LINE SEPARATOR */
8656 0x2029, /* PARAGRAPH SEPARATOR */
8657 };
8658
Fred Drakee4315f52000-05-09 19:53:39 +00008659 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008660 unicode_freelist = NULL;
8661 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008663 if (!unicode_empty)
8664 return;
8665
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008666 for (i = 0; i < 256; i++)
8667 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008668 if (PyType_Ready(&PyUnicode_Type) < 0)
8669 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008670
8671 /* initialize the linebreak bloom filter */
8672 bloom_linebreak = make_bloom_mask(
8673 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8674 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008675
8676 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677}
8678
8679/* Finalize the Unicode implementation */
8680
8681void
Thomas Wouters78890102000-07-22 19:25:51 +00008682_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008684 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008685 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008687 Py_XDECREF(unicode_empty);
8688 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008689
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008690 for (i = 0; i < 256; i++) {
8691 if (unicode_latin1[i]) {
8692 Py_DECREF(unicode_latin1[i]);
8693 unicode_latin1[i] = NULL;
8694 }
8695 }
8696
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008697 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 PyUnicodeObject *v = u;
8699 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008700 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008701 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008702 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008703 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008705 unicode_freelist = NULL;
8706 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008708
Walter Dörwald16807132007-05-25 13:52:07 +00008709void
8710PyUnicode_InternInPlace(PyObject **p)
8711{
8712 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8713 PyObject *t;
8714 if (s == NULL || !PyUnicode_Check(s))
8715 Py_FatalError(
8716 "PyUnicode_InternInPlace: unicode strings only please!");
8717 /* If it's a subclass, we don't really know what putting
8718 it in the interned dict might do. */
8719 if (!PyUnicode_CheckExact(s))
8720 return;
8721 if (PyUnicode_CHECK_INTERNED(s))
8722 return;
8723 if (interned == NULL) {
8724 interned = PyDict_New();
8725 if (interned == NULL) {
8726 PyErr_Clear(); /* Don't leave an exception */
8727 return;
8728 }
8729 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008730 /* It might be that the GetItem call fails even
8731 though the key is present in the dictionary,
8732 namely when this happens during a stack overflow. */
8733 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008734 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008735 Py_END_ALLOW_RECURSION
8736
Walter Dörwald16807132007-05-25 13:52:07 +00008737 if (t) {
8738 Py_INCREF(t);
8739 Py_DECREF(*p);
8740 *p = t;
8741 return;
8742 }
8743
Martin v. Löwis5b222132007-06-10 09:51:05 +00008744 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008745 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8746 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008747 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008748 return;
8749 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008750 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008751 /* The two references in interned are not counted by refcnt.
8752 The deallocator will take care of this */
8753 s->ob_refcnt -= 2;
8754 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8755}
8756
8757void
8758PyUnicode_InternImmortal(PyObject **p)
8759{
8760 PyUnicode_InternInPlace(p);
8761 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8762 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8763 Py_INCREF(*p);
8764 }
8765}
8766
8767PyObject *
8768PyUnicode_InternFromString(const char *cp)
8769{
8770 PyObject *s = PyUnicode_FromString(cp);
8771 if (s == NULL)
8772 return NULL;
8773 PyUnicode_InternInPlace(&s);
8774 return s;
8775}
8776
8777void _Py_ReleaseInternedUnicodeStrings(void)
8778{
8779 PyObject *keys;
8780 PyUnicodeObject *s;
8781 Py_ssize_t i, n;
8782 Py_ssize_t immortal_size = 0, mortal_size = 0;
8783
8784 if (interned == NULL || !PyDict_Check(interned))
8785 return;
8786 keys = PyDict_Keys(interned);
8787 if (keys == NULL || !PyList_Check(keys)) {
8788 PyErr_Clear();
8789 return;
8790 }
8791
8792 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8793 detector, interned unicode strings are not forcibly deallocated;
8794 rather, we give them their stolen references back, and then clear
8795 and DECREF the interned dict. */
8796
8797 n = PyList_GET_SIZE(keys);
8798 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8799 n);
8800 for (i = 0; i < n; i++) {
8801 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8802 switch (s->state) {
8803 case SSTATE_NOT_INTERNED:
8804 /* XXX Shouldn't happen */
8805 break;
8806 case SSTATE_INTERNED_IMMORTAL:
8807 s->ob_refcnt += 1;
8808 immortal_size += s->length;
8809 break;
8810 case SSTATE_INTERNED_MORTAL:
8811 s->ob_refcnt += 2;
8812 mortal_size += s->length;
8813 break;
8814 default:
8815 Py_FatalError("Inconsistent interned string state.");
8816 }
8817 s->state = SSTATE_NOT_INTERNED;
8818 }
8819 fprintf(stderr, "total size of all interned strings: "
8820 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8821 "mortal/immortal\n", mortal_size, immortal_size);
8822 Py_DECREF(keys);
8823 PyDict_Clear(interned);
8824 Py_DECREF(interned);
8825 interned = NULL;
8826}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008827
8828
8829/********************* Unicode Iterator **************************/
8830
8831typedef struct {
8832 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008833 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008834 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8835} unicodeiterobject;
8836
8837static void
8838unicodeiter_dealloc(unicodeiterobject *it)
8839{
8840 _PyObject_GC_UNTRACK(it);
8841 Py_XDECREF(it->it_seq);
8842 PyObject_GC_Del(it);
8843}
8844
8845static int
8846unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8847{
8848 Py_VISIT(it->it_seq);
8849 return 0;
8850}
8851
8852static PyObject *
8853unicodeiter_next(unicodeiterobject *it)
8854{
8855 PyUnicodeObject *seq;
8856 PyObject *item;
8857
8858 assert(it != NULL);
8859 seq = it->it_seq;
8860 if (seq == NULL)
8861 return NULL;
8862 assert(PyUnicode_Check(seq));
8863
8864 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008865 item = PyUnicode_FromUnicode(
8866 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008867 if (item != NULL)
8868 ++it->it_index;
8869 return item;
8870 }
8871
8872 Py_DECREF(seq);
8873 it->it_seq = NULL;
8874 return NULL;
8875}
8876
8877static PyObject *
8878unicodeiter_len(unicodeiterobject *it)
8879{
8880 Py_ssize_t len = 0;
8881 if (it->it_seq)
8882 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8883 return PyInt_FromSsize_t(len);
8884}
8885
8886PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8887
8888static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008889 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8890 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008891 {NULL, NULL} /* sentinel */
8892};
8893
8894PyTypeObject PyUnicodeIter_Type = {
8895 PyObject_HEAD_INIT(&PyType_Type)
8896 0, /* ob_size */
8897 "unicodeiterator", /* tp_name */
8898 sizeof(unicodeiterobject), /* tp_basicsize */
8899 0, /* tp_itemsize */
8900 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008901 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008902 0, /* tp_print */
8903 0, /* tp_getattr */
8904 0, /* tp_setattr */
8905 0, /* tp_compare */
8906 0, /* tp_repr */
8907 0, /* tp_as_number */
8908 0, /* tp_as_sequence */
8909 0, /* tp_as_mapping */
8910 0, /* tp_hash */
8911 0, /* tp_call */
8912 0, /* tp_str */
8913 PyObject_GenericGetAttr, /* tp_getattro */
8914 0, /* tp_setattro */
8915 0, /* tp_as_buffer */
8916 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8917 0, /* tp_doc */
8918 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8919 0, /* tp_clear */
8920 0, /* tp_richcompare */
8921 0, /* tp_weaklistoffset */
8922 PyObject_SelfIter, /* tp_iter */
8923 (iternextfunc)unicodeiter_next, /* tp_iternext */
8924 unicodeiter_methods, /* tp_methods */
8925 0,
8926};
8927
8928static PyObject *
8929unicode_iter(PyObject *seq)
8930{
8931 unicodeiterobject *it;
8932
8933 if (!PyUnicode_Check(seq)) {
8934 PyErr_BadInternalCall();
8935 return NULL;
8936 }
8937 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8938 if (it == NULL)
8939 return NULL;
8940 it->it_index = 0;
8941 Py_INCREF(seq);
8942 it->it_seq = (PyUnicodeObject *)seq;
8943 _PyObject_GC_TRACK(it);
8944 return (PyObject *)it;
8945}
8946
Martin v. Löwis5b222132007-06-10 09:51:05 +00008947size_t
8948Py_UNICODE_strlen(const Py_UNICODE *u)
8949{
8950 int res = 0;
8951 while(*u++)
8952 res++;
8953 return res;
8954}
8955
8956Py_UNICODE*
8957Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8958{
8959 Py_UNICODE *u = s1;
8960 while ((*u++ = *s2++));
8961 return s1;
8962}
8963
8964Py_UNICODE*
8965Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8966{
8967 Py_UNICODE *u = s1;
8968 while ((*u++ = *s2++))
8969 if (n-- == 0)
8970 break;
8971 return s1;
8972}
8973
8974int
8975Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8976{
8977 while (*s1 && *s2 && *s1 == *s2)
8978 s1++, s2++;
8979 if (*s1 && *s2)
8980 return (*s1 < *s2) ? -1 : +1;
8981 if (*s1)
8982 return 1;
8983 if (*s2)
8984 return -1;
8985 return 0;
8986}
8987
8988Py_UNICODE*
8989Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
8990{
8991 const Py_UNICODE *p;
8992 for (p = s; *p; p++)
8993 if (*p == c)
8994 return (Py_UNICODE*)p;
8995 return NULL;
8996}
8997
8998
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008999#ifdef __cplusplus
9000}
9001#endif
9002
9003
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009004/*
9005Local variables:
9006c-basic-offset: 4
9007indent-tabs-mode: nil
9008End:
9009*/