blob: 39b746288b58d87c4f073e0fb33586c0e7a31985 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the low 5 bits
   of each Unicode character serving as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Integer type holding the single-word bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() as a cheap pre-filter. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long on purpose: shifting a plain int 1
   left by 31 overflows a signed int, which is undefined behaviour.  `mask`
   is parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True if `ch` is a line break character; the bloom test rejects most
   non-members before the full Py_UNICODE_ISLINEBREAK() check runs. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* True if `chr` is a member of `set`: the bloom test filters out most
   non-members before the linear scan in unicode_member().  The expansion
   is fully parenthesized so that it stays a single operand when combined
   with other operators (e.g. `!BLOOM_MEMBER(...)`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000341 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
/* Internal API for use in unicodeobject.c only !
   Calls PyUnicode_Resize() after casting the variable's address to
   PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000441 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
446 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000447 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
Martin v. Löwis5b222132007-06-10 09:51:05 +0000461 while (size--)
462 *p++ = *u++;
463 /* Don't need to write trailing 0 because
464 that's already done by _PyUnicode_New */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
467 return (PyObject *)unicode;
468}
469
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470PyObject *PyUnicode_FromString(const char *u)
471{
472 size_t size = strlen(u);
473 if (size > PY_SSIZE_T_MAX) {
474 PyErr_SetString(PyExc_OverflowError, "input too long");
475 return NULL;
476 }
477
478 return PyUnicode_FromStringAndSize(u, size);
479}
480
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485{
486 PyUnicodeObject *unicode;
487
488 if (w == NULL) {
489 PyErr_BadInternalCall();
490 return NULL;
491 }
492
493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000500#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 {
502 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000505 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 *u++ = *w++;
507 }
508#endif
509
510 return (PyObject *)unicode;
511}
512
Walter Dörwald346737f2007-05-31 10:44:43 +0000513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516 *fmt++ = '%';
517 if (width) {
518 if (zeropad)
519 *fmt++ = '0';
520 fmt += sprintf(fmt, "%d", width);
521 }
522 if (precision)
523 fmt += sprintf(fmt, ".%d", precision);
524 if (longflag)
525 *fmt++ = 'l';
526 else if (size_tflag) {
527 char *f = PY_FORMAT_SIZE_T;
528 while (*f)
529 *fmt++ = *f++;
530 }
531 *fmt++ = c;
532 *fmt = '\0';
533}
534
/* Append the NUL-terminated char string `string` at the output cursor `s`,
   one element per char, advancing `s` past the copied data.  Relies on the
   variables `copy` (scratch pointer) and `s` existing in the calling scope. */
#define appendstring(string) {for (copy = (string); *copy; ) *s++ = *copy++;}
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000541 Py_ssize_t callcount = 0;
542 PyObject **callresults = NULL;
543 PyObject **callresult;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000545 int width = 0;
546 int precision = 0;
547 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 const char* f;
549 Py_UNICODE *s;
550 PyObject *string;
551 /* used by sprintf */
552 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000553 /* use abuffer instead of buffer, if we need more space
554 * (which can happen if there's a format specifier with width). */
555 char *abuffer = NULL;
556 char *realbuffer;
557 Py_ssize_t abuffersize = 0;
558 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559 const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562 Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef __va_copy
565 __va_copy(count, vargs);
566#else
567 count = vargs;
568#endif
569#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 /* step 1: count the number of %S/%R format specifications
571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 ++callcount;
576 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 /* step 2: allocate memory for the results of
578 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 if (callcount) {
580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581 if (!callresults) {
582 PyErr_NoMemory();
583 return NULL;
584 }
585 callresult = callresults;
586 }
587 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 for (f = format; *f; f++) {
589 if (*f == '%') {
590 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000591 width = 0;
592 while (isdigit(Py_CHARMASK(*f)))
593 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595 ;
596
597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598 * they don't affect the amount of space we reserve.
599 */
600 if ((*f == 'l' || *f == 'z') &&
601 (f[1] == 'd' || f[1] == 'u'))
602 ++f;
603
604 switch (*f) {
605 case 'c':
606 (void)va_arg(count, int);
607 /* fall through... */
608 case '%':
609 n++;
610 break;
611 case 'd': case 'u': case 'i': case 'x':
612 (void) va_arg(count, int);
613 /* 20 bytes is enough to hold a 64-bit
614 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000615 This isn't enough for octal.
616 If a width is specified we need more
617 (which we allocate later). */
618 if (width < 20)
619 width = 20;
620 n += width;
621 if (abuffersize < width)
622 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000623 break;
624 case 's':
625 n += strlen(va_arg(count, char*));
626 break;
627 case 'U':
628 {
629 PyObject *obj = va_arg(count, PyObject *);
630 assert(obj && PyUnicode_Check(obj));
631 n += PyUnicode_GET_SIZE(obj);
632 break;
633 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000634 case 'V':
635 {
636 PyObject *obj = va_arg(count, PyObject *);
637 const char *str = va_arg(count, const char *);
638 assert(obj || str);
639 assert(!obj || PyUnicode_Check(obj));
640 if (obj)
641 n += PyUnicode_GET_SIZE(obj);
642 else
643 n += strlen(str);
644 break;
645 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000646 case 'S':
647 {
648 PyObject *obj = va_arg(count, PyObject *);
649 PyObject *str;
650 assert(obj);
651 str = PyObject_Unicode(obj);
652 if (!str)
653 goto fail;
654 n += PyUnicode_GET_SIZE(str);
655 /* Remember the str and switch to the next slot */
656 *callresult++ = str;
657 break;
658 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 case 'R':
660 {
661 PyObject *obj = va_arg(count, PyObject *);
662 PyObject *repr;
663 assert(obj);
664 repr = PyObject_Repr(obj);
665 if (!repr)
666 goto fail;
667 n += PyUnicode_GET_SIZE(repr);
668 /* Remember the repr and switch to the next slot */
669 *callresult++ = repr;
670 break;
671 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000672 case 'p':
673 (void) va_arg(count, int);
674 /* maximum 64-bit pointer representation:
675 * 0xffffffffffffffff
676 * so 19 characters is enough.
677 * XXX I count 18 -- what's the extra for?
678 */
679 n += 19;
680 break;
681 default:
682 /* if we stumble upon an unknown
683 formatting code, copy the rest of
684 the format string to the output
685 string. (we cannot just skip the
686 code, since there's no way to know
687 what's in the argument list) */
688 n += strlen(p);
689 goto expand;
690 }
691 } else
692 n++;
693 }
694 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 if (abuffersize > 20) {
696 abuffer = PyMem_Malloc(abuffersize);
697 if (!abuffer) {
698 PyErr_NoMemory();
699 goto fail;
700 }
701 realbuffer = abuffer;
702 }
703 else
704 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000705 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000707 we don't have to resize the string.
708 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 string = PyUnicode_FromUnicode(NULL, n);
710 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000711 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000714 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f++;
719 int longflag = 0;
720 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 zeropad = (*f == '0');
722 /* parse the width.precision part */
723 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 width = (width*10) + *f++ - '0';
726 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 if (*f == '.') {
728 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000732 /* handle the long flag, but only for %ld and %lu.
733 others can be added when necessary. */
734 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
735 longflag = 1;
736 ++f;
737 }
738 /* handle the size_t flag. */
739 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
740 size_tflag = 1;
741 ++f;
742 }
743
744 switch (*f) {
745 case 'c':
746 *s++ = va_arg(vargs, int);
747 break;
748 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000753 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 sprintf(realbuffer, fmt, va_arg(vargs, int));
756 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 break;
758 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000763 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
766 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 break;
768 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000769 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
770 sprintf(realbuffer, fmt, va_arg(vargs, int));
771 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000772 break;
773 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000774 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
775 sprintf(realbuffer, fmt, va_arg(vargs, int));
776 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000777 break;
778 case 's':
779 p = va_arg(vargs, char*);
780 appendstring(p);
781 break;
782 case 'U':
783 {
784 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000785 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
786 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
787 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 break;
789 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000790 case 'V':
791 {
792 PyObject *obj = va_arg(vargs, PyObject *);
793 const char *str = va_arg(vargs, const char *);
794 if (obj) {
795 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
796 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
797 s += size;
798 } else {
799 appendstring(str);
800 }
801 break;
802 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000803 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000804 case 'R':
805 {
806 /* unused, since we already have the result */
807 (void) va_arg(vargs, PyObject *);
808 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(*callresult);
809 Py_ssize_t usize = PyUnicode_GET_SIZE(*callresult);
810 Py_ssize_t upos;
811 for (upos = 0; upos<usize;)
812 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000813 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000814 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000815 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 ++callresult;
817 break;
818 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819 case 'p':
820 sprintf(buffer, "%p", va_arg(vargs, void*));
821 /* %p is ill-defined: ensure leading 0x. */
822 if (buffer[1] == 'X')
823 buffer[1] = 'x';
824 else if (buffer[1] != 'x') {
825 memmove(buffer+2, buffer, strlen(buffer)+1);
826 buffer[0] = '0';
827 buffer[1] = 'x';
828 }
829 appendstring(buffer);
830 break;
831 case '%':
832 *s++ = '%';
833 break;
834 default:
835 appendstring(p);
836 goto end;
837 }
838 } else
839 *s++ = *f;
840 }
841
842 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000843 if (callresults)
844 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000845 if (abuffer)
846 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000847 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
848 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000849 fail:
850 if (callresults) {
851 PyObject **callresult2 = callresults;
852 while (callresult2 <= callresult) {
853 Py_DECREF(*callresult2);
854 ++callresult2;
855 }
856 PyMem_Free(callresults);
857 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000858 if (abuffer)
859 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000860 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000861}
862
863#undef appendstring
864
865PyObject *
866PyUnicode_FromFormat(const char *format, ...)
867{
868 PyObject* ret;
869 va_list vargs;
870
871#ifdef HAVE_STDARG_PROTOTYPES
872 va_start(vargs, format);
873#else
874 va_start(vargs);
875#endif
876 ret = PyUnicode_FromFormatV(format, vargs);
877 va_end(vargs);
878 return ret;
879}
880
Martin v. Löwis18e16552006-02-15 17:27:45 +0000881Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
882 wchar_t *w,
883 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884{
885 if (unicode == NULL) {
886 PyErr_BadInternalCall();
887 return -1;
888 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000889
890 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000891 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000892 size = PyUnicode_GET_SIZE(unicode) + 1;
893
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894#ifdef HAVE_USABLE_WCHAR_T
895 memcpy(w, unicode->str, size * sizeof(wchar_t));
896#else
897 {
898 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000899 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000901 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902 *w++ = *u++;
903 }
904#endif
905
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000906 if (size > PyUnicode_GET_SIZE(unicode))
907 return PyUnicode_GET_SIZE(unicode);
908 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000909 return size;
910}
911
912#endif
913
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000914PyObject *PyUnicode_FromOrdinal(int ordinal)
915{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000916 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000917
918#ifdef Py_UNICODE_WIDE
919 if (ordinal < 0 || ordinal > 0x10ffff) {
920 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000921 "chr() arg not in range(0x110000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000922 "(wide Python build)");
923 return NULL;
924 }
925#else
926 if (ordinal < 0 || ordinal > 0xffff) {
927 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000928 "chr() arg not in range(0x10000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000929 "(narrow Python build)");
930 return NULL;
931 }
932#endif
933
Hye-Shik Chang40574832004-04-06 07:24:51 +0000934 s[0] = (Py_UNICODE)ordinal;
935 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000936}
937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938PyObject *PyUnicode_FromObject(register PyObject *obj)
939{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000940 /* XXX Perhaps we should make this API an alias of
941 PyObject_Unicode() instead ?! */
942 if (PyUnicode_CheckExact(obj)) {
943 Py_INCREF(obj);
944 return obj;
945 }
946 if (PyUnicode_Check(obj)) {
947 /* For a Unicode subtype that's not a Unicode object,
948 return a true Unicode object with the same data. */
949 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
950 PyUnicode_GET_SIZE(obj));
951 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000952 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
953}
954
955PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
956 const char *encoding,
957 const char *errors)
958{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000959 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000961 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 if (obj == NULL) {
964 PyErr_BadInternalCall();
965 return NULL;
966 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000967
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000968#if 0
969 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000970 that no encodings is given and then redirect to
971 PyObject_Unicode() which then applies the additional logic for
972 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000973
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000974 NOTE: This API should really only be used for object which
975 represent *encoded* Unicode !
976
977 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 if (PyUnicode_Check(obj)) {
979 if (encoding) {
980 PyErr_SetString(PyExc_TypeError,
981 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000982 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000985 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000986#else
987 if (PyUnicode_Check(obj)) {
988 PyErr_SetString(PyExc_TypeError,
989 "decoding Unicode is not supported");
990 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000991 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000992#endif
993
994 /* Coerce object */
995 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000996 s = PyString_AS_STRING(obj);
997 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000998 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000999 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1000 /* Overwrite the error message with something more useful in
1001 case of a TypeError. */
1002 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001003 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001004 "coercing to Unicode: need string or buffer, "
1005 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001006 obj->ob_type->tp_name);
1007 goto onError;
1008 }
Tim Petersced69f82003-09-16 20:30:58 +00001009
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 if (len == 0) {
1012 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 }
Tim Petersced69f82003-09-16 20:30:58 +00001015 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001017
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 return v;
1019
1020 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022}
1023
1024PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001025 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 const char *encoding,
1027 const char *errors)
1028{
1029 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001030
1031 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001032 encoding = PyUnicode_GetDefaultEncoding();
1033
1034 /* Shortcuts for common default encodings */
1035 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001037 else if (strcmp(encoding, "latin-1") == 0)
1038 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001039#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1040 else if (strcmp(encoding, "mbcs") == 0)
1041 return PyUnicode_DecodeMBCS(s, size, errors);
1042#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001043 else if (strcmp(encoding, "ascii") == 0)
1044 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
1046 /* Decode via the codec registry */
1047 buffer = PyBuffer_FromMemory((void *)s, size);
1048 if (buffer == NULL)
1049 goto onError;
1050 unicode = PyCodec_Decode(buffer, encoding, errors);
1051 if (unicode == NULL)
1052 goto onError;
1053 if (!PyUnicode_Check(unicode)) {
1054 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001055 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 unicode->ob_type->tp_name);
1057 Py_DECREF(unicode);
1058 goto onError;
1059 }
1060 Py_DECREF(buffer);
1061 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063 onError:
1064 Py_XDECREF(buffer);
1065 return NULL;
1066}
1067
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001068PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1069 const char *encoding,
1070 const char *errors)
1071{
1072 PyObject *v;
1073
1074 if (!PyUnicode_Check(unicode)) {
1075 PyErr_BadArgument();
1076 goto onError;
1077 }
1078
1079 if (encoding == NULL)
1080 encoding = PyUnicode_GetDefaultEncoding();
1081
1082 /* Decode via the codec registry */
1083 v = PyCodec_Decode(unicode, encoding, errors);
1084 if (v == NULL)
1085 goto onError;
1086 return v;
1087
1088 onError:
1089 return NULL;
1090}
1091
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 const char *encoding,
1095 const char *errors)
1096{
1097 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001098
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099 unicode = PyUnicode_FromUnicode(s, size);
1100 if (unicode == NULL)
1101 return NULL;
1102 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1103 Py_DECREF(unicode);
1104 return v;
1105}
1106
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001107PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1108 const char *encoding,
1109 const char *errors)
1110{
1111 PyObject *v;
1112
1113 if (!PyUnicode_Check(unicode)) {
1114 PyErr_BadArgument();
1115 goto onError;
1116 }
1117
1118 if (encoding == NULL)
1119 encoding = PyUnicode_GetDefaultEncoding();
1120
1121 /* Encode via the codec registry */
1122 v = PyCodec_Encode(unicode, encoding, errors);
1123 if (v == NULL)
1124 goto onError;
1125 return v;
1126
1127 onError:
1128 return NULL;
1129}
1130
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1132 const char *encoding,
1133 const char *errors)
1134{
1135 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001136
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 if (!PyUnicode_Check(unicode)) {
1138 PyErr_BadArgument();
1139 goto onError;
1140 }
Fred Drakee4315f52000-05-09 19:53:39 +00001141
Tim Petersced69f82003-09-16 20:30:58 +00001142 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001143 encoding = PyUnicode_GetDefaultEncoding();
1144
1145 /* Shortcuts for common default encodings */
1146 if (errors == NULL) {
1147 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001148 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001149 else if (strcmp(encoding, "latin-1") == 0)
1150 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001151#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1152 else if (strcmp(encoding, "mbcs") == 0)
1153 return PyUnicode_AsMBCSString(unicode);
1154#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001155 else if (strcmp(encoding, "ascii") == 0)
1156 return PyUnicode_AsASCIIString(unicode);
1157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158
1159 /* Encode via the codec registry */
1160 v = PyCodec_Encode(unicode, encoding, errors);
1161 if (v == NULL)
1162 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001163 if (!PyBytes_Check(v)) {
1164 if (PyString_Check(v)) {
1165 /* Old codec, turn it into bytes */
1166 PyObject *b = PyBytes_FromObject(v);
1167 Py_DECREF(v);
1168 return b;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001171 "encoder did not return a bytes object "
1172 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1173 v->ob_type->tp_name,
1174 encoding ? encoding : "NULL",
1175 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 Py_DECREF(v);
1177 goto onError;
1178 }
1179 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001180
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 onError:
1182 return NULL;
1183}
1184
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001185PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1186 const char *errors)
1187{
1188 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001189 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001190 if (v)
1191 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001192 if (errors != NULL)
1193 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1194 if (errors == NULL) {
1195 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1196 PyUnicode_GET_SIZE(unicode),
1197 NULL);
1198 }
1199 else {
1200 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1201 }
1202 if (!b)
1203 return NULL;
1204 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1205 PyBytes_Size(b));
1206 Py_DECREF(b);
1207 if (!errors) {
1208 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001209 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001210 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001211 return v;
1212}
1213
Martin v. Löwis5b222132007-06-10 09:51:05 +00001214char*
1215PyUnicode_AsString(PyObject *unicode)
1216{
1217 assert(PyUnicode_Check(unicode));
1218 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1219 if (!unicode)
1220 return NULL;
1221 return PyString_AsString(unicode);
1222}
1223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1225{
1226 if (!PyUnicode_Check(unicode)) {
1227 PyErr_BadArgument();
1228 goto onError;
1229 }
1230 return PyUnicode_AS_UNICODE(unicode);
1231
1232 onError:
1233 return NULL;
1234}
1235
Martin v. Löwis18e16552006-02-15 17:27:45 +00001236Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237{
1238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 goto onError;
1241 }
1242 return PyUnicode_GET_SIZE(unicode);
1243
1244 onError:
1245 return -1;
1246}
1247
Thomas Wouters78890102000-07-22 19:25:51 +00001248const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001249{
1250 return unicode_default_encoding;
1251}
1252
1253int PyUnicode_SetDefaultEncoding(const char *encoding)
1254{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001255 if (strcmp(encoding, unicode_default_encoding) != 0) {
1256 PyErr_Format(PyExc_ValueError,
1257 "Can only set default encoding to %s",
1258 unicode_default_encoding);
1259 return -1;
1260 }
Fred Drakee4315f52000-05-09 19:53:39 +00001261 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001262}
1263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001264/* error handling callback helper:
1265 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001266 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 and adjust various state variables.
1268 return 0 on success, -1 on error
1269*/
1270
1271static
1272int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1273 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1275 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001277 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278
1279 PyObject *restuple = NULL;
1280 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001281 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1282 Py_ssize_t requiredsize;
1283 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001285 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 int res = -1;
1287
1288 if (*errorHandler == NULL) {
1289 *errorHandler = PyCodec_LookupError(errors);
1290 if (*errorHandler == NULL)
1291 goto onError;
1292 }
1293
1294 if (*exceptionObject == NULL) {
1295 *exceptionObject = PyUnicodeDecodeError_Create(
1296 encoding, input, insize, *startinpos, *endinpos, reason);
1297 if (*exceptionObject == NULL)
1298 goto onError;
1299 }
1300 else {
1301 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1302 goto onError;
1303 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1304 goto onError;
1305 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1306 goto onError;
1307 }
1308
1309 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1310 if (restuple == NULL)
1311 goto onError;
1312 if (!PyTuple_Check(restuple)) {
1313 PyErr_Format(PyExc_TypeError, &argparse[4]);
1314 goto onError;
1315 }
1316 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1317 goto onError;
1318 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001319 newpos = insize+newpos;
1320 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001321 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001322 goto onError;
1323 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001324
1325 /* need more space? (at least enough for what we
1326 have+the replacement+the rest of the string (starting
1327 at the new input position), so we won't have to check space
1328 when there are no errors in the rest of the string) */
1329 repptr = PyUnicode_AS_UNICODE(repunicode);
1330 repsize = PyUnicode_GET_SIZE(repunicode);
1331 requiredsize = *outpos + repsize + insize-newpos;
1332 if (requiredsize > outsize) {
1333 if (requiredsize<2*outsize)
1334 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001335 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001336 goto onError;
1337 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1338 }
1339 *endinpos = newpos;
1340 *inptr = input + newpos;
1341 Py_UNICODE_COPY(*outptr, repptr, repsize);
1342 *outptr += repsize;
1343 *outpos += repsize;
1344 /* we made it! */
1345 res = 0;
1346
1347 onError:
1348 Py_XDECREF(restuple);
1349 return res;
1350}
1351
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC 2152 for details. */
1355
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character code.  Indicates whether a UTF-7 character is
   special i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1374
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must be base64-encoded: it lies outside 1..127, is
   always special, or belongs to an optional class (whitespace / Set O)
   whose encoding was requested through encodeWS / encodeO. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  isalnum() is only defined for values
   representable as unsigned char (or EOF) and is locale dependent above
   127, so it is only consulted for ASCII values; c may be a Py_UNICODE
   well outside that range. */
#define B64CHAR(c) \
    (((c) > 0 && (c) < 128 && isalnum(c)) || (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit c; only meaningful when B64CHAR(c)
   holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out for as long as at least six of the `bits`
   pending bits in ch remain; updates out and bits in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit code units to out for as long as at least 16 of the `bits`
   pending bits in ch remain.  Expects `errmsg` and the label `utf7Error`
   to exist at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001417
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001418PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001419 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420 const char *errors)
1421{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t startinpos;
1424 Py_ssize_t endinpos;
1425 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001426 const char *e;
1427 PyUnicodeObject *unicode;
1428 Py_UNICODE *p;
1429 const char *errmsg = "";
1430 int inShift = 0;
1431 unsigned int bitsleft = 0;
1432 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433 int surrogate = 0;
1434 PyObject *errorHandler = NULL;
1435 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001436
1437 unicode = _PyUnicode_New(size);
1438 if (!unicode)
1439 return NULL;
1440 if (size == 0)
1441 return (PyObject *)unicode;
1442
1443 p = unicode->str;
1444 e = s + size;
1445
1446 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447 Py_UNICODE ch;
1448 restart:
1449 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001450
1451 if (inShift) {
1452 if ((ch == '-') || !B64CHAR(ch)) {
1453 inShift = 0;
1454 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001455
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001456 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1457 if (bitsleft >= 6) {
1458 /* The shift sequence has a partial character in it. If
1459 bitsleft < 6 then we could just classify it as padding
1460 but that is not the case here */
1461
1462 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001463 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001464 }
1465 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001466 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467 here so indicate the potential of a misencoded character. */
1468
1469 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1470 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1471 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001472 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001473 }
1474
1475 if (ch == '-') {
1476 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001477 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001478 inShift = 1;
1479 }
1480 } else if (SPECIAL(ch,0,0)) {
1481 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001482 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001483 } else {
1484 *p++ = ch;
1485 }
1486 } else {
1487 charsleft = (charsleft << 6) | UB64(ch);
1488 bitsleft += 6;
1489 s++;
1490 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1491 }
1492 }
1493 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001495 s++;
1496 if (s < e && *s == '-') {
1497 s++;
1498 *p++ = '+';
1499 } else
1500 {
1501 inShift = 1;
1502 bitsleft = 0;
1503 }
1504 }
1505 else if (SPECIAL(ch,0,0)) {
1506 errmsg = "unexpected special character";
1507 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001508 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509 }
1510 else {
1511 *p++ = ch;
1512 s++;
1513 }
1514 continue;
1515 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516 outpos = p-PyUnicode_AS_UNICODE(unicode);
1517 endinpos = s-starts;
1518 if (unicode_decode_call_errorhandler(
1519 errors, &errorHandler,
1520 "utf7", errmsg,
1521 starts, size, &startinpos, &endinpos, &exc, &s,
1522 (PyObject **)&unicode, &outpos, &p))
1523 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001524 }
1525
1526 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527 outpos = p-PyUnicode_AS_UNICODE(unicode);
1528 endinpos = size;
1529 if (unicode_decode_call_errorhandler(
1530 errors, &errorHandler,
1531 "utf7", "unterminated shift sequence",
1532 starts, size, &startinpos, &endinpos, &exc, &s,
1533 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001535 if (s < e)
1536 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001537 }
1538
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001539 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001540 goto onError;
1541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542 Py_XDECREF(errorHandler);
1543 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544 return (PyObject *)unicode;
1545
1546onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_XDECREF(errorHandler);
1548 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 Py_DECREF(unicode);
1550 return NULL;
1551}
1552
1553
1554PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001555 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001556 int encodeSetO,
1557 int encodeWhiteSpace,
1558 const char *errors)
1559{
1560 PyObject *v;
1561 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001564 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 unsigned int bitsleft = 0;
1566 unsigned long charsleft = 0;
1567 char * out;
1568 char * start;
1569
1570 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001571 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572
Walter Dörwald51ab4142007-05-05 14:43:36 +00001573 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001574 if (v == NULL)
1575 return NULL;
1576
Walter Dörwald51ab4142007-05-05 14:43:36 +00001577 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 for (;i < size; ++i) {
1579 Py_UNICODE ch = s[i];
1580
1581 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001582 if (ch == '+') {
1583 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584 *out++ = '-';
1585 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1586 charsleft = ch;
1587 bitsleft = 16;
1588 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001589 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001591 } else {
1592 *out++ = (char) ch;
1593 }
1594 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1596 *out++ = B64(charsleft << (6-bitsleft));
1597 charsleft = 0;
1598 bitsleft = 0;
1599 /* Characters not in the BASE64 set implicitly unshift the sequence
1600 so no '-' is required, except if the character is itself a '-' */
1601 if (B64CHAR(ch) || ch == '-') {
1602 *out++ = '-';
1603 }
1604 inShift = 0;
1605 *out++ = (char) ch;
1606 } else {
1607 bitsleft += 16;
1608 charsleft = (charsleft << 16) | ch;
1609 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1610
1611 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001612 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613 or '-' then the shift sequence will be terminated implicitly and we
1614 don't have to insert a '-'. */
1615
1616 if (bitsleft == 0) {
1617 if (i + 1 < size) {
1618 Py_UNICODE ch2 = s[i+1];
1619
1620 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001621
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 } else if (B64CHAR(ch2) || ch2 == '-') {
1623 *out++ = '-';
1624 inShift = 0;
1625 } else {
1626 inShift = 0;
1627 }
1628
1629 }
1630 else {
1631 *out++ = '-';
1632 inShift = 0;
1633 }
1634 }
Tim Petersced69f82003-09-16 20:30:58 +00001635 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 if (bitsleft) {
1639 *out++= B64(charsleft << (6-bitsleft) );
1640 *out++ = '-';
1641 }
1642
Walter Dörwald51ab4142007-05-05 14:43:36 +00001643 if (PyBytes_Resize(v, out - start)) {
1644 Py_DECREF(v);
1645 return NULL;
1646 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 return v;
1648}
1649
1650#undef SPECIAL
1651#undef B64
1652#undef B64CHAR
1653#undef UB64
1654#undef ENCODE
1655#undef DECODE
1656
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657/* --- UTF-8 Codec -------------------------------------------------------- */
1658
/* Length in bytes of the UTF-8 sequence introduced by each possible first
   byte (the table index).  A zero entry marks a byte that is not a legal
   prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
1680
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001682 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 const char *errors)
1684{
Walter Dörwald69652032004-09-07 20:24:22 +00001685 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1686}
1687
1688PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001690 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001692{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001693 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001695 Py_ssize_t startinpos;
1696 Py_ssize_t endinpos;
1697 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 const char *e;
1699 PyUnicodeObject *unicode;
1700 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001701 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 PyObject *errorHandler = NULL;
1703 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704
1705 /* Note: size will always be longer than the resulting Unicode
1706 character count */
1707 unicode = _PyUnicode_New(size);
1708 if (!unicode)
1709 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001710 if (size == 0) {
1711 if (consumed)
1712 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
1716 /* Unpack UTF-8 encoded data */
1717 p = unicode->str;
1718 e = s + size;
1719
1720 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001721 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722
1723 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001724 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 s++;
1726 continue;
1727 }
1728
1729 n = utf8_code_length[ch];
1730
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001731 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001732 if (consumed)
1733 break;
1734 else {
1735 errmsg = "unexpected end of data";
1736 startinpos = s-starts;
1737 endinpos = size;
1738 goto utf8Error;
1739 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
1742 switch (n) {
1743
1744 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001745 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 startinpos = s-starts;
1747 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001748 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749
1750 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001751 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 startinpos = s-starts;
1753 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
1756 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 if ((s[1] & 0xc0) != 0x80) {
1758 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 startinpos = s-starts;
1760 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001761 goto utf8Error;
1762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001764 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001765 startinpos = s-starts;
1766 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 errmsg = "illegal encoding";
1768 goto utf8Error;
1769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001771 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 break;
1773
1774 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001775 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001776 (s[2] & 0xc0) != 0x80) {
1777 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 startinpos = s-starts;
1779 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 goto utf8Error;
1781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001783 if (ch < 0x0800) {
1784 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001785 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001786
1787 XXX For wide builds (UCS-4) we should probably try
1788 to recombine the surrogates into a single code
1789 unit.
1790 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001791 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 startinpos = s-starts;
1793 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 goto utf8Error;
1795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001797 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001798 break;
1799
1800 case 4:
1801 if ((s[1] & 0xc0) != 0x80 ||
1802 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001803 (s[3] & 0xc0) != 0x80) {
1804 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 startinpos = s-starts;
1806 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 goto utf8Error;
1808 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001809 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1810 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1811 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001812 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001813 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001815 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001816 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
1819 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 goto utf8Error;
1821 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001822#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001823 *p++ = (Py_UNICODE)ch;
1824#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001825 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001826
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001827 /* translate from 10000..10FFFF to 0..FFFF */
1828 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001829
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001830 /* high surrogate = top 10 bits added to D800 */
1831 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001832
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001833 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001834 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001835#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 break;
1837
1838 default:
1839 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 startinpos = s-starts;
1842 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001843 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
1845 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001847
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 outpos = p-PyUnicode_AS_UNICODE(unicode);
1850 if (unicode_decode_call_errorhandler(
1851 errors, &errorHandler,
1852 "utf8", errmsg,
1853 starts, size, &startinpos, &endinpos, &exc, &s,
1854 (PyObject **)&unicode, &outpos, &p))
1855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 }
Walter Dörwald69652032004-09-07 20:24:22 +00001857 if (consumed)
1858 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859
1860 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001861 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 goto onError;
1863
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 Py_XDECREF(errorHandler);
1865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 return (PyObject *)unicode;
1867
1868onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 Py_XDECREF(errorHandler);
1870 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 Py_DECREF(unicode);
1872 return NULL;
1873}
1874
Tim Peters602f7402002-04-27 18:03:26 +00001875/* Allocation strategy: if the string is short, convert into a stack buffer
1876 and allocate exactly as much space needed at the end. Else allocate the
1877 maximum possible needed (4 result bytes per Unicode character), and return
1878 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001879*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001880PyObject *
1881PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001882 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001883 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884{
Tim Peters602f7402002-04-27 18:03:26 +00001885#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001886
Martin v. Löwis18e16552006-02-15 17:27:45 +00001887 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001888 PyObject *v; /* result string object */
1889 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001890 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001891 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001892 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001893
Tim Peters602f7402002-04-27 18:03:26 +00001894 assert(s != NULL);
1895 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896
Tim Peters602f7402002-04-27 18:03:26 +00001897 if (size <= MAX_SHORT_UNICHARS) {
1898 /* Write into the stack buffer; nallocated can't overflow.
1899 * At the end, we'll allocate exactly as much heap space as it
1900 * turns out we need.
1901 */
1902 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1903 v = NULL; /* will allocate after we're done */
1904 p = stackbuf;
1905 }
1906 else {
1907 /* Overallocate on the heap, and give the excess back at the end. */
1908 nallocated = size * 4;
1909 if (nallocated / 4 != size) /* overflow! */
1910 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001911 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001912 if (v == NULL)
1913 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001914 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001915 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001916
Tim Peters602f7402002-04-27 18:03:26 +00001917 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001918 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001919
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001920 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001921 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001923
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001925 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001926 *p++ = (char)(0xc0 | (ch >> 6));
1927 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001928 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001929 else {
Tim Peters602f7402002-04-27 18:03:26 +00001930 /* Encode UCS2 Unicode ordinals */
1931 if (ch < 0x10000) {
1932 /* Special case: check for high surrogate */
1933 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1934 Py_UCS4 ch2 = s[i];
1935 /* Check for low surrogate and combine the two to
1936 form a UCS4 value */
1937 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001938 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001939 i++;
1940 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001941 }
Tim Peters602f7402002-04-27 18:03:26 +00001942 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001943 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001944 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001945 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1946 *p++ = (char)(0x80 | (ch & 0x3f));
1947 continue;
1948 }
1949encodeUCS4:
1950 /* Encode UCS4 Unicode ordinals */
1951 *p++ = (char)(0xf0 | (ch >> 18));
1952 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1953 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1954 *p++ = (char)(0x80 | (ch & 0x3f));
1955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001957
Tim Peters602f7402002-04-27 18:03:26 +00001958 if (v == NULL) {
1959 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001960 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001961 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001962 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001963 }
1964 else {
1965 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001966 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001967 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001968 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001971
Tim Peters602f7402002-04-27 18:03:26 +00001972#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973}
1974
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1976{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 if (!PyUnicode_Check(unicode)) {
1978 PyErr_BadArgument();
1979 return NULL;
1980 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001981 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1982 PyUnicode_GET_SIZE(unicode),
1983 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984}
1985
1986/* --- UTF-16 Codec ------------------------------------------------------- */
1987
Tim Peters772747b2001-08-09 22:21:55 +00001988PyObject *
1989PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001990 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001991 const char *errors,
1992 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993{
Walter Dörwald69652032004-09-07 20:24:22 +00001994 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1995}
1996
1997PyObject *
1998PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001999 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002000 const char *errors,
2001 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002002 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002003{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002005 Py_ssize_t startinpos;
2006 Py_ssize_t endinpos;
2007 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 PyUnicodeObject *unicode;
2009 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002010 const unsigned char *q, *e;
2011 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002012 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002013 /* Offsets from q for retrieving byte pairs in the right order. */
2014#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2015 int ihi = 1, ilo = 0;
2016#else
2017 int ihi = 0, ilo = 1;
2018#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 PyObject *errorHandler = NULL;
2020 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021
2022 /* Note: size will always be longer than the resulting Unicode
2023 character count */
2024 unicode = _PyUnicode_New(size);
2025 if (!unicode)
2026 return NULL;
2027 if (size == 0)
2028 return (PyObject *)unicode;
2029
2030 /* Unpack UTF-16 encoded data */
2031 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002032 q = (unsigned char *)s;
2033 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
2035 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002036 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002038 /* Check for BOM marks (U+FEFF) in the input and adjust current
2039 byte order setting accordingly. In native mode, the leading BOM
2040 mark is skipped, in all other modes, it is copied to the output
2041 stream as-is (giving a ZWNBSP character). */
2042 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002043 if (size >= 2) {
2044 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002045#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002046 if (bom == 0xFEFF) {
2047 q += 2;
2048 bo = -1;
2049 }
2050 else if (bom == 0xFFFE) {
2051 q += 2;
2052 bo = 1;
2053 }
Tim Petersced69f82003-09-16 20:30:58 +00002054#else
Walter Dörwald69652032004-09-07 20:24:22 +00002055 if (bom == 0xFEFF) {
2056 q += 2;
2057 bo = 1;
2058 }
2059 else if (bom == 0xFFFE) {
2060 q += 2;
2061 bo = -1;
2062 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002063#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002064 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066
Tim Peters772747b2001-08-09 22:21:55 +00002067 if (bo == -1) {
2068 /* force LE */
2069 ihi = 1;
2070 ilo = 0;
2071 }
2072 else if (bo == 1) {
2073 /* force BE */
2074 ihi = 0;
2075 ilo = 1;
2076 }
2077
2078 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002080 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002082 if (consumed)
2083 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 errmsg = "truncated data";
2085 startinpos = ((const char *)q)-starts;
2086 endinpos = ((const char *)e)-starts;
2087 goto utf16Error;
2088 /* The remaining input chars are ignored if the callback
2089 chooses to skip the input */
2090 }
2091 ch = (q[ihi] << 8) | q[ilo];
2092
Tim Peters772747b2001-08-09 22:21:55 +00002093 q += 2;
2094
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 if (ch < 0xD800 || ch > 0xDFFF) {
2096 *p++ = ch;
2097 continue;
2098 }
2099
2100 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002101 if (q >= e) {
2102 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 startinpos = (((const char *)q)-2)-starts;
2104 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002105 goto utf16Error;
2106 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002107 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002108 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2109 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002110 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002111#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002112 *p++ = ch;
2113 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002114#else
2115 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002117 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002118 }
2119 else {
2120 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 startinpos = (((const char *)q)-4)-starts;
2122 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002123 goto utf16Error;
2124 }
2125
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002127 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128 startinpos = (((const char *)q)-2)-starts;
2129 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002130 /* Fall through to report the error */
2131
2132 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 outpos = p-PyUnicode_AS_UNICODE(unicode);
2134 if (unicode_decode_call_errorhandler(
2135 errors, &errorHandler,
2136 "utf16", errmsg,
2137 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2138 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002139 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 }
2141
2142 if (byteorder)
2143 *byteorder = bo;
2144
Walter Dörwald69652032004-09-07 20:24:22 +00002145 if (consumed)
2146 *consumed = (const char *)q-starts;
2147
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002149 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 goto onError;
2151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 Py_XDECREF(errorHandler);
2153 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 return (PyObject *)unicode;
2155
2156onError:
2157 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002158 Py_XDECREF(errorHandler);
2159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 return NULL;
2161}
2162
Tim Peters772747b2001-08-09 22:21:55 +00002163PyObject *
2164PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002165 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002166 const char *errors,
2167 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168{
2169 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002170 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002171#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002172 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002173#else
2174 const int pairs = 0;
2175#endif
Tim Peters772747b2001-08-09 22:21:55 +00002176 /* Offsets from p for storing byte pairs in the right order. */
2177#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2178 int ihi = 1, ilo = 0;
2179#else
2180 int ihi = 0, ilo = 1;
2181#endif
2182
2183#define STORECHAR(CH) \
2184 do { \
2185 p[ihi] = ((CH) >> 8) & 0xff; \
2186 p[ilo] = (CH) & 0xff; \
2187 p += 2; \
2188 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002190#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002191 for (i = pairs = 0; i < size; i++)
2192 if (s[i] >= 0x10000)
2193 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002194#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002195 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002196 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 if (v == NULL)
2198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199
Walter Dörwald3cc34522007-05-04 10:48:27 +00002200 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002202 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002203 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002204 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002205
2206 if (byteorder == -1) {
2207 /* force LE */
2208 ihi = 1;
2209 ilo = 0;
2210 }
2211 else if (byteorder == 1) {
2212 /* force BE */
2213 ihi = 0;
2214 ilo = 1;
2215 }
2216
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002217 while (size-- > 0) {
2218 Py_UNICODE ch = *s++;
2219 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002220#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002221 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002222 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2223 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002225#endif
Tim Peters772747b2001-08-09 22:21:55 +00002226 STORECHAR(ch);
2227 if (ch2)
2228 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002231#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232}
2233
2234PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2235{
2236 if (!PyUnicode_Check(unicode)) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
2240 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2241 PyUnicode_GET_SIZE(unicode),
2242 NULL,
2243 0);
2244}
2245
2246/* --- Unicode Escape Codec ----------------------------------------------- */
2247
Fredrik Lundh06d12682001-01-24 07:59:11 +00002248static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002249
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002251 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 const char *errors)
2253{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002254 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002255 Py_ssize_t startinpos;
2256 Py_ssize_t endinpos;
2257 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002258 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002260 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002262 char* message;
2263 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002264 PyObject *errorHandler = NULL;
2265 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002266
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 /* Escaped strings will always be longer than the resulting
2268 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002269 length after conversion to the true value.
2270 (but if the error callback returns a long replacement string
2271 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 v = _PyUnicode_New(size);
2273 if (v == NULL)
2274 goto onError;
2275 if (size == 0)
2276 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002280
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 while (s < end) {
2282 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002283 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002284 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285
2286 /* Non-escape characters are interpreted as Unicode ordinals */
2287 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002288 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 continue;
2290 }
2291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 /* \ - Escapes */
2294 s++;
2295 switch (*s++) {
2296
2297 /* \x escapes */
2298 case '\n': break;
2299 case '\\': *p++ = '\\'; break;
2300 case '\'': *p++ = '\''; break;
2301 case '\"': *p++ = '\"'; break;
2302 case 'b': *p++ = '\b'; break;
2303 case 'f': *p++ = '\014'; break; /* FF */
2304 case 't': *p++ = '\t'; break;
2305 case 'n': *p++ = '\n'; break;
2306 case 'r': *p++ = '\r'; break;
2307 case 'v': *p++ = '\013'; break; /* VT */
2308 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2309
2310 /* \OOO (octal) escapes */
2311 case '0': case '1': case '2': case '3':
2312 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002313 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002315 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002317 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002319 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320 break;
2321
Fredrik Lundhccc74732001-02-18 22:13:49 +00002322 /* hex escapes */
2323 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002325 digits = 2;
2326 message = "truncated \\xXX escape";
2327 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328
Fredrik Lundhccc74732001-02-18 22:13:49 +00002329 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002331 digits = 4;
2332 message = "truncated \\uXXXX escape";
2333 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334
Fredrik Lundhccc74732001-02-18 22:13:49 +00002335 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002336 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002337 digits = 8;
2338 message = "truncated \\UXXXXXXXX escape";
2339 hexescape:
2340 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 outpos = p-PyUnicode_AS_UNICODE(v);
2342 if (s+digits>end) {
2343 endinpos = size;
2344 if (unicode_decode_call_errorhandler(
2345 errors, &errorHandler,
2346 "unicodeescape", "end of string in escape sequence",
2347 starts, size, &startinpos, &endinpos, &exc, &s,
2348 (PyObject **)&v, &outpos, &p))
2349 goto onError;
2350 goto nextByte;
2351 }
2352 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002353 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002354 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002355 endinpos = (s+i+1)-starts;
2356 if (unicode_decode_call_errorhandler(
2357 errors, &errorHandler,
2358 "unicodeescape", message,
2359 starts, size, &startinpos, &endinpos, &exc, &s,
2360 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002361 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002362 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002363 }
2364 chr = (chr<<4) & ~0xF;
2365 if (c >= '0' && c <= '9')
2366 chr += c - '0';
2367 else if (c >= 'a' && c <= 'f')
2368 chr += 10 + c - 'a';
2369 else
2370 chr += 10 + c - 'A';
2371 }
2372 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002373 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002374 /* _decoding_error will have already written into the
2375 target buffer. */
2376 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002377 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002378 /* when we get here, chr is a 32-bit unicode character */
2379 if (chr <= 0xffff)
2380 /* UCS-2 character */
2381 *p++ = (Py_UNICODE) chr;
2382 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002383 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002384 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002385#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002386 *p++ = chr;
2387#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002388 chr -= 0x10000L;
2389 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002390 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002391#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002392 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393 endinpos = s-starts;
2394 outpos = p-PyUnicode_AS_UNICODE(v);
2395 if (unicode_decode_call_errorhandler(
2396 errors, &errorHandler,
2397 "unicodeescape", "illegal Unicode character",
2398 starts, size, &startinpos, &endinpos, &exc, &s,
2399 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002400 goto onError;
2401 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002402 break;
2403
2404 /* \N{name} */
2405 case 'N':
2406 message = "malformed \\N character escape";
2407 if (ucnhash_CAPI == NULL) {
2408 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002409 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002410 m = PyImport_ImportModule("unicodedata");
2411 if (m == NULL)
2412 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002413 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002414 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002415 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002416 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002417 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002418 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002419 if (ucnhash_CAPI == NULL)
2420 goto ucnhashError;
2421 }
2422 if (*s == '{') {
2423 const char *start = s+1;
2424 /* look for the closing brace */
2425 while (*s != '}' && s < end)
2426 s++;
2427 if (s > start && s < end && *s == '}') {
2428 /* found a name. look it up in the unicode database */
2429 message = "unknown Unicode character name";
2430 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002431 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002432 goto store;
2433 }
2434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002435 endinpos = s-starts;
2436 outpos = p-PyUnicode_AS_UNICODE(v);
2437 if (unicode_decode_call_errorhandler(
2438 errors, &errorHandler,
2439 "unicodeescape", message,
2440 starts, size, &startinpos, &endinpos, &exc, &s,
2441 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002442 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002443 break;
2444
2445 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002446 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002447 message = "\\ at end of string";
2448 s--;
2449 endinpos = s-starts;
2450 outpos = p-PyUnicode_AS_UNICODE(v);
2451 if (unicode_decode_call_errorhandler(
2452 errors, &errorHandler,
2453 "unicodeescape", message,
2454 starts, size, &startinpos, &endinpos, &exc, &s,
2455 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002456 goto onError;
2457 }
2458 else {
2459 *p++ = '\\';
2460 *p++ = (unsigned char)s[-1];
2461 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002462 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 nextByte:
2465 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002467 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002469 Py_XDECREF(errorHandler);
2470 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002472
Fredrik Lundhccc74732001-02-18 22:13:49 +00002473ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002474 PyErr_SetString(
2475 PyExc_UnicodeError,
2476 "\\N escapes not supported (can't load unicodedata module)"
2477 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002478 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 Py_XDECREF(errorHandler);
2480 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002481 return NULL;
2482
Fredrik Lundhccc74732001-02-18 22:13:49 +00002483onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 Py_XDECREF(errorHandler);
2486 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 return NULL;
2488}
2489
2490/* Return a Unicode-Escape string version of the Unicode object.
2491
2492 If quotes is true, the string is enclosed in u"" or u'' quotes as
2493 appropriate.
2494
2495*/
2496
Thomas Wouters477c8d52006-05-27 19:21:47 +00002497Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2498 Py_ssize_t size,
2499 Py_UNICODE ch)
2500{
2501 /* like wcschr, but doesn't stop at NULL characters */
2502
2503 while (size-- > 0) {
2504 if (*s == ch)
2505 return s;
2506 s++;
2507 }
2508
2509 return NULL;
2510}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002511
Walter Dörwald79e913e2007-05-12 11:08:06 +00002512static const char *hexdigits = "0123456789abcdef";
2513
2514PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2515 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516{
2517 PyObject *repr;
2518 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519
Thomas Wouters89f507f2006-12-13 04:49:30 +00002520 /* XXX(nnorwitz): rather than over-allocating, it would be
2521 better to choose a different scheme. Perhaps scan the
2522 first N-chars of the string and allocate based on that size.
2523 */
2524 /* Initial allocation is based on the longest-possible unichr
2525 escape.
2526
2527 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2528 unichr, so in this case it's the longest unichr escape. In
2529 narrow (UTF-16) builds this is five chars per source unichr
2530 since there are two unichrs in the surrogate pair, so in narrow
2531 (UTF-16) builds it's not the longest unichr escape.
2532
2533 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2534 so in the narrow (UTF-16) build case it's the longest unichr
2535 escape.
2536 */
2537
Walter Dörwald79e913e2007-05-12 11:08:06 +00002538 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002539#ifdef Py_UNICODE_WIDE
2540 + 10*size
2541#else
2542 + 6*size
2543#endif
2544 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 if (repr == NULL)
2546 return NULL;
2547
Walter Dörwald79e913e2007-05-12 11:08:06 +00002548 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 while (size-- > 0) {
2551 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002552
Walter Dörwald79e913e2007-05-12 11:08:06 +00002553 /* Escape backslashes */
2554 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 *p++ = '\\';
2556 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002557 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002558 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002559
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002560#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002561 /* Map 21-bit characters to '\U00xxxxxx' */
2562 else if (ch >= 0x10000) {
2563 *p++ = '\\';
2564 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002565 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2566 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2567 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2568 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2569 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2570 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2571 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2572 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002573 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002574 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002575#else
2576 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002577 else if (ch >= 0xD800 && ch < 0xDC00) {
2578 Py_UNICODE ch2;
2579 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002580
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002581 ch2 = *s++;
2582 size--;
2583 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2584 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2585 *p++ = '\\';
2586 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002587 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2588 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2589 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2590 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2591 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2592 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2593 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2594 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002595 continue;
2596 }
2597 /* Fall through: isolated surrogates are copied as-is */
2598 s--;
2599 size++;
2600 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002601#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002602
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002604 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 *p++ = '\\';
2606 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002607 *p++ = hexdigits[(ch >> 12) & 0x000F];
2608 *p++ = hexdigits[(ch >> 8) & 0x000F];
2609 *p++ = hexdigits[(ch >> 4) & 0x000F];
2610 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002612
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002613 /* Map special whitespace to '\t', \n', '\r' */
2614 else if (ch == '\t') {
2615 *p++ = '\\';
2616 *p++ = 't';
2617 }
2618 else if (ch == '\n') {
2619 *p++ = '\\';
2620 *p++ = 'n';
2621 }
2622 else if (ch == '\r') {
2623 *p++ = '\\';
2624 *p++ = 'r';
2625 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002626
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002627 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002628 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002630 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002631 *p++ = hexdigits[(ch >> 4) & 0x000F];
2632 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002633 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002634
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 /* Copy everything else as-is */
2636 else
2637 *p++ = (char) ch;
2638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639
2640 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002641 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2642 Py_DECREF(repr);
2643 return NULL;
2644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 return repr;
2646}
2647
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2649{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002650 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 if (!PyUnicode_Check(unicode)) {
2652 PyErr_BadArgument();
2653 return NULL;
2654 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002655 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2656 PyUnicode_GET_SIZE(unicode));
2657
2658 if (!s)
2659 return NULL;
2660 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2661 PyBytes_GET_SIZE(s));
2662 Py_DECREF(s);
2663 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664}
2665
2666/* --- Raw Unicode Escape Codec ------------------------------------------- */
2667
2668PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002669 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 const char *errors)
2671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002673 Py_ssize_t startinpos;
2674 Py_ssize_t endinpos;
2675 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002677 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 const char *end;
2679 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002680 PyObject *errorHandler = NULL;
2681 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002682
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 /* Escaped strings will always be longer than the resulting
2684 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002685 length after conversion to the true value. (But decoding error
2686 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 v = _PyUnicode_New(size);
2688 if (v == NULL)
2689 goto onError;
2690 if (size == 0)
2691 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002692 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 end = s + size;
2694 while (s < end) {
2695 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002696 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002698 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699
2700 /* Non-escape characters are interpreted as Unicode ordinals */
2701 if (*s != '\\') {
2702 *p++ = (unsigned char)*s++;
2703 continue;
2704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706
2707 /* \u-escapes are only interpreted iff the number of leading
2708 backslashes if odd */
2709 bs = s;
2710 for (;s < end;) {
2711 if (*s != '\\')
2712 break;
2713 *p++ = (unsigned char)*s++;
2714 }
2715 if (((s - bs) & 1) == 0 ||
2716 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002717 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 continue;
2719 }
2720 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002721 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 s++;
2723
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002724 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002726 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 endinpos = s-starts;
2730 if (unicode_decode_call_errorhandler(
2731 errors, &errorHandler,
2732 "rawunicodeescape", "truncated \\uXXXX",
2733 starts, size, &startinpos, &endinpos, &exc, &s,
2734 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 }
2738 x = (x<<4) & ~0xF;
2739 if (c >= '0' && c <= '9')
2740 x += c - '0';
2741 else if (c >= 'a' && c <= 'f')
2742 x += 10 + c - 'a';
2743 else
2744 x += 10 + c - 'A';
2745 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002746#ifndef Py_UNICODE_WIDE
2747 if (x > 0x10000) {
2748 if (unicode_decode_call_errorhandler(
2749 errors, &errorHandler,
2750 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2751 starts, size, &startinpos, &endinpos, &exc, &s,
2752 (PyObject **)&v, &outpos, &p))
2753 goto onError;
2754 }
2755#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 *p++ = x;
2757 nextByte:
2758 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002760 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002761 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 Py_XDECREF(errorHandler);
2763 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002765
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 onError:
2767 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 Py_XDECREF(errorHandler);
2769 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 return NULL;
2771}
2772
2773PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002774 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775{
2776 PyObject *repr;
2777 char *p;
2778 char *q;
2779
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002780#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002781 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002782#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002783 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002784#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 if (repr == NULL)
2786 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002787 if (size == 0)
2788 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789
Walter Dörwald711005d2007-05-12 12:03:26 +00002790 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 while (size-- > 0) {
2792 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002793#ifdef Py_UNICODE_WIDE
2794 /* Map 32-bit characters to '\Uxxxxxxxx' */
2795 if (ch >= 0x10000) {
2796 *p++ = '\\';
2797 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002798 *p++ = hexdigits[(ch >> 28) & 0xf];
2799 *p++ = hexdigits[(ch >> 24) & 0xf];
2800 *p++ = hexdigits[(ch >> 20) & 0xf];
2801 *p++ = hexdigits[(ch >> 16) & 0xf];
2802 *p++ = hexdigits[(ch >> 12) & 0xf];
2803 *p++ = hexdigits[(ch >> 8) & 0xf];
2804 *p++ = hexdigits[(ch >> 4) & 0xf];
2805 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002806 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002807 else
2808#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 /* Map 16-bit characters to '\uxxxx' */
2810 if (ch >= 256) {
2811 *p++ = '\\';
2812 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002813 *p++ = hexdigits[(ch >> 12) & 0xf];
2814 *p++ = hexdigits[(ch >> 8) & 0xf];
2815 *p++ = hexdigits[(ch >> 4) & 0xf];
2816 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 }
2818 /* Copy everything else as-is */
2819 else
2820 *p++ = (char) ch;
2821 }
2822 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002823 if (PyBytes_Resize(repr, p - q)) {
2824 Py_DECREF(repr);
2825 return NULL;
2826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 return repr;
2828}
2829
2830PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2831{
Walter Dörwald711005d2007-05-12 12:03:26 +00002832 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002834 PyErr_BadArgument();
2835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002837 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2838 PyUnicode_GET_SIZE(unicode));
2839
2840 if (!s)
2841 return NULL;
2842 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2843 PyBytes_GET_SIZE(s));
2844 Py_DECREF(s);
2845 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846}
2847
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002848/* --- Unicode Internal Codec ------------------------------------------- */
2849
2850PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002851 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002852 const char *errors)
2853{
2854 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002855 Py_ssize_t startinpos;
2856 Py_ssize_t endinpos;
2857 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002858 PyUnicodeObject *v;
2859 Py_UNICODE *p;
2860 const char *end;
2861 const char *reason;
2862 PyObject *errorHandler = NULL;
2863 PyObject *exc = NULL;
2864
Neal Norwitzd43069c2006-01-08 01:12:10 +00002865#ifdef Py_UNICODE_WIDE
2866 Py_UNICODE unimax = PyUnicode_GetMax();
2867#endif
2868
Thomas Wouters89f507f2006-12-13 04:49:30 +00002869 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002870 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2871 if (v == NULL)
2872 goto onError;
2873 if (PyUnicode_GetSize((PyObject *)v) == 0)
2874 return (PyObject *)v;
2875 p = PyUnicode_AS_UNICODE(v);
2876 end = s + size;
2877
2878 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002879 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002880 /* We have to sanity check the raw data, otherwise doom looms for
2881 some malformed UCS-4 data. */
2882 if (
2883 #ifdef Py_UNICODE_WIDE
2884 *p > unimax || *p < 0 ||
2885 #endif
2886 end-s < Py_UNICODE_SIZE
2887 )
2888 {
2889 startinpos = s - starts;
2890 if (end-s < Py_UNICODE_SIZE) {
2891 endinpos = end-starts;
2892 reason = "truncated input";
2893 }
2894 else {
2895 endinpos = s - starts + Py_UNICODE_SIZE;
2896 reason = "illegal code point (> 0x10FFFF)";
2897 }
2898 outpos = p - PyUnicode_AS_UNICODE(v);
2899 if (unicode_decode_call_errorhandler(
2900 errors, &errorHandler,
2901 "unicode_internal", reason,
2902 starts, size, &startinpos, &endinpos, &exc, &s,
2903 (PyObject **)&v, &outpos, &p)) {
2904 goto onError;
2905 }
2906 }
2907 else {
2908 p++;
2909 s += Py_UNICODE_SIZE;
2910 }
2911 }
2912
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002913 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002914 goto onError;
2915 Py_XDECREF(errorHandler);
2916 Py_XDECREF(exc);
2917 return (PyObject *)v;
2918
2919 onError:
2920 Py_XDECREF(v);
2921 Py_XDECREF(errorHandler);
2922 Py_XDECREF(exc);
2923 return NULL;
2924}
2925
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926/* --- Latin-1 Codec ------------------------------------------------------ */
2927
2928PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002929 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 const char *errors)
2931{
2932 PyUnicodeObject *v;
2933 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002934
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002936 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002937 Py_UNICODE r = *(unsigned char*)s;
2938 return PyUnicode_FromUnicode(&r, 1);
2939 }
2940
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 v = _PyUnicode_New(size);
2942 if (v == NULL)
2943 goto onError;
2944 if (size == 0)
2945 return (PyObject *)v;
2946 p = PyUnicode_AS_UNICODE(v);
2947 while (size-- > 0)
2948 *p++ = (unsigned char)*s++;
2949 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002950
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 onError:
2952 Py_XDECREF(v);
2953 return NULL;
2954}
2955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956/* create or adjust a UnicodeEncodeError */
2957static void make_encode_exception(PyObject **exceptionObject,
2958 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002959 const Py_UNICODE *unicode, Py_ssize_t size,
2960 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 if (*exceptionObject == NULL) {
2964 *exceptionObject = PyUnicodeEncodeError_Create(
2965 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 }
2967 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002968 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2969 goto onError;
2970 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2971 goto onError;
2972 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2973 goto onError;
2974 return;
2975 onError:
2976 Py_DECREF(*exceptionObject);
2977 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 }
2979}
2980
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002981/* raises a UnicodeEncodeError */
2982static void raise_encode_exception(PyObject **exceptionObject,
2983 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002984 const Py_UNICODE *unicode, Py_ssize_t size,
2985 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 const char *reason)
2987{
2988 make_encode_exception(exceptionObject,
2989 encoding, unicode, size, startpos, endpos, reason);
2990 if (*exceptionObject != NULL)
2991 PyCodec_StrictErrors(*exceptionObject);
2992}
2993
2994/* error handling callback helper:
2995 build arguments, call the callback and check the arguments,
2996 put the result into newpos and return the replacement string, which
2997 has to be freed by the caller */
2998static PyObject *unicode_encode_call_errorhandler(const char *errors,
2999 PyObject **errorHandler,
3000 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003001 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3002 Py_ssize_t startpos, Py_ssize_t endpos,
3003 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003005 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006
3007 PyObject *restuple;
3008 PyObject *resunicode;
3009
3010 if (*errorHandler == NULL) {
3011 *errorHandler = PyCodec_LookupError(errors);
3012 if (*errorHandler == NULL)
3013 return NULL;
3014 }
3015
3016 make_encode_exception(exceptionObject,
3017 encoding, unicode, size, startpos, endpos, reason);
3018 if (*exceptionObject == NULL)
3019 return NULL;
3020
3021 restuple = PyObject_CallFunctionObjArgs(
3022 *errorHandler, *exceptionObject, NULL);
3023 if (restuple == NULL)
3024 return NULL;
3025 if (!PyTuple_Check(restuple)) {
3026 PyErr_Format(PyExc_TypeError, &argparse[4]);
3027 Py_DECREF(restuple);
3028 return NULL;
3029 }
3030 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3031 &resunicode, newpos)) {
3032 Py_DECREF(restuple);
3033 return NULL;
3034 }
3035 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003036 *newpos = size+*newpos;
3037 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003038 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003039 Py_DECREF(restuple);
3040 return NULL;
3041 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 Py_INCREF(resunicode);
3043 Py_DECREF(restuple);
3044 return resunicode;
3045}
3046
3047static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003048 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 const char *errors,
3050 int limit)
3051{
3052 /* output object */
3053 PyObject *res;
3054 /* pointers to the beginning and end+1 of input */
3055 const Py_UNICODE *startp = p;
3056 const Py_UNICODE *endp = p + size;
3057 /* pointer to the beginning of the unencodable characters */
3058 /* const Py_UNICODE *badp = NULL; */
3059 /* pointer into the output */
3060 char *str;
3061 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t respos = 0;
3063 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003064 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3065 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 PyObject *errorHandler = NULL;
3067 PyObject *exc = NULL;
3068 /* the following variable is used for caching string comparisons
3069 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3070 int known_errorHandler = -1;
3071
3072 /* allocate enough for a simple encoding without
3073 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003074 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 if (res == NULL)
3076 goto onError;
3077 if (size == 0)
3078 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003079 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 ressize = size;
3081
3082 while (p<endp) {
3083 Py_UNICODE c = *p;
3084
3085 /* can we encode this? */
3086 if (c<limit) {
3087 /* no overflow check, because we know that the space is enough */
3088 *str++ = (char)c;
3089 ++p;
3090 }
3091 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003092 Py_ssize_t unicodepos = p-startp;
3093 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003095 Py_ssize_t repsize;
3096 Py_ssize_t newpos;
3097 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 Py_UNICODE *uni2;
3099 /* startpos for collecting unencodable chars */
3100 const Py_UNICODE *collstart = p;
3101 const Py_UNICODE *collend = p;
3102 /* find all unecodable characters */
3103 while ((collend < endp) && ((*collend)>=limit))
3104 ++collend;
3105 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3106 if (known_errorHandler==-1) {
3107 if ((errors==NULL) || (!strcmp(errors, "strict")))
3108 known_errorHandler = 1;
3109 else if (!strcmp(errors, "replace"))
3110 known_errorHandler = 2;
3111 else if (!strcmp(errors, "ignore"))
3112 known_errorHandler = 3;
3113 else if (!strcmp(errors, "xmlcharrefreplace"))
3114 known_errorHandler = 4;
3115 else
3116 known_errorHandler = 0;
3117 }
3118 switch (known_errorHandler) {
3119 case 1: /* strict */
3120 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3121 goto onError;
3122 case 2: /* replace */
3123 while (collstart++<collend)
3124 *str++ = '?'; /* fall through */
3125 case 3: /* ignore */
3126 p = collend;
3127 break;
3128 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003129 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 /* determine replacement size (temporarily (mis)uses p) */
3131 for (p = collstart, repsize = 0; p < collend; ++p) {
3132 if (*p<10)
3133 repsize += 2+1+1;
3134 else if (*p<100)
3135 repsize += 2+2+1;
3136 else if (*p<1000)
3137 repsize += 2+3+1;
3138 else if (*p<10000)
3139 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003140#ifndef Py_UNICODE_WIDE
3141 else
3142 repsize += 2+5+1;
3143#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003144 else if (*p<100000)
3145 repsize += 2+5+1;
3146 else if (*p<1000000)
3147 repsize += 2+6+1;
3148 else
3149 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003150#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 }
3152 requiredsize = respos+repsize+(endp-collend);
3153 if (requiredsize > ressize) {
3154 if (requiredsize<2*ressize)
3155 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003156 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003158 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 ressize = requiredsize;
3160 }
3161 /* generate replacement (temporarily (mis)uses p) */
3162 for (p = collstart; p < collend; ++p) {
3163 str += sprintf(str, "&#%d;", (int)*p);
3164 }
3165 p = collend;
3166 break;
3167 default:
3168 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3169 encoding, reason, startp, size, &exc,
3170 collstart-startp, collend-startp, &newpos);
3171 if (repunicode == NULL)
3172 goto onError;
3173 /* need more space? (at least enough for what we
3174 have+the replacement+the rest of the string, so
3175 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003176 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 repsize = PyUnicode_GET_SIZE(repunicode);
3178 requiredsize = respos+repsize+(endp-collend);
3179 if (requiredsize > ressize) {
3180 if (requiredsize<2*ressize)
3181 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003182 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003183 Py_DECREF(repunicode);
3184 goto onError;
3185 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003186 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003187 ressize = requiredsize;
3188 }
3189 /* check if there is anything unencodable in the replacement
3190 and copy it to the output */
3191 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3192 c = *uni2;
3193 if (c >= limit) {
3194 raise_encode_exception(&exc, encoding, startp, size,
3195 unicodepos, unicodepos+1, reason);
3196 Py_DECREF(repunicode);
3197 goto onError;
3198 }
3199 *str = (char)c;
3200 }
3201 p = startp + newpos;
3202 Py_DECREF(repunicode);
3203 }
3204 }
3205 }
3206 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003207 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 if (respos<ressize)
3209 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003210 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 Py_XDECREF(errorHandler);
3212 Py_XDECREF(exc);
3213 return res;
3214
3215 onError:
3216 Py_XDECREF(res);
3217 Py_XDECREF(errorHandler);
3218 Py_XDECREF(exc);
3219 return NULL;
3220}
3221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003223 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 const char *errors)
3225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227}
3228
3229PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3230{
3231 if (!PyUnicode_Check(unicode)) {
3232 PyErr_BadArgument();
3233 return NULL;
3234 }
3235 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3236 PyUnicode_GET_SIZE(unicode),
3237 NULL);
3238}
3239
3240/* --- 7-bit ASCII Codec -------------------------------------------------- */
3241
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003243 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 const char *errors)
3245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 PyUnicodeObject *v;
3248 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003249 Py_ssize_t startinpos;
3250 Py_ssize_t endinpos;
3251 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252 const char *e;
3253 PyObject *errorHandler = NULL;
3254 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003257 if (size == 1 && *(unsigned char*)s < 128) {
3258 Py_UNICODE r = *(unsigned char*)s;
3259 return PyUnicode_FromUnicode(&r, 1);
3260 }
Tim Petersced69f82003-09-16 20:30:58 +00003261
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 v = _PyUnicode_New(size);
3263 if (v == NULL)
3264 goto onError;
3265 if (size == 0)
3266 return (PyObject *)v;
3267 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 e = s + size;
3269 while (s < e) {
3270 register unsigned char c = (unsigned char)*s;
3271 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003273 ++s;
3274 }
3275 else {
3276 startinpos = s-starts;
3277 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003278 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003279 if (unicode_decode_call_errorhandler(
3280 errors, &errorHandler,
3281 "ascii", "ordinal not in range(128)",
3282 starts, size, &startinpos, &endinpos, &exc, &s,
3283 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003287 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003288 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 Py_XDECREF(errorHandler);
3291 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003293
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 onError:
3295 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 Py_XDECREF(errorHandler);
3297 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 return NULL;
3299}
3300
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003302 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 const char *errors)
3304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306}
3307
3308PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3309{
3310 if (!PyUnicode_Check(unicode)) {
3311 PyErr_BadArgument();
3312 return NULL;
3313 }
3314 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3315 PyUnicode_GET_SIZE(unicode),
3316 NULL);
3317}
3318
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003319#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003320
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003321/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003322
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003323#if SIZEOF_INT < SIZEOF_SSIZE_T
3324#define NEED_RETRY
3325#endif
3326
3327/* XXX This code is limited to "true" double-byte encodings, as
3328 a) it assumes an incomplete character consists of a single byte, and
3329 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3330 encodings, see IsDBCSLeadByteEx documentation. */
3331
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone only looks at the byte value, so the preceding
   character (as located by CharPrev) is examined as well.  The byte is
   accepted when there is no preceding character, when the preceding
   character does not start with a lead byte, or when the preceding
   character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3342
3343/*
3344 * Decode MBCS string into unicode object. If 'final' is set, converts
3345 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3346 */
3347static int decode_mbcs(PyUnicodeObject **v,
3348 const char *s, /* MBCS string */
3349 int size, /* sizeof MBCS string */
3350 int final)
3351{
3352 Py_UNICODE *p;
3353 Py_ssize_t n = 0;
3354 int usize = 0;
3355
3356 assert(size >= 0);
3357
3358 /* Skip trailing lead-byte unless 'final' is set */
3359 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3360 --size;
3361
3362 /* First get the size of the result */
3363 if (size > 0) {
3364 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3365 if (usize == 0) {
3366 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3367 return -1;
3368 }
3369 }
3370
3371 if (*v == NULL) {
3372 /* Create unicode object */
3373 *v = _PyUnicode_New(usize);
3374 if (*v == NULL)
3375 return -1;
3376 }
3377 else {
3378 /* Extend unicode object */
3379 n = PyUnicode_GET_SIZE(*v);
3380 if (_PyUnicode_Resize(v, n + usize) < 0)
3381 return -1;
3382 }
3383
3384 /* Do the conversion */
3385 if (size > 0) {
3386 p = PyUnicode_AS_UNICODE(*v) + n;
3387 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3388 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3389 return -1;
3390 }
3391 }
3392
3393 return size;
3394}
3395
3396PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3397 Py_ssize_t size,
3398 const char *errors,
3399 Py_ssize_t *consumed)
3400{
3401 PyUnicodeObject *v = NULL;
3402 int done;
3403
3404 if (consumed)
3405 *consumed = 0;
3406
3407#ifdef NEED_RETRY
3408 retry:
3409 if (size > INT_MAX)
3410 done = decode_mbcs(&v, s, INT_MAX, 0);
3411 else
3412#endif
3413 done = decode_mbcs(&v, s, (int)size, !consumed);
3414
3415 if (done < 0) {
3416 Py_XDECREF(v);
3417 return NULL;
3418 }
3419
3420 if (consumed)
3421 *consumed += done;
3422
3423#ifdef NEED_RETRY
3424 if (size > INT_MAX) {
3425 s += done;
3426 size -= done;
3427 goto retry;
3428 }
3429#endif
3430
3431 return (PyObject *)v;
3432}
3433
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003434PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003435 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003436 const char *errors)
3437{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003438 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3439}
3440
3441/*
3442 * Convert unicode into string object (MBCS).
3443 * Returns 0 if succeed, -1 otherwise.
3444 */
3445static int encode_mbcs(PyObject **repr,
3446 const Py_UNICODE *p, /* unicode */
3447 int size) /* size of unicode */
3448{
3449 int mbcssize = 0;
3450 Py_ssize_t n = 0;
3451
3452 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003453
3454 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003455 if (size > 0) {
3456 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3457 if (mbcssize == 0) {
3458 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3459 return -1;
3460 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003461 }
3462
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003463 if (*repr == NULL) {
3464 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003465 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003466 if (*repr == NULL)
3467 return -1;
3468 }
3469 else {
3470 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003471 n = PyBytes_Size(*repr);
3472 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003473 return -1;
3474 }
3475
3476 /* Do the conversion */
3477 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003478 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003479 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3480 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3481 return -1;
3482 }
3483 }
3484
3485 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003486}
3487
3488PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003489 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003490 const char *errors)
3491{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003492 PyObject *repr = NULL;
3493 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003494
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003495#ifdef NEED_RETRY
3496 retry:
3497 if (size > INT_MAX)
3498 ret = encode_mbcs(&repr, p, INT_MAX);
3499 else
3500#endif
3501 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003502
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003503 if (ret < 0) {
3504 Py_XDECREF(repr);
3505 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003506 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003507
3508#ifdef NEED_RETRY
3509 if (size > INT_MAX) {
3510 p += INT_MAX;
3511 size -= INT_MAX;
3512 goto retry;
3513 }
3514#endif
3515
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003516 return repr;
3517}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003518
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003519PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3520{
3521 if (!PyUnicode_Check(unicode)) {
3522 PyErr_BadArgument();
3523 return NULL;
3524 }
3525 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3526 PyUnicode_GET_SIZE(unicode),
3527 NULL);
3528}
3529
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003530#undef NEED_RETRY
3531
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003532#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003533
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534/* --- Character Mapping Codec -------------------------------------------- */
3535
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003537 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 PyObject *mapping,
3539 const char *errors)
3540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003542 Py_ssize_t startinpos;
3543 Py_ssize_t endinpos;
3544 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 PyUnicodeObject *v;
3547 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003548 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 PyObject *errorHandler = NULL;
3550 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003551 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003552 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003553
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 /* Default to Latin-1 */
3555 if (mapping == NULL)
3556 return PyUnicode_DecodeLatin1(s, size, errors);
3557
3558 v = _PyUnicode_New(size);
3559 if (v == NULL)
3560 goto onError;
3561 if (size == 0)
3562 return (PyObject *)v;
3563 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003565 if (PyUnicode_CheckExact(mapping)) {
3566 mapstring = PyUnicode_AS_UNICODE(mapping);
3567 maplen = PyUnicode_GET_SIZE(mapping);
3568 while (s < e) {
3569 unsigned char ch = *s;
3570 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003572 if (ch < maplen)
3573 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003575 if (x == 0xfffe) {
3576 /* undefined mapping */
3577 outpos = p-PyUnicode_AS_UNICODE(v);
3578 startinpos = s-starts;
3579 endinpos = startinpos+1;
3580 if (unicode_decode_call_errorhandler(
3581 errors, &errorHandler,
3582 "charmap", "character maps to <undefined>",
3583 starts, size, &startinpos, &endinpos, &exc, &s,
3584 (PyObject **)&v, &outpos, &p)) {
3585 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003586 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003587 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003588 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003589 *p++ = x;
3590 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003592 }
3593 else {
3594 while (s < e) {
3595 unsigned char ch = *s;
3596 PyObject *w, *x;
3597
3598 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3599 w = PyInt_FromLong((long)ch);
3600 if (w == NULL)
3601 goto onError;
3602 x = PyObject_GetItem(mapping, w);
3603 Py_DECREF(w);
3604 if (x == NULL) {
3605 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3606 /* No mapping found means: mapping is undefined. */
3607 PyErr_Clear();
3608 x = Py_None;
3609 Py_INCREF(x);
3610 } else
3611 goto onError;
3612 }
3613
3614 /* Apply mapping */
3615 if (PyInt_Check(x)) {
3616 long value = PyInt_AS_LONG(x);
3617 if (value < 0 || value > 65535) {
3618 PyErr_SetString(PyExc_TypeError,
3619 "character mapping must be in range(65536)");
3620 Py_DECREF(x);
3621 goto onError;
3622 }
3623 *p++ = (Py_UNICODE)value;
3624 }
3625 else if (x == Py_None) {
3626 /* undefined mapping */
3627 outpos = p-PyUnicode_AS_UNICODE(v);
3628 startinpos = s-starts;
3629 endinpos = startinpos+1;
3630 if (unicode_decode_call_errorhandler(
3631 errors, &errorHandler,
3632 "charmap", "character maps to <undefined>",
3633 starts, size, &startinpos, &endinpos, &exc, &s,
3634 (PyObject **)&v, &outpos, &p)) {
3635 Py_DECREF(x);
3636 goto onError;
3637 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003638 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003639 continue;
3640 }
3641 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003642 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003643
3644 if (targetsize == 1)
3645 /* 1-1 mapping */
3646 *p++ = *PyUnicode_AS_UNICODE(x);
3647
3648 else if (targetsize > 1) {
3649 /* 1-n mapping */
3650 if (targetsize > extrachars) {
3651 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003652 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3653 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003654 (targetsize << 2);
3655 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003656 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003657 if (_PyUnicode_Resize(&v,
3658 PyUnicode_GET_SIZE(v) + needed) < 0) {
3659 Py_DECREF(x);
3660 goto onError;
3661 }
3662 p = PyUnicode_AS_UNICODE(v) + oldpos;
3663 }
3664 Py_UNICODE_COPY(p,
3665 PyUnicode_AS_UNICODE(x),
3666 targetsize);
3667 p += targetsize;
3668 extrachars -= targetsize;
3669 }
3670 /* 1-0 mapping: skip the character */
3671 }
3672 else {
3673 /* wrong return value */
3674 PyErr_SetString(PyExc_TypeError,
3675 "character mapping must return integer, None or unicode");
3676 Py_DECREF(x);
3677 goto onError;
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003680 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 }
3683 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003684 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 Py_XDECREF(errorHandler);
3687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003689
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 Py_XDECREF(errorHandler);
3692 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 Py_XDECREF(v);
3694 return NULL;
3695}
3696
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003697/* Charmap encoding: the lookup table */
3698
3699struct encoding_map{
3700 PyObject_HEAD
3701 unsigned char level1[32];
3702 int count2, count3;
3703 unsigned char level23[1];
3704};
3705
3706static PyObject*
3707encoding_map_size(PyObject *obj, PyObject* args)
3708{
3709 struct encoding_map *map = (struct encoding_map*)obj;
3710 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3711 128*map->count3);
3712}
3713
3714static PyMethodDef encoding_map_methods[] = {
3715 {"size", encoding_map_size, METH_NOARGS,
3716 PyDoc_STR("Return the size (in bytes) of this object") },
3717 { 0 }
3718};
3719
3720static void
3721encoding_map_dealloc(PyObject* o)
3722{
3723 PyObject_FREE(o);
3724}
3725
3726static PyTypeObject EncodingMapType = {
3727 PyObject_HEAD_INIT(NULL)
3728 0, /*ob_size*/
3729 "EncodingMap", /*tp_name*/
3730 sizeof(struct encoding_map), /*tp_basicsize*/
3731 0, /*tp_itemsize*/
3732 /* methods */
3733 encoding_map_dealloc, /*tp_dealloc*/
3734 0, /*tp_print*/
3735 0, /*tp_getattr*/
3736 0, /*tp_setattr*/
3737 0, /*tp_compare*/
3738 0, /*tp_repr*/
3739 0, /*tp_as_number*/
3740 0, /*tp_as_sequence*/
3741 0, /*tp_as_mapping*/
3742 0, /*tp_hash*/
3743 0, /*tp_call*/
3744 0, /*tp_str*/
3745 0, /*tp_getattro*/
3746 0, /*tp_setattro*/
3747 0, /*tp_as_buffer*/
3748 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3749 0, /*tp_doc*/
3750 0, /*tp_traverse*/
3751 0, /*tp_clear*/
3752 0, /*tp_richcompare*/
3753 0, /*tp_weaklistoffset*/
3754 0, /*tp_iter*/
3755 0, /*tp_iternext*/
3756 encoding_map_methods, /*tp_methods*/
3757 0, /*tp_members*/
3758 0, /*tp_getset*/
3759 0, /*tp_base*/
3760 0, /*tp_dict*/
3761 0, /*tp_descr_get*/
3762 0, /*tp_descr_set*/
3763 0, /*tp_dictoffset*/
3764 0, /*tp_init*/
3765 0, /*tp_alloc*/
3766 0, /*tp_new*/
3767 0, /*tp_free*/
3768 0, /*tp_is_gc*/
3769};
3770
3771PyObject*
3772PyUnicode_BuildEncodingMap(PyObject* string)
3773{
3774 Py_UNICODE *decode;
3775 PyObject *result;
3776 struct encoding_map *mresult;
3777 int i;
3778 int need_dict = 0;
3779 unsigned char level1[32];
3780 unsigned char level2[512];
3781 unsigned char *mlevel1, *mlevel2, *mlevel3;
3782 int count2 = 0, count3 = 0;
3783
3784 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3785 PyErr_BadArgument();
3786 return NULL;
3787 }
3788 decode = PyUnicode_AS_UNICODE(string);
3789 memset(level1, 0xFF, sizeof level1);
3790 memset(level2, 0xFF, sizeof level2);
3791
3792 /* If there isn't a one-to-one mapping of NULL to \0,
3793 or if there are non-BMP characters, we need to use
3794 a mapping dictionary. */
3795 if (decode[0] != 0)
3796 need_dict = 1;
3797 for (i = 1; i < 256; i++) {
3798 int l1, l2;
3799 if (decode[i] == 0
3800 #ifdef Py_UNICODE_WIDE
3801 || decode[i] > 0xFFFF
3802 #endif
3803 ) {
3804 need_dict = 1;
3805 break;
3806 }
3807 if (decode[i] == 0xFFFE)
3808 /* unmapped character */
3809 continue;
3810 l1 = decode[i] >> 11;
3811 l2 = decode[i] >> 7;
3812 if (level1[l1] == 0xFF)
3813 level1[l1] = count2++;
3814 if (level2[l2] == 0xFF)
3815 level2[l2] = count3++;
3816 }
3817
3818 if (count2 >= 0xFF || count3 >= 0xFF)
3819 need_dict = 1;
3820
3821 if (need_dict) {
3822 PyObject *result = PyDict_New();
3823 PyObject *key, *value;
3824 if (!result)
3825 return NULL;
3826 for (i = 0; i < 256; i++) {
3827 key = value = NULL;
3828 key = PyInt_FromLong(decode[i]);
3829 value = PyInt_FromLong(i);
3830 if (!key || !value)
3831 goto failed1;
3832 if (PyDict_SetItem(result, key, value) == -1)
3833 goto failed1;
3834 Py_DECREF(key);
3835 Py_DECREF(value);
3836 }
3837 return result;
3838 failed1:
3839 Py_XDECREF(key);
3840 Py_XDECREF(value);
3841 Py_DECREF(result);
3842 return NULL;
3843 }
3844
3845 /* Create a three-level trie */
3846 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3847 16*count2 + 128*count3 - 1);
3848 if (!result)
3849 return PyErr_NoMemory();
3850 PyObject_Init(result, &EncodingMapType);
3851 mresult = (struct encoding_map*)result;
3852 mresult->count2 = count2;
3853 mresult->count3 = count3;
3854 mlevel1 = mresult->level1;
3855 mlevel2 = mresult->level23;
3856 mlevel3 = mresult->level23 + 16*count2;
3857 memcpy(mlevel1, level1, 32);
3858 memset(mlevel2, 0xFF, 16*count2);
3859 memset(mlevel3, 0, 128*count3);
3860 count3 = 0;
3861 for (i = 1; i < 256; i++) {
3862 int o1, o2, o3, i2, i3;
3863 if (decode[i] == 0xFFFE)
3864 /* unmapped character */
3865 continue;
3866 o1 = decode[i]>>11;
3867 o2 = (decode[i]>>7) & 0xF;
3868 i2 = 16*mlevel1[o1] + o2;
3869 if (mlevel2[i2] == 0xFF)
3870 mlevel2[i2] = count3++;
3871 o3 = decode[i] & 0x7F;
3872 i3 = 128*mlevel2[i2] + o3;
3873 mlevel3[i3] = i;
3874 }
3875 return result;
3876}
3877
3878static int
3879encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3880{
3881 struct encoding_map *map = (struct encoding_map*)mapping;
3882 int l1 = c>>11;
3883 int l2 = (c>>7) & 0xF;
3884 int l3 = c & 0x7F;
3885 int i;
3886
3887#ifdef Py_UNICODE_WIDE
3888 if (c > 0xFFFF) {
3889 return -1;
3890 }
3891#endif
3892 if (c == 0)
3893 return 0;
3894 /* level 1*/
3895 i = map->level1[l1];
3896 if (i == 0xFF) {
3897 return -1;
3898 }
3899 /* level 2*/
3900 i = map->level23[16*i+l2];
3901 if (i == 0xFF) {
3902 return -1;
3903 }
3904 /* level 3 */
3905 i = map->level23[16*map->count2 + 128*i + l3];
3906 if (i == 0) {
3907 return -1;
3908 }
3909 return i;
3910}
3911
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912/* Lookup the character ch in the mapping. If the character
3913 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003914 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917 PyObject *w = PyInt_FromLong((long)c);
3918 PyObject *x;
3919
3920 if (w == NULL)
3921 return NULL;
3922 x = PyObject_GetItem(mapping, w);
3923 Py_DECREF(w);
3924 if (x == NULL) {
3925 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3926 /* No mapping found means: mapping is undefined. */
3927 PyErr_Clear();
3928 x = Py_None;
3929 Py_INCREF(x);
3930 return x;
3931 } else
3932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003934 else if (x == Py_None)
3935 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003936 else if (PyInt_Check(x)) {
3937 long value = PyInt_AS_LONG(x);
3938 if (value < 0 || value > 255) {
3939 PyErr_SetString(PyExc_TypeError,
3940 "character mapping must be in range(256)");
3941 Py_DECREF(x);
3942 return NULL;
3943 }
3944 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 else if (PyString_Check(x))
3947 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003950 PyErr_Format(PyExc_TypeError,
3951 "character mapping must return integer, None or str8, not %.400s",
3952 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 Py_DECREF(x);
3954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 }
3956}
3957
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003958static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003959charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003960{
Walter Dörwald827b0552007-05-12 13:23:53 +00003961 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003962 /* exponentially overallocate to minimize reallocations */
3963 if (requiredsize < 2*outsize)
3964 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003965 if (PyBytes_Resize(outobj, requiredsize)) {
3966 Py_DECREF(outobj);
3967 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003968 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003969 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003970}
3971
/* Outcome of trying to encode a single character through a charmap:
     enc_SUCCESS   - the character was encoded and written to the output
     enc_FAILED    - the mapping has no entry for the character
     enc_EXCEPTION - an error occurred while looking up or writing */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003976 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 space is available. Return a new reference to the object that
3978 was put in the output buffer, or Py_None, if the mapping was undefined
3979 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003980 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003982charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003983 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003985 PyObject *rep;
3986 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003987 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003989 if (mapping->ob_type == &EncodingMapType) {
3990 int res = encoding_map_lookup(c, mapping);
3991 Py_ssize_t requiredsize = *outpos+1;
3992 if (res == -1)
3993 return enc_FAILED;
3994 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003995 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003996 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003997 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003998 outstart[(*outpos)++] = (char)res;
3999 return enc_SUCCESS;
4000 }
4001
4002 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004004 return enc_EXCEPTION;
4005 else if (rep==Py_None) {
4006 Py_DECREF(rep);
4007 return enc_FAILED;
4008 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004010 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004011 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004012 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004014 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004016 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4018 }
4019 else {
4020 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004021 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4022 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004023 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004024 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004026 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004028 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 memcpy(outstart + *outpos, repchars, repsize);
4030 *outpos += repsize;
4031 }
4032 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004033 Py_DECREF(rep);
4034 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004035}
4036
4037/* handle an error in PyUnicode_EncodeCharmap
4038 Return 0 on success, -1 on error */
4039static
4040int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004041 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004043 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004044 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045{
4046 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004047 Py_ssize_t repsize;
4048 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 Py_UNICODE *uni2;
4050 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004051 Py_ssize_t collstartpos = *inpos;
4052 Py_ssize_t collendpos = *inpos+1;
4053 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 char *encoding = "charmap";
4055 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004056 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 /* find all unencodable characters */
4059 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004060 PyObject *rep;
4061 if (mapping->ob_type == &EncodingMapType) {
4062 int res = encoding_map_lookup(p[collendpos], mapping);
4063 if (res != -1)
4064 break;
4065 ++collendpos;
4066 continue;
4067 }
4068
4069 rep = charmapencode_lookup(p[collendpos], mapping);
4070 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004072 else if (rep!=Py_None) {
4073 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 break;
4075 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004076 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 ++collendpos;
4078 }
4079 /* cache callback name lookup
4080 * (if not done yet, i.e. it's the first error) */
4081 if (*known_errorHandler==-1) {
4082 if ((errors==NULL) || (!strcmp(errors, "strict")))
4083 *known_errorHandler = 1;
4084 else if (!strcmp(errors, "replace"))
4085 *known_errorHandler = 2;
4086 else if (!strcmp(errors, "ignore"))
4087 *known_errorHandler = 3;
4088 else if (!strcmp(errors, "xmlcharrefreplace"))
4089 *known_errorHandler = 4;
4090 else
4091 *known_errorHandler = 0;
4092 }
4093 switch (*known_errorHandler) {
4094 case 1: /* strict */
4095 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4096 return -1;
4097 case 2: /* replace */
4098 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4099 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004100 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 return -1;
4102 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004103 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4105 return -1;
4106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 }
4108 /* fall through */
4109 case 3: /* ignore */
4110 *inpos = collendpos;
4111 break;
4112 case 4: /* xmlcharrefreplace */
4113 /* generate replacement (temporarily (mis)uses p) */
4114 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4115 char buffer[2+29+1+1];
4116 char *cp;
4117 sprintf(buffer, "&#%d;", (int)p[collpos]);
4118 for (cp = buffer; *cp; ++cp) {
4119 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004120 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004122 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4124 return -1;
4125 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 }
4127 }
4128 *inpos = collendpos;
4129 break;
4130 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004131 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 encoding, reason, p, size, exceptionObject,
4133 collstartpos, collendpos, &newpos);
4134 if (repunicode == NULL)
4135 return -1;
4136 /* generate replacement */
4137 repsize = PyUnicode_GET_SIZE(repunicode);
4138 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4139 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004140 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 return -1;
4142 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004143 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4146 return -1;
4147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 }
4149 *inpos = newpos;
4150 Py_DECREF(repunicode);
4151 }
4152 return 0;
4153}
4154
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004156 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 PyObject *mapping,
4158 const char *errors)
4159{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 /* output object */
4161 PyObject *res = NULL;
4162 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004163 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004165 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 PyObject *errorHandler = NULL;
4167 PyObject *exc = NULL;
4168 /* the following variable is used for caching string comparisons
4169 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4170 * 3=ignore, 4=xmlcharrefreplace */
4171 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172
4173 /* Default to Latin-1 */
4174 if (mapping == NULL)
4175 return PyUnicode_EncodeLatin1(p, size, errors);
4176
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 /* allocate enough for a simple encoding without
4178 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004179 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 if (res == NULL)
4181 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004182 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 while (inpos<size) {
4186 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004187 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004188 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004190 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 if (charmap_encoding_error(p, size, &inpos, mapping,
4192 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004193 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004194 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004195 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 else
4199 /* done with this character => adjust input position */
4200 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004204 if (respos<PyBytes_GET_SIZE(res)) {
4205 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 goto onError;
4207 }
4208 Py_XDECREF(exc);
4209 Py_XDECREF(errorHandler);
4210 return res;
4211
4212 onError:
4213 Py_XDECREF(res);
4214 Py_XDECREF(exc);
4215 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 return NULL;
4217}
4218
4219PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4220 PyObject *mapping)
4221{
4222 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4223 PyErr_BadArgument();
4224 return NULL;
4225 }
4226 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4227 PyUnicode_GET_SIZE(unicode),
4228 mapping,
4229 NULL);
4230}
4231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232/* create or adjust a UnicodeTranslateError */
4233static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004234 const Py_UNICODE *unicode, Py_ssize_t size,
4235 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 if (*exceptionObject == NULL) {
4239 *exceptionObject = PyUnicodeTranslateError_Create(
4240 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241 }
4242 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4244 goto onError;
4245 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4246 goto onError;
4247 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4248 goto onError;
4249 return;
4250 onError:
4251 Py_DECREF(*exceptionObject);
4252 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 }
4254}
4255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256/* raises a UnicodeTranslateError */
4257static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004258 const Py_UNICODE *unicode, Py_ssize_t size,
4259 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 const char *reason)
4261{
4262 make_translate_exception(exceptionObject,
4263 unicode, size, startpos, endpos, reason);
4264 if (*exceptionObject != NULL)
4265 PyCodec_StrictErrors(*exceptionObject);
4266}
4267
4268/* error handling callback helper:
4269 build arguments, call the callback and check the arguments,
4270 put the result into newpos and return the replacement string, which
4271 has to be freed by the caller */
4272static PyObject *unicode_translate_call_errorhandler(const char *errors,
4273 PyObject **errorHandler,
4274 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004275 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4276 Py_ssize_t startpos, Py_ssize_t endpos,
4277 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004279 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004281 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 PyObject *restuple;
4283 PyObject *resunicode;
4284
4285 if (*errorHandler == NULL) {
4286 *errorHandler = PyCodec_LookupError(errors);
4287 if (*errorHandler == NULL)
4288 return NULL;
4289 }
4290
4291 make_translate_exception(exceptionObject,
4292 unicode, size, startpos, endpos, reason);
4293 if (*exceptionObject == NULL)
4294 return NULL;
4295
4296 restuple = PyObject_CallFunctionObjArgs(
4297 *errorHandler, *exceptionObject, NULL);
4298 if (restuple == NULL)
4299 return NULL;
4300 if (!PyTuple_Check(restuple)) {
4301 PyErr_Format(PyExc_TypeError, &argparse[4]);
4302 Py_DECREF(restuple);
4303 return NULL;
4304 }
4305 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004306 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004307 Py_DECREF(restuple);
4308 return NULL;
4309 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004310 if (i_newpos<0)
4311 *newpos = size+i_newpos;
4312 else
4313 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004314 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004315 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004316 Py_DECREF(restuple);
4317 return NULL;
4318 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_INCREF(resunicode);
4320 Py_DECREF(restuple);
4321 return resunicode;
4322}
4323
4324/* Lookup the character ch in the mapping and put the result in result,
4325 which must be decrefed by the caller.
4326 Return 0 on success, -1 on error */
4327static
4328int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4329{
4330 PyObject *w = PyInt_FromLong((long)c);
4331 PyObject *x;
4332
4333 if (w == NULL)
4334 return -1;
4335 x = PyObject_GetItem(mapping, w);
4336 Py_DECREF(w);
4337 if (x == NULL) {
4338 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4339 /* No mapping found means: use 1:1 mapping. */
4340 PyErr_Clear();
4341 *result = NULL;
4342 return 0;
4343 } else
4344 return -1;
4345 }
4346 else if (x == Py_None) {
4347 *result = x;
4348 return 0;
4349 }
4350 else if (PyInt_Check(x)) {
4351 long value = PyInt_AS_LONG(x);
4352 long max = PyUnicode_GetMax();
4353 if (value < 0 || value > max) {
4354 PyErr_Format(PyExc_TypeError,
4355 "character mapping must be in range(0x%lx)", max+1);
4356 Py_DECREF(x);
4357 return -1;
4358 }
4359 *result = x;
4360 return 0;
4361 }
4362 else if (PyUnicode_Check(x)) {
4363 *result = x;
4364 return 0;
4365 }
4366 else {
4367 /* wrong return value */
4368 PyErr_SetString(PyExc_TypeError,
4369 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004370 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 return -1;
4372 }
4373}
4374/* ensure that *outobj is at least requiredsize characters long,
4375if not reallocate and adjust various state variables.
4376Return 0 on success, -1 on error */
4377static
Walter Dörwald4894c302003-10-24 14:25:28 +00004378int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004382 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004384 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004386 if (requiredsize < 2 * oldsize)
4387 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004388 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 return -1;
4390 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 }
4392 return 0;
4393}
4394/* lookup the character, put the result in the output string and adjust
4395 various state variables. Return a new reference to the object that
4396 was put in the output buffer in *result, or Py_None, if the mapping was
4397 undefined (in which case no character was written).
4398 The called must decref result.
4399 Return 0 on success, -1 on error. */
4400static
Walter Dörwald4894c302003-10-24 14:25:28 +00004401int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004402 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004403 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404{
Walter Dörwald4894c302003-10-24 14:25:28 +00004405 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 return -1;
4407 if (*res==NULL) {
4408 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004409 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 }
4411 else if (*res==Py_None)
4412 ;
4413 else if (PyInt_Check(*res)) {
4414 /* no overflow check, because we know that the space is enough */
4415 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4416 }
4417 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004418 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 if (repsize==1) {
4420 /* no overflow check, because we know that the space is enough */
4421 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4422 }
4423 else if (repsize!=0) {
4424 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004425 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004426 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004427 repsize - 1;
4428 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 return -1;
4430 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4431 *outp += repsize;
4432 }
4433 }
4434 else
4435 return -1;
4436 return 0;
4437}
4438
4439PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 PyObject *mapping,
4442 const char *errors)
4443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 /* output object */
4445 PyObject *res = NULL;
4446 /* pointers to the beginning and end+1 of input */
4447 const Py_UNICODE *startp = p;
4448 const Py_UNICODE *endp = p + size;
4449 /* pointer into the output */
4450 Py_UNICODE *str;
4451 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004452 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 char *reason = "character maps to <undefined>";
4454 PyObject *errorHandler = NULL;
4455 PyObject *exc = NULL;
4456 /* the following variable is used for caching string comparisons
4457 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4458 * 3=ignore, 4=xmlcharrefreplace */
4459 int known_errorHandler = -1;
4460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 if (mapping == NULL) {
4462 PyErr_BadArgument();
4463 return NULL;
4464 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465
4466 /* allocate enough for a simple 1:1 translation without
4467 replacements, if we need more, we'll resize */
4468 res = PyUnicode_FromUnicode(NULL, size);
4469 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004470 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 return res;
4473 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 while (p<endp) {
4476 /* try to encode it */
4477 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004478 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 goto onError;
4481 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004482 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483 if (x!=Py_None) /* it worked => adjust input pointer */
4484 ++p;
4485 else { /* untranslatable character */
4486 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004487 Py_ssize_t repsize;
4488 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 Py_UNICODE *uni2;
4490 /* startpos for collecting untranslatable chars */
4491 const Py_UNICODE *collstart = p;
4492 const Py_UNICODE *collend = p+1;
4493 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 /* find all untranslatable characters */
4496 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004497 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 goto onError;
4499 Py_XDECREF(x);
4500 if (x!=Py_None)
4501 break;
4502 ++collend;
4503 }
4504 /* cache callback name lookup
4505 * (if not done yet, i.e. it's the first error) */
4506 if (known_errorHandler==-1) {
4507 if ((errors==NULL) || (!strcmp(errors, "strict")))
4508 known_errorHandler = 1;
4509 else if (!strcmp(errors, "replace"))
4510 known_errorHandler = 2;
4511 else if (!strcmp(errors, "ignore"))
4512 known_errorHandler = 3;
4513 else if (!strcmp(errors, "xmlcharrefreplace"))
4514 known_errorHandler = 4;
4515 else
4516 known_errorHandler = 0;
4517 }
4518 switch (known_errorHandler) {
4519 case 1: /* strict */
4520 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4521 goto onError;
4522 case 2: /* replace */
4523 /* No need to check for space, this is a 1:1 replacement */
4524 for (coll = collstart; coll<collend; ++coll)
4525 *str++ = '?';
4526 /* fall through */
4527 case 3: /* ignore */
4528 p = collend;
4529 break;
4530 case 4: /* xmlcharrefreplace */
4531 /* generate replacement (temporarily (mis)uses p) */
4532 for (p = collstart; p < collend; ++p) {
4533 char buffer[2+29+1+1];
4534 char *cp;
4535 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004536 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4538 goto onError;
4539 for (cp = buffer; *cp; ++cp)
4540 *str++ = *cp;
4541 }
4542 p = collend;
4543 break;
4544 default:
4545 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4546 reason, startp, size, &exc,
4547 collstart-startp, collend-startp, &newpos);
4548 if (repunicode == NULL)
4549 goto onError;
4550 /* generate replacement */
4551 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004552 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4554 Py_DECREF(repunicode);
4555 goto onError;
4556 }
4557 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4558 *str++ = *uni2;
4559 p = startp + newpos;
4560 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 }
4562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 /* Resize if we allocated to much */
4565 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004566 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004567 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004568 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 }
4570 Py_XDECREF(exc);
4571 Py_XDECREF(errorHandler);
4572 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 onError:
4575 Py_XDECREF(res);
4576 Py_XDECREF(exc);
4577 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 return NULL;
4579}
4580
4581PyObject *PyUnicode_Translate(PyObject *str,
4582 PyObject *mapping,
4583 const char *errors)
4584{
4585 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004586
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587 str = PyUnicode_FromObject(str);
4588 if (str == NULL)
4589 goto onError;
4590 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4591 PyUnicode_GET_SIZE(str),
4592 mapping,
4593 errors);
4594 Py_DECREF(str);
4595 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004596
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 onError:
4598 Py_XDECREF(str);
4599 return NULL;
4600}
Tim Petersced69f82003-09-16 20:30:58 +00004601
Guido van Rossum9e896b32000-04-05 20:11:21 +00004602/* --- Decimal Encoder ---------------------------------------------------- */
4603
4604int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004605 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004606 char *output,
4607 const char *errors)
4608{
4609 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 PyObject *errorHandler = NULL;
4611 PyObject *exc = NULL;
4612 const char *encoding = "decimal";
4613 const char *reason = "invalid decimal Unicode string";
4614 /* the following variable is used for caching string comparisons
4615 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4616 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004617
4618 if (output == NULL) {
4619 PyErr_BadArgument();
4620 return -1;
4621 }
4622
4623 p = s;
4624 end = s + length;
4625 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004627 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004629 Py_ssize_t repsize;
4630 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 Py_UNICODE *uni2;
4632 Py_UNICODE *collstart;
4633 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004634
Guido van Rossum9e896b32000-04-05 20:11:21 +00004635 if (Py_UNICODE_ISSPACE(ch)) {
4636 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004638 continue;
4639 }
4640 decimal = Py_UNICODE_TODECIMAL(ch);
4641 if (decimal >= 0) {
4642 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004644 continue;
4645 }
Guido van Rossumba477042000-04-06 18:18:10 +00004646 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004647 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004649 continue;
4650 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 /* All other characters are considered unencodable */
4652 collstart = p;
4653 collend = p+1;
4654 while (collend < end) {
4655 if ((0 < *collend && *collend < 256) ||
4656 !Py_UNICODE_ISSPACE(*collend) ||
4657 Py_UNICODE_TODECIMAL(*collend))
4658 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004659 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 /* cache callback name lookup
4661 * (if not done yet, i.e. it's the first error) */
4662 if (known_errorHandler==-1) {
4663 if ((errors==NULL) || (!strcmp(errors, "strict")))
4664 known_errorHandler = 1;
4665 else if (!strcmp(errors, "replace"))
4666 known_errorHandler = 2;
4667 else if (!strcmp(errors, "ignore"))
4668 known_errorHandler = 3;
4669 else if (!strcmp(errors, "xmlcharrefreplace"))
4670 known_errorHandler = 4;
4671 else
4672 known_errorHandler = 0;
4673 }
4674 switch (known_errorHandler) {
4675 case 1: /* strict */
4676 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4677 goto onError;
4678 case 2: /* replace */
4679 for (p = collstart; p < collend; ++p)
4680 *output++ = '?';
4681 /* fall through */
4682 case 3: /* ignore */
4683 p = collend;
4684 break;
4685 case 4: /* xmlcharrefreplace */
4686 /* generate replacement (temporarily (mis)uses p) */
4687 for (p = collstart; p < collend; ++p)
4688 output += sprintf(output, "&#%d;", (int)*p);
4689 p = collend;
4690 break;
4691 default:
4692 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4693 encoding, reason, s, length, &exc,
4694 collstart-s, collend-s, &newpos);
4695 if (repunicode == NULL)
4696 goto onError;
4697 /* generate replacement */
4698 repsize = PyUnicode_GET_SIZE(repunicode);
4699 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4700 Py_UNICODE ch = *uni2;
4701 if (Py_UNICODE_ISSPACE(ch))
4702 *output++ = ' ';
4703 else {
4704 decimal = Py_UNICODE_TODECIMAL(ch);
4705 if (decimal >= 0)
4706 *output++ = '0' + decimal;
4707 else if (0 < ch && ch < 256)
4708 *output++ = (char)ch;
4709 else {
4710 Py_DECREF(repunicode);
4711 raise_encode_exception(&exc, encoding,
4712 s, length, collstart-s, collend-s, reason);
4713 goto onError;
4714 }
4715 }
4716 }
4717 p = s + newpos;
4718 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004719 }
4720 }
4721 /* 0-terminate the output string */
4722 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 Py_XDECREF(exc);
4724 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004725 return 0;
4726
4727 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 Py_XDECREF(exc);
4729 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004730 return -1;
4731}
4732
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733/* --- Helpers ------------------------------------------------------------ */
4734
Thomas Wouters477c8d52006-05-27 19:21:47 +00004735#define STRINGLIB_CHAR Py_UNICODE
4736
4737#define STRINGLIB_LEN PyUnicode_GET_SIZE
4738#define STRINGLIB_NEW PyUnicode_FromUnicode
4739#define STRINGLIB_STR PyUnicode_AS_UNICODE
4740
4741Py_LOCAL_INLINE(int)
4742STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004744 if (str[0] != other[0])
4745 return 1;
4746 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747}
4748
Thomas Wouters477c8d52006-05-27 19:21:47 +00004749#define STRINGLIB_EMPTY unicode_empty
4750
4751#include "stringlib/fastsearch.h"
4752
4753#include "stringlib/count.h"
4754#include "stringlib/find.h"
4755#include "stringlib/partition.h"
4756
/* Helper macro to fix up slice bounds held in the enclosing function's
   local variables `start` and `end`, relative to (obj)->length:
   a negative start is offset by the length and then clipped to 0;
   end is clipped to the length, and a negative end is offset by the
   length and then clipped to 0.  start is not clipped from above.
   Expands to bare statements, so use it only as a complete statement. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4769
Martin v. Löwis18e16552006-02-15 17:27:45 +00004770Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004771 PyObject *substr,
4772 Py_ssize_t start,
4773 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004775 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004776 PyUnicodeObject* str_obj;
4777 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004778
Thomas Wouters477c8d52006-05-27 19:21:47 +00004779 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4780 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004782 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4783 if (!sub_obj) {
4784 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 return -1;
4786 }
Tim Petersced69f82003-09-16 20:30:58 +00004787
Thomas Wouters477c8d52006-05-27 19:21:47 +00004788 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004789
Thomas Wouters477c8d52006-05-27 19:21:47 +00004790 result = stringlib_count(
4791 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4792 );
4793
4794 Py_DECREF(sub_obj);
4795 Py_DECREF(str_obj);
4796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 return result;
4798}
4799
Martin v. Löwis18e16552006-02-15 17:27:45 +00004800Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004801 PyObject *sub,
4802 Py_ssize_t start,
4803 Py_ssize_t end,
4804 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004809 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004810 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004811 sub = PyUnicode_FromObject(sub);
4812 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004813 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004814 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 }
Tim Petersced69f82003-09-16 20:30:58 +00004816
Thomas Wouters477c8d52006-05-27 19:21:47 +00004817 if (direction > 0)
4818 result = stringlib_find_slice(
4819 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4820 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4821 start, end
4822 );
4823 else
4824 result = stringlib_rfind_slice(
4825 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4826 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4827 start, end
4828 );
4829
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004831 Py_DECREF(sub);
4832
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 return result;
4834}
4835
Tim Petersced69f82003-09-16 20:30:58 +00004836static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837int tailmatch(PyUnicodeObject *self,
4838 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004839 Py_ssize_t start,
4840 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 int direction)
4842{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 if (substring->length == 0)
4844 return 1;
4845
Thomas Wouters477c8d52006-05-27 19:21:47 +00004846 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847
4848 end -= substring->length;
4849 if (end < start)
4850 return 0;
4851
4852 if (direction > 0) {
4853 if (Py_UNICODE_MATCH(self, end, substring))
4854 return 1;
4855 } else {
4856 if (Py_UNICODE_MATCH(self, start, substring))
4857 return 1;
4858 }
4859
4860 return 0;
4861}
4862
Martin v. Löwis18e16552006-02-15 17:27:45 +00004863Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004865 Py_ssize_t start,
4866 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 int direction)
4868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004869 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004870
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 str = PyUnicode_FromObject(str);
4872 if (str == NULL)
4873 return -1;
4874 substr = PyUnicode_FromObject(substr);
4875 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004876 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 return -1;
4878 }
Tim Petersced69f82003-09-16 20:30:58 +00004879
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 result = tailmatch((PyUnicodeObject *)str,
4881 (PyUnicodeObject *)substr,
4882 start, end, direction);
4883 Py_DECREF(str);
4884 Py_DECREF(substr);
4885 return result;
4886}
4887
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888/* Apply fixfct filter to the Unicode object self and return a
4889 reference to the modified object */
4890
Tim Petersced69f82003-09-16 20:30:58 +00004891static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892PyObject *fixup(PyUnicodeObject *self,
4893 int (*fixfct)(PyUnicodeObject *s))
4894{
4895
4896 PyUnicodeObject *u;
4897
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004898 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 if (u == NULL)
4900 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004901
4902 Py_UNICODE_COPY(u->str, self->str, self->length);
4903
Tim Peters7a29bd52001-09-12 03:03:31 +00004904 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 /* fixfct should return TRUE if it modified the buffer. If
4906 FALSE, return a reference to the original buffer instead
4907 (to save space, not time) */
4908 Py_INCREF(self);
4909 Py_DECREF(u);
4910 return (PyObject*) self;
4911 }
4912 return (PyObject*) u;
4913}
4914
Tim Petersced69f82003-09-16 20:30:58 +00004915static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916int fixupper(PyUnicodeObject *self)
4917{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004918 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 Py_UNICODE *s = self->str;
4920 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004921
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 while (len-- > 0) {
4923 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004924
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 ch = Py_UNICODE_TOUPPER(*s);
4926 if (ch != *s) {
4927 status = 1;
4928 *s = ch;
4929 }
4930 s++;
4931 }
4932
4933 return status;
4934}
4935
Tim Petersced69f82003-09-16 20:30:58 +00004936static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937int fixlower(PyUnicodeObject *self)
4938{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004939 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 Py_UNICODE *s = self->str;
4941 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004942
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 while (len-- > 0) {
4944 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004945
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 ch = Py_UNICODE_TOLOWER(*s);
4947 if (ch != *s) {
4948 status = 1;
4949 *s = ch;
4950 }
4951 s++;
4952 }
4953
4954 return status;
4955}
4956
Tim Petersced69f82003-09-16 20:30:58 +00004957static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958int fixswapcase(PyUnicodeObject *self)
4959{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004960 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 Py_UNICODE *s = self->str;
4962 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004963
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 while (len-- > 0) {
4965 if (Py_UNICODE_ISUPPER(*s)) {
4966 *s = Py_UNICODE_TOLOWER(*s);
4967 status = 1;
4968 } else if (Py_UNICODE_ISLOWER(*s)) {
4969 *s = Py_UNICODE_TOUPPER(*s);
4970 status = 1;
4971 }
4972 s++;
4973 }
4974
4975 return status;
4976}
4977
Tim Petersced69f82003-09-16 20:30:58 +00004978static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979int fixcapitalize(PyUnicodeObject *self)
4980{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004981 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004982 Py_UNICODE *s = self->str;
4983 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004984
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004985 if (len == 0)
4986 return 0;
4987 if (Py_UNICODE_ISLOWER(*s)) {
4988 *s = Py_UNICODE_TOUPPER(*s);
4989 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004991 s++;
4992 while (--len > 0) {
4993 if (Py_UNICODE_ISUPPER(*s)) {
4994 *s = Py_UNICODE_TOLOWER(*s);
4995 status = 1;
4996 }
4997 s++;
4998 }
4999 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000}
5001
5002static
5003int fixtitle(PyUnicodeObject *self)
5004{
5005 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5006 register Py_UNICODE *e;
5007 int previous_is_cased;
5008
5009 /* Shortcut for single character strings */
5010 if (PyUnicode_GET_SIZE(self) == 1) {
5011 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5012 if (*p != ch) {
5013 *p = ch;
5014 return 1;
5015 }
5016 else
5017 return 0;
5018 }
Tim Petersced69f82003-09-16 20:30:58 +00005019
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 e = p + PyUnicode_GET_SIZE(self);
5021 previous_is_cased = 0;
5022 for (; p < e; p++) {
5023 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005024
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 if (previous_is_cased)
5026 *p = Py_UNICODE_TOLOWER(ch);
5027 else
5028 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005029
5030 if (Py_UNICODE_ISLOWER(ch) ||
5031 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 Py_UNICODE_ISTITLE(ch))
5033 previous_is_cased = 1;
5034 else
5035 previous_is_cased = 0;
5036 }
5037 return 1;
5038}
5039
Tim Peters8ce9f162004-08-27 01:49:32 +00005040PyObject *
5041PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042{
Tim Peters8ce9f162004-08-27 01:49:32 +00005043 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005044 const Py_UNICODE blank = ' ';
5045 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005046 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005047 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005048 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5049 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005050 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5051 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005052 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005053 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005054 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055
Tim Peters05eba1f2004-08-27 21:32:02 +00005056 fseq = PySequence_Fast(seq, "");
5057 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005058 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005059 }
5060
Tim Peters91879ab2004-08-27 22:35:44 +00005061 /* Grrrr. A codec may be invoked to convert str objects to
5062 * Unicode, and so it's possible to call back into Python code
5063 * during PyUnicode_FromObject(), and so it's possible for a sick
5064 * codec to change the size of fseq (if seq is a list). Therefore
5065 * we have to keep refetching the size -- can't assume seqlen
5066 * is invariant.
5067 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005068 seqlen = PySequence_Fast_GET_SIZE(fseq);
5069 /* If empty sequence, return u"". */
5070 if (seqlen == 0) {
5071 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5072 goto Done;
5073 }
5074 /* If singleton sequence with an exact Unicode, return that. */
5075 if (seqlen == 1) {
5076 item = PySequence_Fast_GET_ITEM(fseq, 0);
5077 if (PyUnicode_CheckExact(item)) {
5078 Py_INCREF(item);
5079 res = (PyUnicodeObject *)item;
5080 goto Done;
5081 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005082 }
5083
Tim Peters05eba1f2004-08-27 21:32:02 +00005084 /* At least two items to join, or one that isn't exact Unicode. */
5085 if (seqlen > 1) {
5086 /* Set up sep and seplen -- they're needed. */
5087 if (separator == NULL) {
5088 sep = &blank;
5089 seplen = 1;
5090 }
5091 else {
5092 internal_separator = PyUnicode_FromObject(separator);
5093 if (internal_separator == NULL)
5094 goto onError;
5095 sep = PyUnicode_AS_UNICODE(internal_separator);
5096 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005097 /* In case PyUnicode_FromObject() mutated seq. */
5098 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005099 }
5100 }
5101
5102 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005103 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005104 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005105 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005106 res_p = PyUnicode_AS_UNICODE(res);
5107 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005108
Tim Peters05eba1f2004-08-27 21:32:02 +00005109 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005110 Py_ssize_t itemlen;
5111 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005112
5113 item = PySequence_Fast_GET_ITEM(fseq, i);
5114 /* Convert item to Unicode. */
5115 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5116 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005117 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005118 " %.80s found",
5119 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005120 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005121 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005122 item = PyUnicode_FromObject(item);
5123 if (item == NULL)
5124 goto onError;
5125 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005126
Tim Peters91879ab2004-08-27 22:35:44 +00005127 /* In case PyUnicode_FromObject() mutated seq. */
5128 seqlen = PySequence_Fast_GET_SIZE(fseq);
5129
Tim Peters8ce9f162004-08-27 01:49:32 +00005130 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005132 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005133 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005134 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005135 if (i < seqlen - 1) {
5136 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005137 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005138 goto Overflow;
5139 }
5140 if (new_res_used > res_alloc) {
5141 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005142 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005143 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005144 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005145 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005146 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005147 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005148 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005150 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005151 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005153
5154 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005155 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005156 res_p += itemlen;
5157 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005158 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005159 res_p += seplen;
5160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005162 res_used = new_res_used;
5163 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005164
Tim Peters05eba1f2004-08-27 21:32:02 +00005165 /* Shrink res to match the used area; this probably can't fail,
5166 * but it's cheap to check.
5167 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005168 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005169 goto onError;
5170
5171 Done:
5172 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005173 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 return (PyObject *)res;
5175
Tim Peters8ce9f162004-08-27 01:49:32 +00005176 Overflow:
5177 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005178 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005179 Py_DECREF(item);
5180 /* fall through */
5181
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005183 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005184 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005185 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 return NULL;
5187}
5188
Tim Petersced69f82003-09-16 20:30:58 +00005189static
5190PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005191 Py_ssize_t left,
5192 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 Py_UNICODE fill)
5194{
5195 PyUnicodeObject *u;
5196
5197 if (left < 0)
5198 left = 0;
5199 if (right < 0)
5200 right = 0;
5201
Tim Peters7a29bd52001-09-12 03:03:31 +00005202 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 Py_INCREF(self);
5204 return self;
5205 }
5206
5207 u = _PyUnicode_New(left + self->length + right);
5208 if (u) {
5209 if (left)
5210 Py_UNICODE_FILL(u->str, fill, left);
5211 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5212 if (right)
5213 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5214 }
5215
5216 return u;
5217}
5218
/* Append the slice data[left:right] to `list` as a new unicode object.
   Relies on the enclosing function for a `PyObject *str` scratch
   variable, the `list` variable and an `onError` label, to which it
   jumps if the slice cannot be created or appended.  The temporary
   reference in str is released on both paths.  Expands to bare
   statements; use it as a complete statement followed by ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
5229
5230static
5231PyObject *split_whitespace(PyUnicodeObject *self,
5232 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005233 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 register Py_ssize_t i;
5236 register Py_ssize_t j;
5237 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 PyObject *str;
5239
5240 for (i = j = 0; i < len; ) {
5241 /* find a token */
5242 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5243 i++;
5244 j = i;
5245 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5246 i++;
5247 if (j < i) {
5248 if (maxcount-- <= 0)
5249 break;
5250 SPLIT_APPEND(self->str, j, i);
5251 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5252 i++;
5253 j = i;
5254 }
5255 }
5256 if (j < len) {
5257 SPLIT_APPEND(self->str, j, len);
5258 }
5259 return list;
5260
5261 onError:
5262 Py_DECREF(list);
5263 return NULL;
5264}
5265
5266PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005267 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005269 register Py_ssize_t i;
5270 register Py_ssize_t j;
5271 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 PyObject *list;
5273 PyObject *str;
5274 Py_UNICODE *data;
5275
5276 string = PyUnicode_FromObject(string);
5277 if (string == NULL)
5278 return NULL;
5279 data = PyUnicode_AS_UNICODE(string);
5280 len = PyUnicode_GET_SIZE(string);
5281
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 list = PyList_New(0);
5283 if (!list)
5284 goto onError;
5285
5286 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005287 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005290 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
5293 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005294 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 if (i < len) {
5296 if (data[i] == '\r' && i + 1 < len &&
5297 data[i+1] == '\n')
5298 i += 2;
5299 else
5300 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005301 if (keepends)
5302 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 }
Guido van Rossum86662912000-04-11 15:38:46 +00005304 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 j = i;
5306 }
5307 if (j < len) {
5308 SPLIT_APPEND(data, j, len);
5309 }
5310
5311 Py_DECREF(string);
5312 return list;
5313
5314 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005315 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 Py_DECREF(string);
5317 return NULL;
5318}
5319
Tim Petersced69f82003-09-16 20:30:58 +00005320static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321PyObject *split_char(PyUnicodeObject *self,
5322 PyObject *list,
5323 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005324 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326 register Py_ssize_t i;
5327 register Py_ssize_t j;
5328 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 PyObject *str;
5330
5331 for (i = j = 0; i < len; ) {
5332 if (self->str[i] == ch) {
5333 if (maxcount-- <= 0)
5334 break;
5335 SPLIT_APPEND(self->str, j, i);
5336 i = j = i + 1;
5337 } else
5338 i++;
5339 }
5340 if (j <= len) {
5341 SPLIT_APPEND(self->str, j, len);
5342 }
5343 return list;
5344
5345 onError:
5346 Py_DECREF(list);
5347 return NULL;
5348}
5349
Tim Petersced69f82003-09-16 20:30:58 +00005350static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351PyObject *split_substring(PyUnicodeObject *self,
5352 PyObject *list,
5353 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005356 register Py_ssize_t i;
5357 register Py_ssize_t j;
5358 Py_ssize_t len = self->length;
5359 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 PyObject *str;
5361
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005362 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 if (Py_UNICODE_MATCH(self, i, substring)) {
5364 if (maxcount-- <= 0)
5365 break;
5366 SPLIT_APPEND(self->str, j, i);
5367 i = j = i + sublen;
5368 } else
5369 i++;
5370 }
5371 if (j <= len) {
5372 SPLIT_APPEND(self->str, j, len);
5373 }
5374 return list;
5375
5376 onError:
5377 Py_DECREF(list);
5378 return NULL;
5379}
5380
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005381static
5382PyObject *rsplit_whitespace(PyUnicodeObject *self,
5383 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005384 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005385{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386 register Py_ssize_t i;
5387 register Py_ssize_t j;
5388 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005389 PyObject *str;
5390
5391 for (i = j = len - 1; i >= 0; ) {
5392 /* find a token */
5393 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5394 i--;
5395 j = i;
5396 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5397 i--;
5398 if (j > i) {
5399 if (maxcount-- <= 0)
5400 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005401 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005402 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5403 i--;
5404 j = i;
5405 }
5406 }
5407 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005408 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005409 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 if (PyList_Reverse(list) < 0)
5411 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005412 return list;
5413
5414 onError:
5415 Py_DECREF(list);
5416 return NULL;
5417}
5418
5419static
5420PyObject *rsplit_char(PyUnicodeObject *self,
5421 PyObject *list,
5422 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005423 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005425 register Py_ssize_t i;
5426 register Py_ssize_t j;
5427 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005428 PyObject *str;
5429
5430 for (i = j = len - 1; i >= 0; ) {
5431 if (self->str[i] == ch) {
5432 if (maxcount-- <= 0)
5433 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005434 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005435 j = i = i - 1;
5436 } else
5437 i--;
5438 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005439 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005440 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005441 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005442 if (PyList_Reverse(list) < 0)
5443 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005444 return list;
5445
5446 onError:
5447 Py_DECREF(list);
5448 return NULL;
5449}
5450
5451static
5452PyObject *rsplit_substring(PyUnicodeObject *self,
5453 PyObject *list,
5454 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005455 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005456{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005457 register Py_ssize_t i;
5458 register Py_ssize_t j;
5459 Py_ssize_t len = self->length;
5460 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005461 PyObject *str;
5462
5463 for (i = len - sublen, j = len; i >= 0; ) {
5464 if (Py_UNICODE_MATCH(self, i, substring)) {
5465 if (maxcount-- <= 0)
5466 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005467 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005468 j = i;
5469 i -= sublen;
5470 } else
5471 i--;
5472 }
5473 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005474 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005475 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005476 if (PyList_Reverse(list) < 0)
5477 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005478 return list;
5479
5480 onError:
5481 Py_DECREF(list);
5482 return NULL;
5483}
5484
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485#undef SPLIT_APPEND
5486
5487static
5488PyObject *split(PyUnicodeObject *self,
5489 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005490 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491{
5492 PyObject *list;
5493
5494 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005495 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496
5497 list = PyList_New(0);
5498 if (!list)
5499 return NULL;
5500
5501 if (substring == NULL)
5502 return split_whitespace(self,list,maxcount);
5503
5504 else if (substring->length == 1)
5505 return split_char(self,list,substring->str[0],maxcount);
5506
5507 else if (substring->length == 0) {
5508 Py_DECREF(list);
5509 PyErr_SetString(PyExc_ValueError, "empty separator");
5510 return NULL;
5511 }
5512 else
5513 return split_substring(self,list,substring,maxcount);
5514}
5515
Tim Petersced69f82003-09-16 20:30:58 +00005516static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005517PyObject *rsplit(PyUnicodeObject *self,
5518 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005519 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005520{
5521 PyObject *list;
5522
5523 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005524 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005525
5526 list = PyList_New(0);
5527 if (!list)
5528 return NULL;
5529
5530 if (substring == NULL)
5531 return rsplit_whitespace(self,list,maxcount);
5532
5533 else if (substring->length == 1)
5534 return rsplit_char(self,list,substring->str[0],maxcount);
5535
5536 else if (substring->length == 0) {
5537 Py_DECREF(list);
5538 PyErr_SetString(PyExc_ValueError, "empty separator");
5539 return NULL;
5540 }
5541 else
5542 return rsplit_substring(self,list,substring,maxcount);
5543}
5544
5545static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546PyObject *replace(PyUnicodeObject *self,
5547 PyUnicodeObject *str1,
5548 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005549 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
5551 PyUnicodeObject *u;
5552
5553 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005554 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Thomas Wouters477c8d52006-05-27 19:21:47 +00005556 if (str1->length == str2->length) {
5557 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005558 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005559 if (str1->length == 1) {
5560 /* replace characters */
5561 Py_UNICODE u1, u2;
5562 if (!findchar(self->str, self->length, str1->str[0]))
5563 goto nothing;
5564 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5565 if (!u)
5566 return NULL;
5567 Py_UNICODE_COPY(u->str, self->str, self->length);
5568 u1 = str1->str[0];
5569 u2 = str2->str[0];
5570 for (i = 0; i < u->length; i++)
5571 if (u->str[i] == u1) {
5572 if (--maxcount < 0)
5573 break;
5574 u->str[i] = u2;
5575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005577 i = fastsearch(
5578 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005580 if (i < 0)
5581 goto nothing;
5582 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5583 if (!u)
5584 return NULL;
5585 Py_UNICODE_COPY(u->str, self->str, self->length);
5586 while (i <= self->length - str1->length)
5587 if (Py_UNICODE_MATCH(self, i, str1)) {
5588 if (--maxcount < 0)
5589 break;
5590 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5591 i += str1->length;
5592 } else
5593 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005596
5597 Py_ssize_t n, i, j, e;
5598 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 Py_UNICODE *p;
5600
5601 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005602 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 if (n > maxcount)
5604 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005605 if (n == 0)
5606 goto nothing;
5607 /* new_size = self->length + n * (str2->length - str1->length)); */
5608 delta = (str2->length - str1->length);
5609 if (delta == 0) {
5610 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005612 product = n * (str2->length - str1->length);
5613 if ((product / (str2->length - str1->length)) != n) {
5614 PyErr_SetString(PyExc_OverflowError,
5615 "replace string is too long");
5616 return NULL;
5617 }
5618 new_size = self->length + product;
5619 if (new_size < 0) {
5620 PyErr_SetString(PyExc_OverflowError,
5621 "replace string is too long");
5622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 }
5624 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005625 u = _PyUnicode_New(new_size);
5626 if (!u)
5627 return NULL;
5628 i = 0;
5629 p = u->str;
5630 e = self->length - str1->length;
5631 if (str1->length > 0) {
5632 while (n-- > 0) {
5633 /* look for next match */
5634 j = i;
5635 while (j <= e) {
5636 if (Py_UNICODE_MATCH(self, j, str1))
5637 break;
5638 j++;
5639 }
5640 if (j > i) {
5641 if (j > e)
5642 break;
5643 /* copy unchanged part [i:j] */
5644 Py_UNICODE_COPY(p, self->str+i, j-i);
5645 p += j - i;
5646 }
5647 /* copy substitution string */
5648 if (str2->length > 0) {
5649 Py_UNICODE_COPY(p, str2->str, str2->length);
5650 p += str2->length;
5651 }
5652 i = j + str1->length;
5653 }
5654 if (i < self->length)
5655 /* copy tail [i:] */
5656 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5657 } else {
5658 /* interleave */
5659 while (n > 0) {
5660 Py_UNICODE_COPY(p, str2->str, str2->length);
5661 p += str2->length;
5662 if (--n <= 0)
5663 break;
5664 *p++ = self->str[i++];
5665 }
5666 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005670
5671nothing:
5672 /* nothing to replace; return original string (when possible) */
5673 if (PyUnicode_CheckExact(self)) {
5674 Py_INCREF(self);
5675 return (PyObject *) self;
5676 }
5677 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678}
5679
5680/* --- Unicode Object Methods --------------------------------------------- */
5681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005682PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683"S.title() -> unicode\n\
5684\n\
5685Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005686characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
5688static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005689unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 return fixup(self, fixtitle);
5692}
5693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005694PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695"S.capitalize() -> unicode\n\
5696\n\
5697Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005698have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
5700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005701unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 return fixup(self, fixcapitalize);
5704}
5705
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self on whitespace, capitalize each word in
   place inside the list, then join the words into a new string. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *previous = PyList_GET_ITEM(words, k);
        PyObject *capitalized = fixup((PyUnicodeObject *)previous,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the list slot over to the capitalized word */
        Py_DECREF(previous);
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5743
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005744/* Argument converter. Coerces to a single unicode character */
5745
5746static int
5747convert_uc(PyObject *obj, void *addr)
5748{
5749 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5750 PyObject *uniobj;
5751 Py_UNICODE *unistr;
5752
5753 uniobj = PyUnicode_FromObject(obj);
5754 if (uniobj == NULL) {
5755 PyErr_SetString(PyExc_TypeError,
5756 "The fill character cannot be converted to Unicode");
5757 return 0;
5758 }
5759 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5760 PyErr_SetString(PyExc_TypeError,
5761 "The fill character must be exactly one character long");
5762 Py_DECREF(uniobj);
5763 return 0;
5764 }
5765 unistr = PyUnicode_AS_UNICODE(uniobj);
5766 *fillcharloc = unistr[0];
5767 Py_DECREF(uniobj);
5768 return 1;
5769}
5770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005771PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005772"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005774Return S centered in a Unicode string of length width. Padding is\n\
5775done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776
5777static PyObject *
5778unicode_center(PyUnicodeObject *self, PyObject *args)
5779{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005780 Py_ssize_t marg, left;
5781 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005782 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
Thomas Woutersde017742006-02-16 19:34:37 +00005784 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 return NULL;
5786
Tim Peters7a29bd52001-09-12 03:03:31 +00005787 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 Py_INCREF(self);
5789 return (PyObject*) self;
5790 }
5791
5792 marg = width - self->length;
5793 left = marg / 2 + (marg & width & 1);
5794
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005795 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796}
5797
Marc-André Lemburge5034372000-08-08 08:04:29 +00005798#if 0
5799
5800/* This code should go into some future Unicode collation support
5801 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005802 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005803
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005804/* speedy UTF-16 code point order comparison */
5805/* gleaned from: */
5806/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5807
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005808static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005809{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005810 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005811 0, 0, 0, 0, 0, 0, 0, 0,
5812 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005813 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005814};
5815
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816static int
5817unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5818{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005819 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005820
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 Py_UNICODE *s1 = str1->str;
5822 Py_UNICODE *s2 = str2->str;
5823
5824 len1 = str1->length;
5825 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005826
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005828 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005829
5830 c1 = *s1++;
5831 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005832
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005833 if (c1 > (1<<11) * 26)
5834 c1 += utf16Fixup[c1>>11];
5835 if (c2 > (1<<11) * 26)
5836 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005837 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005838
5839 if (c1 != c2)
5840 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005841
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005842 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 }
5844
5845 return (len1 < len2) ? -1 : (len1 != len2);
5846}
5847
Marc-André Lemburge5034372000-08-08 08:04:29 +00005848#else
5849
5850static int
5851unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005853 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005854
5855 Py_UNICODE *s1 = str1->str;
5856 Py_UNICODE *s2 = str2->str;
5857
5858 len1 = str1->length;
5859 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005860
Marc-André Lemburge5034372000-08-08 08:04:29 +00005861 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005862 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005863
Fredrik Lundh45714e92001-06-26 16:39:36 +00005864 c1 = *s1++;
5865 c2 = *s2++;
5866
5867 if (c1 != c2)
5868 return (c1 < c2) ? -1 : 1;
5869
Marc-André Lemburge5034372000-08-08 08:04:29 +00005870 len1--; len2--;
5871 }
5872
5873 return (len1 < len2) ? -1 : (len1 != len2);
5874}
5875
5876#endif
5877
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878int PyUnicode_Compare(PyObject *left,
5879 PyObject *right)
5880{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005881 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5882 return unicode_compare((PyUnicodeObject *)left,
5883 (PyUnicodeObject *)right);
5884 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5885 (PyUnicode_Check(left) && PyString_Check(right))) {
5886 if (PyUnicode_Check(left))
5887 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5888 if (PyUnicode_Check(right))
5889 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5890 assert(PyString_Check(left));
5891 assert(PyString_Check(right));
5892 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005894 PyErr_Format(PyExc_TypeError,
5895 "Can't compare %.100s and %.100s",
5896 left->ob_type->tp_name,
5897 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 return -1;
5899}
5900
Martin v. Löwis5b222132007-06-10 09:51:05 +00005901int
5902PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5903{
5904 int i;
5905 Py_UNICODE *id;
5906 assert(PyUnicode_Check(uni));
5907 id = PyUnicode_AS_UNICODE(uni);
5908 /* Compare Unicode string and source character set string */
5909 for (i = 0; id[i] && str[i]; i++)
5910 if (id[i] != str[i])
5911 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5912 if (id[i])
5913 return 1; /* uni is longer */
5914 if (str[i])
5915 return -1; /* str is longer */
5916 return 0;
5917}
5918
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005919PyObject *PyUnicode_RichCompare(PyObject *left,
5920 PyObject *right,
5921 int op)
5922{
5923 int result;
5924
5925 result = PyUnicode_Compare(left, right);
5926 if (result == -1 && PyErr_Occurred())
5927 goto onError;
5928
5929 /* Convert the return value to a Boolean */
5930 switch (op) {
5931 case Py_EQ:
5932 result = (result == 0);
5933 break;
5934 case Py_NE:
5935 result = (result != 0);
5936 break;
5937 case Py_LE:
5938 result = (result <= 0);
5939 break;
5940 case Py_GE:
5941 result = (result >= 0);
5942 break;
5943 case Py_LT:
5944 result = (result == -1);
5945 break;
5946 case Py_GT:
5947 result = (result == 1);
5948 break;
5949 }
5950 return PyBool_FromLong(result);
5951
5952 onError:
5953
5954 /* Standard case
5955
5956 Type errors mean that PyUnicode_FromObject() could not convert
5957 one of the arguments (usually the right hand side) to Unicode,
5958 ie. we can't handle the comparison request. However, it is
5959 possible that the other object knows a comparison method, which
5960 is why we return Py_NotImplemented to give the other object a
5961 chance.
5962
5963 */
5964 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5965 PyErr_Clear();
5966 Py_INCREF(Py_NotImplemented);
5967 return Py_NotImplemented;
5968 }
5969 if (op != Py_EQ && op != Py_NE)
5970 return NULL;
5971
5972 /* Equality comparison.
5973
5974 This is a special case: we silence any PyExc_UnicodeDecodeError
5975 and instead turn it into a PyErr_UnicodeWarning.
5976
5977 */
5978 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5979 return NULL;
5980 PyErr_Clear();
5981 if (PyErr_Warn(PyExc_UnicodeWarning,
5982 (op == Py_EQ) ?
5983 "Unicode equal comparison "
5984 "failed to convert both arguments to Unicode - "
5985 "interpreting them as being unequal" :
5986 "Unicode unequal comparison "
5987 "failed to convert both arguments to Unicode - "
5988 "interpreting them as being unequal"
5989 ) < 0)
5990 return NULL;
5991 result = (op == Py_NE);
5992 return PyBool_FromLong(result);
5993}
5994
Guido van Rossum403d68b2000-03-13 15:55:09 +00005995int PyUnicode_Contains(PyObject *container,
5996 PyObject *element)
5997{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005998 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005999 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006000
6001 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006002 sub = PyUnicode_FromObject(element);
6003 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006004 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006005 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006006 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006007 }
6008
Thomas Wouters477c8d52006-05-27 19:21:47 +00006009 str = PyUnicode_FromObject(container);
6010 if (!str) {
6011 Py_DECREF(sub);
6012 return -1;
6013 }
6014
6015 result = stringlib_contains_obj(str, sub);
6016
6017 Py_DECREF(str);
6018 Py_DECREF(sub);
6019
Guido van Rossum403d68b2000-03-13 15:55:09 +00006020 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006021}
6022
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023/* Concat to string or Unicode object giving a new Unicode object. */
6024
6025PyObject *PyUnicode_Concat(PyObject *left,
6026 PyObject *right)
6027{
6028 PyUnicodeObject *u = NULL, *v = NULL, *w;
6029
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006030 if (PyBytes_Check(left) || PyBytes_Check(right))
6031 return PyBytes_Concat(left, right);
6032
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 /* Coerce the two arguments */
6034 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6035 if (u == NULL)
6036 goto onError;
6037 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6038 if (v == NULL)
6039 goto onError;
6040
6041 /* Shortcuts */
6042 if (v == unicode_empty) {
6043 Py_DECREF(v);
6044 return (PyObject *)u;
6045 }
6046 if (u == unicode_empty) {
6047 Py_DECREF(u);
6048 return (PyObject *)v;
6049 }
6050
6051 /* Concat the two Unicode strings */
6052 w = _PyUnicode_New(u->length + v->length);
6053 if (w == NULL)
6054 goto onError;
6055 Py_UNICODE_COPY(w->str, u->str, u->length);
6056 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6057
6058 Py_DECREF(u);
6059 Py_DECREF(v);
6060 return (PyObject *)w;
6061
6062onError:
6063 Py_XDECREF(u);
6064 Py_XDECREF(v);
6065 return NULL;
6066}
6067
Walter Dörwald1ab83302007-05-18 17:15:44 +00006068void
6069PyUnicode_Append(PyObject **pleft, PyObject *right)
6070{
6071 PyObject *new;
6072 if (*pleft == NULL)
6073 return;
6074 if (right == NULL || !PyUnicode_Check(*pleft)) {
6075 Py_DECREF(*pleft);
6076 *pleft = NULL;
6077 return;
6078 }
6079 new = PyUnicode_Concat(*pleft, right);
6080 Py_DECREF(*pleft);
6081 *pleft = new;
6082}
6083
6084void
6085PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6086{
6087 PyUnicode_Append(pleft, right);
6088 Py_XDECREF(right);
6089}
6090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006091PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092"S.count(sub[, start[, end]]) -> int\n\
6093\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006094Return the number of non-overlapping occurrences of substring sub in\n\
6095Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006096interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097
6098static PyObject *
6099unicode_count(PyUnicodeObject *self, PyObject *args)
6100{
6101 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006102 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006103 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 PyObject *result;
6105
Guido van Rossumb8872e62000-05-09 14:14:27 +00006106 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6107 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 return NULL;
6109
6110 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006111 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 if (substring == NULL)
6113 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006114
Thomas Wouters477c8d52006-05-27 19:21:47 +00006115 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
Thomas Wouters477c8d52006-05-27 19:21:47 +00006117 result = PyInt_FromSsize_t(
6118 stringlib_count(self->str + start, end - start,
6119 substring->str, substring->length)
6120 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
6122 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006123
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 return result;
6125}
6126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006127PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006128"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006130Encodes S using the codec registered for encoding. encoding defaults\n\
6131to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006132handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6134'xmlcharrefreplace' as well as any other name registered with\n\
6135codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136
6137static PyObject *
6138unicode_encode(PyUnicodeObject *self, PyObject *args)
6139{
6140 char *encoding = NULL;
6141 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006142 PyObject *v;
6143
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6145 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006146 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006147 if (v == NULL)
6148 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006149 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006150 if (PyString_Check(v)) {
6151 /* Old codec, turn it into bytes */
6152 PyObject *b = PyBytes_FromObject(v);
6153 Py_DECREF(v);
6154 return b;
6155 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006156 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006157 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006158 "(type=%.400s)",
6159 v->ob_type->tp_name);
6160 Py_DECREF(v);
6161 return NULL;
6162 }
6163 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006164
6165 onError:
6166 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006167}
6168
6169PyDoc_STRVAR(decode__doc__,
6170"S.decode([encoding[,errors]]) -> string or unicode\n\
6171\n\
6172Decodes S using the codec registered for encoding. encoding defaults\n\
6173to the default encoding. errors may be given to set a different error\n\
6174handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6175a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6176as well as any other name registerd with codecs.register_error that is\n\
6177able to handle UnicodeDecodeErrors.");
6178
6179static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006180unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006181{
6182 char *encoding = NULL;
6183 char *errors = NULL;
6184 PyObject *v;
6185
6186 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6187 return NULL;
6188 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006189 if (v == NULL)
6190 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006191 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6192 PyErr_Format(PyExc_TypeError,
6193 "decoder did not return a string/unicode object "
6194 "(type=%.400s)",
6195 v->ob_type->tp_name);
6196 Py_DECREF(v);
6197 return NULL;
6198 }
6199 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006200
6201 onError:
6202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203}
6204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006205PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206"S.expandtabs([tabsize]) -> unicode\n\
6207\n\
6208Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006209If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210
6211static PyObject*
6212unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6213{
6214 Py_UNICODE *e;
6215 Py_UNICODE *p;
6216 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006217 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 PyUnicodeObject *u;
6219 int tabsize = 8;
6220
6221 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6222 return NULL;
6223
Thomas Wouters7e474022000-07-16 12:04:32 +00006224 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 i = j = 0;
6226 e = self->str + self->length;
6227 for (p = self->str; p < e; p++)
6228 if (*p == '\t') {
6229 if (tabsize > 0)
6230 j += tabsize - (j % tabsize);
6231 }
6232 else {
6233 j++;
6234 if (*p == '\n' || *p == '\r') {
6235 i += j;
6236 j = 0;
6237 }
6238 }
6239
6240 /* Second pass: create output string and fill it */
6241 u = _PyUnicode_New(i + j);
6242 if (!u)
6243 return NULL;
6244
6245 j = 0;
6246 q = u->str;
6247
6248 for (p = self->str; p < e; p++)
6249 if (*p == '\t') {
6250 if (tabsize > 0) {
6251 i = tabsize - (j % tabsize);
6252 j += i;
6253 while (i--)
6254 *q++ = ' ';
6255 }
6256 }
6257 else {
6258 j++;
6259 *q++ = *p;
6260 if (*p == '\n' || *p == '\r')
6261 j = 0;
6262 }
6263
6264 return (PyObject*) u;
6265}
6266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006267PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268"S.find(sub [,start [,end]]) -> int\n\
6269\n\
6270Return the lowest index in S where substring sub is found,\n\
6271such that sub is contained within s[start,end]. Optional\n\
6272arguments start and end are interpreted as in slice notation.\n\
6273\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006274Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275
6276static PyObject *
6277unicode_find(PyUnicodeObject *self, PyObject *args)
6278{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006279 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006280 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006281 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006282 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283
Guido van Rossumb8872e62000-05-09 14:14:27 +00006284 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6285 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006287 substring = PyUnicode_FromObject(substring);
6288 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 return NULL;
6290
Thomas Wouters477c8d52006-05-27 19:21:47 +00006291 result = stringlib_find_slice(
6292 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6293 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6294 start, end
6295 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296
6297 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006298
6299 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300}
6301
6302static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006303unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304{
6305 if (index < 0 || index >= self->length) {
6306 PyErr_SetString(PyExc_IndexError, "string index out of range");
6307 return NULL;
6308 }
6309
6310 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6311}
6312
6313static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006314unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006316 /* Since Unicode objects compare equal to their UTF-8 string
6317 counterparts, we hash the UTF-8 string. */
6318 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6319 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320}
6321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006322PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323"S.index(sub [,start [,end]]) -> int\n\
6324\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006325Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
6327static PyObject *
6328unicode_index(PyUnicodeObject *self, PyObject *args)
6329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006330 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006331 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006332 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006333 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
Guido van Rossumb8872e62000-05-09 14:14:27 +00006335 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6336 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006338 substring = PyUnicode_FromObject(substring);
6339 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 return NULL;
6341
Thomas Wouters477c8d52006-05-27 19:21:47 +00006342 result = stringlib_find_slice(
6343 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6344 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6345 start, end
6346 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347
6348 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 if (result < 0) {
6351 PyErr_SetString(PyExc_ValueError, "substring not found");
6352 return NULL;
6353 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006354
Martin v. Löwis18e16552006-02-15 17:27:45 +00006355 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356}
6357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006358PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006359"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006361Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006362at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006365unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366{
6367 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6368 register const Py_UNICODE *e;
6369 int cased;
6370
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 /* Shortcut for single character strings */
6372 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006373 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006375 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006376 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006377 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006378
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 e = p + PyUnicode_GET_SIZE(self);
6380 cased = 0;
6381 for (; p < e; p++) {
6382 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006383
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006385 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 else if (!cased && Py_UNICODE_ISLOWER(ch))
6387 cased = 1;
6388 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006389 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390}
6391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006392PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006393"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006395Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006396at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
6398static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006399unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400{
6401 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6402 register const Py_UNICODE *e;
6403 int cased;
6404
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 /* Shortcut for single character strings */
6406 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006407 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006409 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006410 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006411 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006412
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 e = p + PyUnicode_GET_SIZE(self);
6414 cased = 0;
6415 for (; p < e; p++) {
6416 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006419 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 else if (!cased && Py_UNICODE_ISUPPER(ch))
6421 cased = 1;
6422 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006423 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424}
6425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006426PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006427"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006429Return True if S is a titlecased string and there is at least one\n\
6430character in S, i.e. upper- and titlecase characters may only\n\
6431follow uncased characters and lowercase characters only cased ones.\n\
6432Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
6434static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006435unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436{
6437 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6438 register const Py_UNICODE *e;
6439 int cased, previous_is_cased;
6440
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 /* Shortcut for single character strings */
6442 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006443 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6444 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006446 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006447 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006448 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006449
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 e = p + PyUnicode_GET_SIZE(self);
6451 cased = 0;
6452 previous_is_cased = 0;
6453 for (; p < e; p++) {
6454 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006455
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6457 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006458 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 previous_is_cased = 1;
6460 cased = 1;
6461 }
6462 else if (Py_UNICODE_ISLOWER(ch)) {
6463 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006464 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 previous_is_cased = 1;
6466 cased = 1;
6467 }
6468 else
6469 previous_is_cased = 0;
6470 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006471 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472}
6473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006474PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006475"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006477Return True if all characters in S are whitespace\n\
6478and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479
6480static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006481unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482{
6483 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6484 register const Py_UNICODE *e;
6485
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 /* Shortcut for single character strings */
6487 if (PyUnicode_GET_SIZE(self) == 1 &&
6488 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006489 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006491 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006492 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006493 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006494
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 e = p + PyUnicode_GET_SIZE(self);
6496 for (; p < e; p++) {
6497 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006498 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006500 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501}
6502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006503PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006504"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006505\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006506Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006507and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006508
6509static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006510unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006511{
6512 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6513 register const Py_UNICODE *e;
6514
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006515 /* Shortcut for single character strings */
6516 if (PyUnicode_GET_SIZE(self) == 1 &&
6517 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006518 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006519
6520 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006521 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006522 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006523
6524 e = p + PyUnicode_GET_SIZE(self);
6525 for (; p < e; p++) {
6526 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006527 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006528 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006529 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006530}
6531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006532PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006533"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006534\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006535Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006536and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006537
6538static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006539unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006540{
6541 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6542 register const Py_UNICODE *e;
6543
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006544 /* Shortcut for single character strings */
6545 if (PyUnicode_GET_SIZE(self) == 1 &&
6546 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006547 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006548
6549 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006550 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006551 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006552
6553 e = p + PyUnicode_GET_SIZE(self);
6554 for (; p < e; p++) {
6555 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006556 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006557 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006558 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006559}
6560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006561PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006562"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006564Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006565False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566
6567static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006568unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569{
6570 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6571 register const Py_UNICODE *e;
6572
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 /* Shortcut for single character strings */
6574 if (PyUnicode_GET_SIZE(self) == 1 &&
6575 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006576 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006578 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006579 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006580 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006581
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 e = p + PyUnicode_GET_SIZE(self);
6583 for (; p < e; p++) {
6584 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006585 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006587 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588}
6589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006590PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006591"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006593Return True if all characters in S are digits\n\
6594and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
6596static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006597unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598{
6599 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6600 register const Py_UNICODE *e;
6601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 /* Shortcut for single character strings */
6603 if (PyUnicode_GET_SIZE(self) == 1 &&
6604 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006605 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006607 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006608 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006609 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006610
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 e = p + PyUnicode_GET_SIZE(self);
6612 for (; p < e; p++) {
6613 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006614 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006616 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617}
6618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006619PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006620"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006622Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006623False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624
6625static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006626unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
6628 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6629 register const Py_UNICODE *e;
6630
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 /* Shortcut for single character strings */
6632 if (PyUnicode_GET_SIZE(self) == 1 &&
6633 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006634 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006636 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006637 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006638 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006639
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 e = p + PyUnicode_GET_SIZE(self);
6641 for (; p < e; p++) {
6642 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006643 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006645 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646}
6647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006648PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649"S.join(sequence) -> unicode\n\
6650\n\
6651Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006652sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
6654static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006655unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006657 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658}
6659
Martin v. Löwis18e16552006-02-15 17:27:45 +00006660static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661unicode_length(PyUnicodeObject *self)
6662{
6663 return self->length;
6664}
6665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006667"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668\n\
6669Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006670done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject *
6673unicode_ljust(PyUnicodeObject *self, PyObject *args)
6674{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006675 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006676 Py_UNICODE fillchar = ' ';
6677
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006678 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 return NULL;
6680
Tim Peters7a29bd52001-09-12 03:03:31 +00006681 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 Py_INCREF(self);
6683 return (PyObject*) self;
6684 }
6685
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006686 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687}
6688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006689PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690"S.lower() -> unicode\n\
6691\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006692Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
6694static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006695unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 return fixup(self, fixlower);
6698}
6699
/* Selector values for the strip family of functions.  They double as
   indexes into stripformat[] below, so the two must stay in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per selector above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[i]+3)
6708
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006709/* externally visible for str.strip(unicode) */
6710PyObject *
6711_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6712{
6713 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006714 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006715 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006716 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6717 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006718
Thomas Wouters477c8d52006-05-27 19:21:47 +00006719 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6720
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006721 i = 0;
6722 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006723 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6724 i++;
6725 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006726 }
6727
6728 j = len;
6729 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006730 do {
6731 j--;
6732 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6733 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006734 }
6735
6736 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006737 Py_INCREF(self);
6738 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006739 }
6740 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006741 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006742}
6743
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744
6745static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006746do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006748 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006749 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006750
6751 i = 0;
6752 if (striptype != RIGHTSTRIP) {
6753 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6754 i++;
6755 }
6756 }
6757
6758 j = len;
6759 if (striptype != LEFTSTRIP) {
6760 do {
6761 j--;
6762 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6763 j++;
6764 }
6765
6766 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6767 Py_INCREF(self);
6768 return (PyObject*)self;
6769 }
6770 else
6771 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772}
6773
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006774
6775static PyObject *
6776do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6777{
6778 PyObject *sep = NULL;
6779
6780 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6781 return NULL;
6782
6783 if (sep != NULL && sep != Py_None) {
6784 if (PyUnicode_Check(sep))
6785 return _PyUnicode_XStrip(self, striptype, sep);
6786 else if (PyString_Check(sep)) {
6787 PyObject *res;
6788 sep = PyUnicode_FromObject(sep);
6789 if (sep==NULL)
6790 return NULL;
6791 res = _PyUnicode_XStrip(self, striptype, sep);
6792 Py_DECREF(sep);
6793 return res;
6794 }
6795 else {
6796 PyErr_Format(PyExc_TypeError,
6797 "%s arg must be None, unicode or str",
6798 STRIPNAME(striptype));
6799 return NULL;
6800 }
6801 }
6802
6803 return do_strip(self, striptype);
6804}
6805
6806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006807PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006808"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006809\n\
6810Return a copy of the string S with leading and trailing\n\
6811whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006812If chars is given and not None, remove characters in chars instead.\n\
6813If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006814
6815static PyObject *
6816unicode_strip(PyUnicodeObject *self, PyObject *args)
6817{
6818 if (PyTuple_GET_SIZE(args) == 0)
6819 return do_strip(self, BOTHSTRIP); /* Common case */
6820 else
6821 return do_argstrip(self, BOTHSTRIP, args);
6822}
6823
6824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006826"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006827\n\
6828Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006829If chars is given and not None, remove characters in chars instead.\n\
6830If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006831
6832static PyObject *
6833unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6834{
6835 if (PyTuple_GET_SIZE(args) == 0)
6836 return do_strip(self, LEFTSTRIP); /* Common case */
6837 else
6838 return do_argstrip(self, LEFTSTRIP, args);
6839}
6840
6841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006843"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006844\n\
6845Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006846If chars is given and not None, remove characters in chars instead.\n\
6847If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006848
6849static PyObject *
6850unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6851{
6852 if (PyTuple_GET_SIZE(args) == 0)
6853 return do_strip(self, RIGHTSTRIP); /* Common case */
6854 else
6855 return do_argstrip(self, RIGHTSTRIP, args);
6856}
6857
6858
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006860unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861{
6862 PyUnicodeObject *u;
6863 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006864 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006865 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866
6867 if (len < 0)
6868 len = 0;
6869
Tim Peters7a29bd52001-09-12 03:03:31 +00006870 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 /* no repeat, return original string */
6872 Py_INCREF(str);
6873 return (PyObject*) str;
6874 }
Tim Peters8f422462000-09-09 06:13:41 +00006875
6876 /* ensure # of chars needed doesn't overflow int and # of bytes
6877 * needed doesn't overflow size_t
6878 */
6879 nchars = len * str->length;
6880 if (len && nchars / len != str->length) {
6881 PyErr_SetString(PyExc_OverflowError,
6882 "repeated string is too long");
6883 return NULL;
6884 }
6885 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6886 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6887 PyErr_SetString(PyExc_OverflowError,
6888 "repeated string is too long");
6889 return NULL;
6890 }
6891 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 if (!u)
6893 return NULL;
6894
6895 p = u->str;
6896
Thomas Wouters477c8d52006-05-27 19:21:47 +00006897 if (str->length == 1 && len > 0) {
6898 Py_UNICODE_FILL(p, str->str[0], len);
6899 } else {
6900 Py_ssize_t done = 0; /* number of characters copied this far */
6901 if (done < nchars) {
6902 Py_UNICODE_COPY(p, str->str, str->length);
6903 done = str->length;
6904 }
6905 while (done < nchars) {
6906 int n = (done <= nchars-done) ? done : nchars-done;
6907 Py_UNICODE_COPY(p+done, p, n);
6908 done += n;
6909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 }
6911
6912 return (PyObject*) u;
6913}
6914
6915PyObject *PyUnicode_Replace(PyObject *obj,
6916 PyObject *subobj,
6917 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006918 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919{
6920 PyObject *self;
6921 PyObject *str1;
6922 PyObject *str2;
6923 PyObject *result;
6924
6925 self = PyUnicode_FromObject(obj);
6926 if (self == NULL)
6927 return NULL;
6928 str1 = PyUnicode_FromObject(subobj);
6929 if (str1 == NULL) {
6930 Py_DECREF(self);
6931 return NULL;
6932 }
6933 str2 = PyUnicode_FromObject(replobj);
6934 if (str2 == NULL) {
6935 Py_DECREF(self);
6936 Py_DECREF(str1);
6937 return NULL;
6938 }
Tim Petersced69f82003-09-16 20:30:58 +00006939 result = replace((PyUnicodeObject *)self,
6940 (PyUnicodeObject *)str1,
6941 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 maxcount);
6943 Py_DECREF(self);
6944 Py_DECREF(str1);
6945 Py_DECREF(str2);
6946 return result;
6947}
6948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006949PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950"S.replace (old, new[, maxsplit]) -> unicode\n\
6951\n\
6952Return a copy of S with all occurrences of substring\n\
6953old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006954given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955
6956static PyObject*
6957unicode_replace(PyUnicodeObject *self, PyObject *args)
6958{
6959 PyUnicodeObject *str1;
6960 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006961 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 PyObject *result;
6963
Martin v. Löwis18e16552006-02-15 17:27:45 +00006964 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 return NULL;
6966 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6967 if (str1 == NULL)
6968 return NULL;
6969 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006970 if (str2 == NULL) {
6971 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974
6975 result = replace(self, str1, str2, maxcount);
6976
6977 Py_DECREF(str1);
6978 Py_DECREF(str2);
6979 return result;
6980}
6981
6982static
6983PyObject *unicode_repr(PyObject *unicode)
6984{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006985 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006986 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006987 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6988 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6989
6990 /* XXX(nnorwitz): rather than over-allocating, it would be
6991 better to choose a different scheme. Perhaps scan the
6992 first N-chars of the string and allocate based on that size.
6993 */
6994 /* Initial allocation is based on the longest-possible unichr
6995 escape.
6996
6997 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6998 unichr, so in this case it's the longest unichr escape. In
6999 narrow (UTF-16) builds this is five chars per source unichr
7000 since there are two unichrs in the surrogate pair, so in narrow
7001 (UTF-16) builds it's not the longest unichr escape.
7002
7003 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7004 so in the narrow (UTF-16) build case it's the longest unichr
7005 escape.
7006 */
7007
Walter Dörwald1ab83302007-05-18 17:15:44 +00007008 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007009 2 /* quotes */
7010#ifdef Py_UNICODE_WIDE
7011 + 10*size
7012#else
7013 + 6*size
7014#endif
7015 + 1);
7016 if (repr == NULL)
7017 return NULL;
7018
Walter Dörwald1ab83302007-05-18 17:15:44 +00007019 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007020
7021 /* Add quote */
7022 *p++ = (findchar(s, size, '\'') &&
7023 !findchar(s, size, '"')) ? '"' : '\'';
7024 while (size-- > 0) {
7025 Py_UNICODE ch = *s++;
7026
7027 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007028 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007029 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007030 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007031 continue;
7032 }
7033
7034#ifdef Py_UNICODE_WIDE
7035 /* Map 21-bit characters to '\U00xxxxxx' */
7036 else if (ch >= 0x10000) {
7037 *p++ = '\\';
7038 *p++ = 'U';
7039 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7040 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7041 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7042 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7043 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7044 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7045 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7046 *p++ = hexdigits[ch & 0x0000000F];
7047 continue;
7048 }
7049#else
7050 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7051 else if (ch >= 0xD800 && ch < 0xDC00) {
7052 Py_UNICODE ch2;
7053 Py_UCS4 ucs;
7054
7055 ch2 = *s++;
7056 size--;
7057 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7058 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7059 *p++ = '\\';
7060 *p++ = 'U';
7061 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7062 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7063 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7064 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7065 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7066 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7067 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7068 *p++ = hexdigits[ucs & 0x0000000F];
7069 continue;
7070 }
7071 /* Fall through: isolated surrogates are copied as-is */
7072 s--;
7073 size++;
7074 }
7075#endif
7076
7077 /* Map 16-bit characters to '\uxxxx' */
7078 if (ch >= 256) {
7079 *p++ = '\\';
7080 *p++ = 'u';
7081 *p++ = hexdigits[(ch >> 12) & 0x000F];
7082 *p++ = hexdigits[(ch >> 8) & 0x000F];
7083 *p++ = hexdigits[(ch >> 4) & 0x000F];
7084 *p++ = hexdigits[ch & 0x000F];
7085 }
7086
7087 /* Map special whitespace to '\t', \n', '\r' */
7088 else if (ch == '\t') {
7089 *p++ = '\\';
7090 *p++ = 't';
7091 }
7092 else if (ch == '\n') {
7093 *p++ = '\\';
7094 *p++ = 'n';
7095 }
7096 else if (ch == '\r') {
7097 *p++ = '\\';
7098 *p++ = 'r';
7099 }
7100
7101 /* Map non-printable US ASCII to '\xhh' */
7102 else if (ch < ' ' || ch >= 0x7F) {
7103 *p++ = '\\';
7104 *p++ = 'x';
7105 *p++ = hexdigits[(ch >> 4) & 0x000F];
7106 *p++ = hexdigits[ch & 0x000F];
7107 }
7108
7109 /* Copy everything else as-is */
7110 else
7111 *p++ = (char) ch;
7112 }
7113 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007114 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007115
7116 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007117 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007118 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119}
7120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007121PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122"S.rfind(sub [,start [,end]]) -> int\n\
7123\n\
7124Return the highest index in S where substring sub is found,\n\
7125such that sub is contained within s[start,end]. Optional\n\
7126arguments start and end are interpreted as in slice notation.\n\
7127\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007128Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
7130static PyObject *
7131unicode_rfind(PyUnicodeObject *self, PyObject *args)
7132{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007133 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007134 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007135 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007136 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137
Guido van Rossumb8872e62000-05-09 14:14:27 +00007138 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7139 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007141 substring = PyUnicode_FromObject(substring);
7142 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 return NULL;
7144
Thomas Wouters477c8d52006-05-27 19:21:47 +00007145 result = stringlib_rfind_slice(
7146 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7147 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7148 start, end
7149 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
7151 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007152
7153 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154}
7155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007156PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157"S.rindex(sub [,start [,end]]) -> int\n\
7158\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007159Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
7161static PyObject *
7162unicode_rindex(PyUnicodeObject *self, PyObject *args)
7163{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007165 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007166 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168
Guido van Rossumb8872e62000-05-09 14:14:27 +00007169 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7170 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007172 substring = PyUnicode_FromObject(substring);
7173 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 return NULL;
7175
Thomas Wouters477c8d52006-05-27 19:21:47 +00007176 result = stringlib_rfind_slice(
7177 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7178 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7179 start, end
7180 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
7182 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007183
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 if (result < 0) {
7185 PyErr_SetString(PyExc_ValueError, "substring not found");
7186 return NULL;
7187 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007188 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189}
7190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007191PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007192"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193\n\
7194Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007195done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197static PyObject *
7198unicode_rjust(PyUnicodeObject *self, PyObject *args)
7199{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007200 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007201 Py_UNICODE fillchar = ' ';
7202
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007203 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 return NULL;
7205
Tim Peters7a29bd52001-09-12 03:03:31 +00007206 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 Py_INCREF(self);
7208 return (PyObject*) self;
7209 }
7210
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007211 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212}
7213
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007215unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216{
7217 /* standard clamping */
7218 if (start < 0)
7219 start = 0;
7220 if (end < 0)
7221 end = 0;
7222 if (end > self->length)
7223 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007224 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 /* full slice, return original string */
7226 Py_INCREF(self);
7227 return (PyObject*) self;
7228 }
7229 if (start > end)
7230 start = end;
7231 /* copy slice */
7232 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7233 end - start);
7234}
7235
7236PyObject *PyUnicode_Split(PyObject *s,
7237 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007238 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239{
7240 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007241
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 s = PyUnicode_FromObject(s);
7243 if (s == NULL)
7244 return NULL;
7245 if (sep != NULL) {
7246 sep = PyUnicode_FromObject(sep);
7247 if (sep == NULL) {
7248 Py_DECREF(s);
7249 return NULL;
7250 }
7251 }
7252
7253 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7254
7255 Py_DECREF(s);
7256 Py_XDECREF(sep);
7257 return result;
7258}
7259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007260PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261"S.split([sep [,maxsplit]]) -> list of strings\n\
7262\n\
7263Return a list of the words in S, using sep as the\n\
7264delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007265splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007266any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267
7268static PyObject*
7269unicode_split(PyUnicodeObject *self, PyObject *args)
7270{
7271 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007272 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
Martin v. Löwis18e16552006-02-15 17:27:45 +00007274 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 return NULL;
7276
7277 if (substring == Py_None)
7278 return split(self, NULL, maxcount);
7279 else if (PyUnicode_Check(substring))
7280 return split(self, (PyUnicodeObject *)substring, maxcount);
7281 else
7282 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7283}
7284
Thomas Wouters477c8d52006-05-27 19:21:47 +00007285PyObject *
7286PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7287{
7288 PyObject* str_obj;
7289 PyObject* sep_obj;
7290 PyObject* out;
7291
7292 str_obj = PyUnicode_FromObject(str_in);
7293 if (!str_obj)
7294 return NULL;
7295 sep_obj = PyUnicode_FromObject(sep_in);
7296 if (!sep_obj) {
7297 Py_DECREF(str_obj);
7298 return NULL;
7299 }
7300
7301 out = stringlib_partition(
7302 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7303 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7304 );
7305
7306 Py_DECREF(sep_obj);
7307 Py_DECREF(str_obj);
7308
7309 return out;
7310}
7311
7312
7313PyObject *
7314PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7315{
7316 PyObject* str_obj;
7317 PyObject* sep_obj;
7318 PyObject* out;
7319
7320 str_obj = PyUnicode_FromObject(str_in);
7321 if (!str_obj)
7322 return NULL;
7323 sep_obj = PyUnicode_FromObject(sep_in);
7324 if (!sep_obj) {
7325 Py_DECREF(str_obj);
7326 return NULL;
7327 }
7328
7329 out = stringlib_rpartition(
7330 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7331 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7332 );
7333
7334 Py_DECREF(sep_obj);
7335 Py_DECREF(str_obj);
7336
7337 return out;
7338}
7339
7340PyDoc_STRVAR(partition__doc__,
7341"S.partition(sep) -> (head, sep, tail)\n\
7342\n\
7343Searches for the separator sep in S, and returns the part before it,\n\
7344the separator itself, and the part after it. If the separator is not\n\
7345found, returns S and two empty strings.");
7346
7347static PyObject*
7348unicode_partition(PyUnicodeObject *self, PyObject *separator)
7349{
7350 return PyUnicode_Partition((PyObject *)self, separator);
7351}
7352
7353PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007354"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007355\n\
7356Searches for the separator sep in S, starting at the end of S, and returns\n\
7357the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007358separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007359
7360static PyObject*
7361unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7362{
7363 return PyUnicode_RPartition((PyObject *)self, separator);
7364}
7365
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007366PyObject *PyUnicode_RSplit(PyObject *s,
7367 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007368 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007369{
7370 PyObject *result;
7371
7372 s = PyUnicode_FromObject(s);
7373 if (s == NULL)
7374 return NULL;
7375 if (sep != NULL) {
7376 sep = PyUnicode_FromObject(sep);
7377 if (sep == NULL) {
7378 Py_DECREF(s);
7379 return NULL;
7380 }
7381 }
7382
7383 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7384
7385 Py_DECREF(s);
7386 Py_XDECREF(sep);
7387 return result;
7388}
7389
7390PyDoc_STRVAR(rsplit__doc__,
7391"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7392\n\
7393Return a list of the words in S, using sep as the\n\
7394delimiter string, starting at the end of the string and\n\
7395working to the front. If maxsplit is given, at most maxsplit\n\
7396splits are done. If sep is not specified, any whitespace string\n\
7397is a separator.");
7398
7399static PyObject*
7400unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7401{
7402 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007403 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007404
Martin v. Löwis18e16552006-02-15 17:27:45 +00007405 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007406 return NULL;
7407
7408 if (substring == Py_None)
7409 return rsplit(self, NULL, maxcount);
7410 else if (PyUnicode_Check(substring))
7411 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7412 else
7413 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7414}
7415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007416PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007417"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418\n\
7419Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007420Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007421is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422
7423static PyObject*
7424unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7425{
Guido van Rossum86662912000-04-11 15:38:46 +00007426 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427
Guido van Rossum86662912000-04-11 15:38:46 +00007428 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 return NULL;
7430
Guido van Rossum86662912000-04-11 15:38:46 +00007431 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432}
7433
7434static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007435PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436{
Walter Dörwald346737f2007-05-31 10:44:43 +00007437 if (PyUnicode_CheckExact(self)) {
7438 Py_INCREF(self);
7439 return self;
7440 } else
7441 /* Subtype -- return genuine unicode string with the same value. */
7442 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7443 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444}
7445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007446PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447"S.swapcase() -> unicode\n\
7448\n\
7449Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007450and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451
7452static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007453unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 return fixup(self, fixswapcase);
7456}
7457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007458PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459"S.translate(table) -> unicode\n\
7460\n\
7461Return a copy of the string S, where all characters have been mapped\n\
7462through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007463Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7464Unmapped characters are left untouched. Characters mapped to None\n\
7465are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
7467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007468unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469{
Tim Petersced69f82003-09-16 20:30:58 +00007470 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007472 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473 "ignore");
7474}
7475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007476PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477"S.upper() -> unicode\n\
7478\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007479Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480
7481static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007482unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 return fixup(self, fixupper);
7485}
7486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007487PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488"S.zfill(width) -> unicode\n\
7489\n\
7490Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007491of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492
7493static PyObject *
7494unicode_zfill(PyUnicodeObject *self, PyObject *args)
7495{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007496 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 PyUnicodeObject *u;
7498
Martin v. Löwis18e16552006-02-15 17:27:45 +00007499 Py_ssize_t width;
7500 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 return NULL;
7502
7503 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007504 if (PyUnicode_CheckExact(self)) {
7505 Py_INCREF(self);
7506 return (PyObject*) self;
7507 }
7508 else
7509 return PyUnicode_FromUnicode(
7510 PyUnicode_AS_UNICODE(self),
7511 PyUnicode_GET_SIZE(self)
7512 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 }
7514
7515 fill = width - self->length;
7516
7517 u = pad(self, fill, 0, '0');
7518
Walter Dörwald068325e2002-04-15 13:36:47 +00007519 if (u == NULL)
7520 return NULL;
7521
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 if (u->str[fill] == '+' || u->str[fill] == '-') {
7523 /* move sign to beginning of string */
7524 u->str[0] = u->str[fill];
7525 u->str[fill] = '0';
7526 }
7527
7528 return (PyObject*) u;
7529}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530
#if 0
/* Compiled out.  Returns the value of unicode_freelist_size as a Python
   int; self is not used. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(unicode_freelist_size);

    return count;
}
#endif
7538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007539PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007540"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007542Return True if S starts with the specified prefix, False otherwise.\n\
7543With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007544With optional end, stop comparing S at that position.\n\
7545prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
7547static PyObject *
7548unicode_startswith(PyUnicodeObject *self,
7549 PyObject *args)
7550{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007551 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007553 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007554 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007555 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007557 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007558 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007560 if (PyTuple_Check(subobj)) {
7561 Py_ssize_t i;
7562 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7563 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7564 PyTuple_GET_ITEM(subobj, i));
7565 if (substring == NULL)
7566 return NULL;
7567 result = tailmatch(self, substring, start, end, -1);
7568 Py_DECREF(substring);
7569 if (result) {
7570 Py_RETURN_TRUE;
7571 }
7572 }
7573 /* nothing matched */
7574 Py_RETURN_FALSE;
7575 }
7576 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007578 return NULL;
7579 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007581 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582}
7583
7584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007585PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007586"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007588Return True if S ends with the specified suffix, False otherwise.\n\
7589With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007590With optional end, stop comparing S at that position.\n\
7591suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
7593static PyObject *
7594unicode_endswith(PyUnicodeObject *self,
7595 PyObject *args)
7596{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007597 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007599 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007600 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007601 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007603 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7604 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007606 if (PyTuple_Check(subobj)) {
7607 Py_ssize_t i;
7608 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7609 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7610 PyTuple_GET_ITEM(subobj, i));
7611 if (substring == NULL)
7612 return NULL;
7613 result = tailmatch(self, substring, start, end, +1);
7614 Py_DECREF(substring);
7615 if (result) {
7616 Py_RETURN_TRUE;
7617 }
7618 }
7619 Py_RETURN_FALSE;
7620 }
7621 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007625 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007627 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628}
7629
7630
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007631
7632static PyObject *
7633unicode_getnewargs(PyUnicodeObject *v)
7634{
7635 return Py_BuildValue("(u#)", v->str, v->length);
7636}
7637
7638
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639static PyMethodDef unicode_methods[] = {
7640
7641 /* Order is according to common usage: often used methods should
7642 appear first, since lookup is done sequentially. */
7643
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007644 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7645 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7646 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007647 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007648 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7649 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7650 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7651 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7652 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7653 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7654 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007655 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007656 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7657 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7658 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007659 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007660 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007661/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7662 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7663 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7664 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007666 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007667 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007668 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007669 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7670 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7671 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7672 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7673 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7674 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7675 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7676 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7677 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7678 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7679 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7680 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7681 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7682 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007683 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007684#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007685 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686#endif
7687
7688#if 0
7689 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007690 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691#endif
7692
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007693 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 {NULL, NULL}
7695};
7696
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007697static PyObject *
7698unicode_mod(PyObject *v, PyObject *w)
7699{
7700 if (!PyUnicode_Check(v)) {
7701 Py_INCREF(Py_NotImplemented);
7702 return Py_NotImplemented;
7703 }
7704 return PyUnicode_Format(v, w);
7705}
7706
7707static PyNumberMethods unicode_as_number = {
7708 0, /*nb_add*/
7709 0, /*nb_subtract*/
7710 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007711 unicode_mod, /*nb_remainder*/
7712};
7713
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007715 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007716 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007717 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7718 (ssizeargfunc) unicode_getitem, /* sq_item */
7719 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 0, /* sq_ass_item */
7721 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007722 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723};
7724
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007725static PyObject*
7726unicode_subscript(PyUnicodeObject* self, PyObject* item)
7727{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007728 if (PyIndex_Check(item)) {
7729 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007730 if (i == -1 && PyErr_Occurred())
7731 return NULL;
7732 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007733 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007734 return unicode_getitem(self, i);
7735 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007736 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007737 Py_UNICODE* source_buf;
7738 Py_UNICODE* result_buf;
7739 PyObject* result;
7740
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007741 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007742 &start, &stop, &step, &slicelength) < 0) {
7743 return NULL;
7744 }
7745
7746 if (slicelength <= 0) {
7747 return PyUnicode_FromUnicode(NULL, 0);
7748 } else {
7749 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007750 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7751 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007752
7753 if (result_buf == NULL)
7754 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007755
7756 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7757 result_buf[i] = source_buf[cur];
7758 }
Tim Petersced69f82003-09-16 20:30:58 +00007759
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007760 result = PyUnicode_FromUnicode(result_buf, slicelength);
7761 PyMem_FREE(result_buf);
7762 return result;
7763 }
7764 } else {
7765 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7766 return NULL;
7767 }
7768}
7769
7770static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007771 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007772 (binaryfunc)unicode_subscript, /* mp_subscript */
7773 (objobjargproc)0, /* mp_ass_subscript */
7774};
7775
Martin v. Löwis18e16552006-02-15 17:27:45 +00007776static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007778 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 const void **ptr)
7780{
7781 if (index != 0) {
7782 PyErr_SetString(PyExc_SystemError,
7783 "accessing non-existent unicode segment");
7784 return -1;
7785 }
7786 *ptr = (void *) self->str;
7787 return PyUnicode_GET_DATA_SIZE(self);
7788}
7789
Martin v. Löwis18e16552006-02-15 17:27:45 +00007790static Py_ssize_t
7791unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 const void **ptr)
7793{
7794 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007795 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 return -1;
7797}
7798
7799static int
7800unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802{
7803 if (lenp)
7804 *lenp = PyUnicode_GET_DATA_SIZE(self);
7805 return 1;
7806}
7807
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007808static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007810 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 const void **ptr)
7812{
7813 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007814
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 if (index != 0) {
7816 PyErr_SetString(PyExc_SystemError,
7817 "accessing non-existent unicode segment");
7818 return -1;
7819 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007820 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 if (str == NULL)
7822 return -1;
7823 *ptr = (void *) PyString_AS_STRING(str);
7824 return PyString_GET_SIZE(str);
7825}
7826
7827/* Helpers for PyUnicode_Format() */
7828
7829static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007830getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007832 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 if (argidx < arglen) {
7834 (*p_argidx)++;
7835 if (arglen < 0)
7836 return args;
7837 else
7838 return PyTuple_GetItem(args, argidx);
7839 }
7840 PyErr_SetString(PyExc_TypeError,
7841 "not enough arguments for format string");
7842 return NULL;
7843}
7844
/* Bit flags collected by PyUnicode_Format() while parsing a conversion
   specifier; each is set by one flag character. */
#define F_LJUST 0x01    /* '-' : left-justify within the field width */
#define F_SIGN  0x02    /* '+' : always emit a sign */
#define F_BLANK 0x04    /* ' ' : emit a blank where a '+' would go */
#define F_ALT   0x08    /* '#' : alternate form */
#define F_ZERO  0x10    /* '0' : pad numeric conversions with zeros */
7850
Martin v. Löwis18e16552006-02-15 17:27:45 +00007851static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007852strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007854 register Py_ssize_t i;
7855 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 for (i = len - 1; i >= 0; i--)
7857 buffer[i] = (Py_UNICODE) charbuffer[i];
7858
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 return len;
7860}
7861
Neal Norwitzfc76d632006-01-10 06:03:13 +00007862static int
7863doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7864{
Tim Peters15231542006-02-16 01:08:01 +00007865 Py_ssize_t result;
7866
Neal Norwitzfc76d632006-01-10 06:03:13 +00007867 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007868 result = strtounicode(buffer, (char *)buffer);
7869 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007870}
7871
7872static int
7873longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7874{
Tim Peters15231542006-02-16 01:08:01 +00007875 Py_ssize_t result;
7876
Neal Norwitzfc76d632006-01-10 06:03:13 +00007877 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007878 result = strtounicode(buffer, (char *)buffer);
7879 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007880}
7881
Guido van Rossum078151d2002-08-11 04:24:12 +00007882/* XXX To save some code duplication, formatfloat/long/int could have been
7883 shared with stringobject.c, converting from 8-bit to Unicode after the
7884 formatting is done. */
7885
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886static int
7887formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007888 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889 int flags,
7890 int prec,
7891 int type,
7892 PyObject *v)
7893{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007894 /* fmt = '%#.' + `prec` + `type`
7895 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 char fmt[20];
7897 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007898
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 x = PyFloat_AsDouble(v);
7900 if (x == -1.0 && PyErr_Occurred())
7901 return -1;
7902 if (prec < 0)
7903 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7905 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007906 /* Worst case length calc to ensure no buffer overrun:
7907
7908 'g' formats:
7909 fmt = %#.<prec>g
7910 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7911 for any double rep.)
7912 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7913
7914 'f' formats:
7915 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7916 len = 1 + 50 + 1 + prec = 52 + prec
7917
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007918 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007919 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007920
7921 */
7922 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7923 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007924 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007925 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007926 return -1;
7927 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007928 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7929 (flags&F_ALT) ? "#" : "",
7930 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007931 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932}
7933
Tim Peters38fd5b62000-09-21 05:43:11 +00007934static PyObject*
7935formatlong(PyObject *val, int flags, int prec, int type)
7936{
7937 char *buf;
7938 int i, len;
7939 PyObject *str; /* temporary string object. */
7940 PyUnicodeObject *result;
7941
7942 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7943 if (!str)
7944 return NULL;
7945 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007946 if (!result) {
7947 Py_DECREF(str);
7948 return NULL;
7949 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007950 for (i = 0; i < len; i++)
7951 result->str[i] = buf[i];
7952 result->str[len] = 0;
7953 Py_DECREF(str);
7954 return (PyObject*)result;
7955}
7956
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957static int
7958formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007959 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 int flags,
7961 int prec,
7962 int type,
7963 PyObject *v)
7964{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007965 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007966 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7967 * + 1 + 1
7968 * = 24
7969 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007970 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007971 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 long x;
7973
7974 x = PyInt_AsLong(v);
7975 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007976 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007977 if (x < 0 && type == 'u') {
7978 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007979 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007980 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7981 sign = "-";
7982 else
7983 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007985 prec = 1;
7986
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007987 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7988 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007989 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007990 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007991 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007992 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007993 return -1;
7994 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007995
7996 if ((flags & F_ALT) &&
7997 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007998 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007999 * of issues that cause pain:
8000 * - when 0 is being converted, the C standard leaves off
8001 * the '0x' or '0X', which is inconsistent with other
8002 * %#x/%#X conversions and inconsistent with Python's
8003 * hex() function
8004 * - there are platforms that violate the standard and
8005 * convert 0 with the '0x' or '0X'
8006 * (Metrowerks, Compaq Tru64)
8007 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008008 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008009 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008010 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008011 * We can achieve the desired consistency by inserting our
8012 * own '0x' or '0X' prefix, and substituting %x/%X in place
8013 * of %#x/%#X.
8014 *
8015 * Note that this is the same approach as used in
8016 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008017 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008018 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8019 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008020 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008021 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008022 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8023 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008024 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008025 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008026 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008027 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008028 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008029 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030}
8031
8032static int
8033formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008034 size_t buflen,
8035 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008037 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008038 if (PyUnicode_Check(v)) {
8039 if (PyUnicode_GET_SIZE(v) != 1)
8040 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008044 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008045 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008046 goto onError;
8047 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049
8050 else {
8051 /* Integer input truncated to a character */
8052 long x;
8053 x = PyInt_AsLong(v);
8054 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008055 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008056#ifdef Py_UNICODE_WIDE
8057 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008058 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008059 "%c arg not in range(0x110000) "
8060 "(wide Python build)");
8061 return -1;
8062 }
8063#else
8064 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008065 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008066 "%c arg not in range(0x10000) "
8067 "(narrow Python build)");
8068 return -1;
8069 }
8070#endif
8071 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 }
8073 buf[1] = '\0';
8074 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008075
8076 onError:
8077 PyErr_SetString(PyExc_TypeError,
8078 "%c requires int or char");
8079 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080}
8081
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE units) of the buffer in which
   the floats, ints, & chars are formatted.  XXX This is a magic number.
   Each formatting routine does bounds checking to ensure no overflow, but
   a better solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8091
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092PyObject *PyUnicode_Format(PyObject *format,
8093 PyObject *args)
8094{
8095 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008096 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 int args_owned = 0;
8098 PyUnicodeObject *result = NULL;
8099 PyObject *dict = NULL;
8100 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008101
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 if (format == NULL || args == NULL) {
8103 PyErr_BadInternalCall();
8104 return NULL;
8105 }
8106 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008107 if (uformat == NULL)
8108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 fmt = PyUnicode_AS_UNICODE(uformat);
8110 fmtcnt = PyUnicode_GET_SIZE(uformat);
8111
8112 reslen = rescnt = fmtcnt + 100;
8113 result = _PyUnicode_New(reslen);
8114 if (result == NULL)
8115 goto onError;
8116 res = PyUnicode_AS_UNICODE(result);
8117
8118 if (PyTuple_Check(args)) {
8119 arglen = PyTuple_Size(args);
8120 argidx = 0;
8121 }
8122 else {
8123 arglen = -1;
8124 argidx = -2;
8125 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008126 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8127 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 dict = args;
8129
8130 while (--fmtcnt >= 0) {
8131 if (*fmt != '%') {
8132 if (--rescnt < 0) {
8133 rescnt = fmtcnt + 100;
8134 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008135 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008136 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8138 --rescnt;
8139 }
8140 *res++ = *fmt++;
8141 }
8142 else {
8143 /* Got a format specifier */
8144 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 Py_UNICODE c = '\0';
8148 Py_UNICODE fill;
8149 PyObject *v = NULL;
8150 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008151 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008153 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008154 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155
8156 fmt++;
8157 if (*fmt == '(') {
8158 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 PyObject *key;
8161 int pcount = 1;
8162
8163 if (dict == NULL) {
8164 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008165 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 goto onError;
8167 }
8168 ++fmt;
8169 --fmtcnt;
8170 keystart = fmt;
8171 /* Skip over balanced parentheses */
8172 while (pcount > 0 && --fmtcnt >= 0) {
8173 if (*fmt == ')')
8174 --pcount;
8175 else if (*fmt == '(')
8176 ++pcount;
8177 fmt++;
8178 }
8179 keylen = fmt - keystart - 1;
8180 if (fmtcnt < 0 || pcount > 0) {
8181 PyErr_SetString(PyExc_ValueError,
8182 "incomplete format key");
8183 goto onError;
8184 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008185#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008186 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 then looked up since Python uses strings to hold
8188 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008189 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 key = PyUnicode_EncodeUTF8(keystart,
8191 keylen,
8192 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008193#else
8194 key = PyUnicode_FromUnicode(keystart, keylen);
8195#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 if (key == NULL)
8197 goto onError;
8198 if (args_owned) {
8199 Py_DECREF(args);
8200 args_owned = 0;
8201 }
8202 args = PyObject_GetItem(dict, key);
8203 Py_DECREF(key);
8204 if (args == NULL) {
8205 goto onError;
8206 }
8207 args_owned = 1;
8208 arglen = -1;
8209 argidx = -2;
8210 }
8211 while (--fmtcnt >= 0) {
8212 switch (c = *fmt++) {
8213 case '-': flags |= F_LJUST; continue;
8214 case '+': flags |= F_SIGN; continue;
8215 case ' ': flags |= F_BLANK; continue;
8216 case '#': flags |= F_ALT; continue;
8217 case '0': flags |= F_ZERO; continue;
8218 }
8219 break;
8220 }
8221 if (c == '*') {
8222 v = getnextarg(args, arglen, &argidx);
8223 if (v == NULL)
8224 goto onError;
8225 if (!PyInt_Check(v)) {
8226 PyErr_SetString(PyExc_TypeError,
8227 "* wants int");
8228 goto onError;
8229 }
8230 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008231 if (width == -1 && PyErr_Occurred())
8232 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 if (width < 0) {
8234 flags |= F_LJUST;
8235 width = -width;
8236 }
8237 if (--fmtcnt >= 0)
8238 c = *fmt++;
8239 }
8240 else if (c >= '0' && c <= '9') {
8241 width = c - '0';
8242 while (--fmtcnt >= 0) {
8243 c = *fmt++;
8244 if (c < '0' || c > '9')
8245 break;
8246 if ((width*10) / 10 != width) {
8247 PyErr_SetString(PyExc_ValueError,
8248 "width too big");
8249 goto onError;
8250 }
8251 width = width*10 + (c - '0');
8252 }
8253 }
8254 if (c == '.') {
8255 prec = 0;
8256 if (--fmtcnt >= 0)
8257 c = *fmt++;
8258 if (c == '*') {
8259 v = getnextarg(args, arglen, &argidx);
8260 if (v == NULL)
8261 goto onError;
8262 if (!PyInt_Check(v)) {
8263 PyErr_SetString(PyExc_TypeError,
8264 "* wants int");
8265 goto onError;
8266 }
8267 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008268 if (prec == -1 && PyErr_Occurred())
8269 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 if (prec < 0)
8271 prec = 0;
8272 if (--fmtcnt >= 0)
8273 c = *fmt++;
8274 }
8275 else if (c >= '0' && c <= '9') {
8276 prec = c - '0';
8277 while (--fmtcnt >= 0) {
8278 c = Py_CHARMASK(*fmt++);
8279 if (c < '0' || c > '9')
8280 break;
8281 if ((prec*10) / 10 != prec) {
8282 PyErr_SetString(PyExc_ValueError,
8283 "prec too big");
8284 goto onError;
8285 }
8286 prec = prec*10 + (c - '0');
8287 }
8288 }
8289 } /* prec */
8290 if (fmtcnt >= 0) {
8291 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 if (--fmtcnt >= 0)
8293 c = *fmt++;
8294 }
8295 }
8296 if (fmtcnt < 0) {
8297 PyErr_SetString(PyExc_ValueError,
8298 "incomplete format");
8299 goto onError;
8300 }
8301 if (c != '%') {
8302 v = getnextarg(args, arglen, &argidx);
8303 if (v == NULL)
8304 goto onError;
8305 }
8306 sign = 0;
8307 fill = ' ';
8308 switch (c) {
8309
8310 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008311 pbuf = formatbuf;
8312 /* presume that buffer length is at least 1 */
8313 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 len = 1;
8315 break;
8316
8317 case 's':
8318 case 'r':
8319 if (PyUnicode_Check(v) && c == 's') {
8320 temp = v;
8321 Py_INCREF(temp);
8322 }
8323 else {
8324 PyObject *unicode;
8325 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008326 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 else
8328 temp = PyObject_Repr(v);
8329 if (temp == NULL)
8330 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008331 if (PyUnicode_Check(temp))
8332 /* nothing to do */;
8333 else if (PyString_Check(temp)) {
8334 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008335 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008337 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008339 Py_DECREF(temp);
8340 temp = unicode;
8341 if (temp == NULL)
8342 goto onError;
8343 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008344 else {
8345 Py_DECREF(temp);
8346 PyErr_SetString(PyExc_TypeError,
8347 "%s argument has non-string str()");
8348 goto onError;
8349 }
8350 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008351 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 len = PyUnicode_GET_SIZE(temp);
8353 if (prec >= 0 && len > prec)
8354 len = prec;
8355 break;
8356
8357 case 'i':
8358 case 'd':
8359 case 'u':
8360 case 'o':
8361 case 'x':
8362 case 'X':
8363 if (c == 'i')
8364 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008365 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008366 temp = formatlong(v, flags, prec, c);
8367 if (!temp)
8368 goto onError;
8369 pbuf = PyUnicode_AS_UNICODE(temp);
8370 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008371 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008373 else {
8374 pbuf = formatbuf;
8375 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8376 flags, prec, c, v);
8377 if (len < 0)
8378 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008379 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008380 }
8381 if (flags & F_ZERO)
8382 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 break;
8384
8385 case 'e':
8386 case 'E':
8387 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008388 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 case 'g':
8390 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008391 if (c == 'F')
8392 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008393 pbuf = formatbuf;
8394 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8395 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 if (len < 0)
8397 goto onError;
8398 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008399 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 fill = '0';
8401 break;
8402
8403 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008404 pbuf = formatbuf;
8405 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 if (len < 0)
8407 goto onError;
8408 break;
8409
8410 default:
8411 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008412 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008413 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008414 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008415 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008416 (Py_ssize_t)(fmt - 1 -
8417 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418 goto onError;
8419 }
8420 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008421 if (*pbuf == '-' || *pbuf == '+') {
8422 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 len--;
8424 }
8425 else if (flags & F_SIGN)
8426 sign = '+';
8427 else if (flags & F_BLANK)
8428 sign = ' ';
8429 else
8430 sign = 0;
8431 }
8432 if (width < len)
8433 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008434 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 reslen -= rescnt;
8436 rescnt = width + fmtcnt + 100;
8437 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008438 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008439 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008440 PyErr_NoMemory();
8441 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008442 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008443 if (_PyUnicode_Resize(&result, reslen) < 0) {
8444 Py_XDECREF(temp);
8445 goto onError;
8446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 res = PyUnicode_AS_UNICODE(result)
8448 + reslen - rescnt;
8449 }
8450 if (sign) {
8451 if (fill != ' ')
8452 *res++ = sign;
8453 rescnt--;
8454 if (width > len)
8455 width--;
8456 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008457 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8458 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008459 assert(pbuf[1] == c);
8460 if (fill != ' ') {
8461 *res++ = *pbuf++;
8462 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008463 }
Tim Petersfff53252001-04-12 18:38:48 +00008464 rescnt -= 2;
8465 width -= 2;
8466 if (width < 0)
8467 width = 0;
8468 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 if (width > len && !(flags & F_LJUST)) {
8471 do {
8472 --rescnt;
8473 *res++ = fill;
8474 } while (--width > len);
8475 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008476 if (fill == ' ') {
8477 if (sign)
8478 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008479 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008480 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008481 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008482 *res++ = *pbuf++;
8483 *res++ = *pbuf++;
8484 }
8485 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008486 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 res += len;
8488 rescnt -= len;
8489 while (--width >= len) {
8490 --rescnt;
8491 *res++ = ' ';
8492 }
8493 if (dict && (argidx < arglen) && c != '%') {
8494 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008495 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008496 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 goto onError;
8498 }
8499 Py_XDECREF(temp);
8500 } /* '%' */
8501 } /* until end */
8502 if (argidx < arglen && !dict) {
8503 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008504 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505 goto onError;
8506 }
8507
Thomas Woutersa96affe2006-03-12 00:29:36 +00008508 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8509 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 if (args_owned) {
8511 Py_DECREF(args);
8512 }
8513 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 return (PyObject *)result;
8515
8516 onError:
8517 Py_XDECREF(result);
8518 Py_DECREF(uformat);
8519 if (args_owned) {
8520 Py_DECREF(args);
8521 }
8522 return NULL;
8523}
8524
8525static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008526 (readbufferproc) unicode_buffer_getreadbuf,
8527 (writebufferproc) unicode_buffer_getwritebuf,
8528 (segcountproc) unicode_buffer_getsegcount,
8529 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530};
8531
Jeremy Hylton938ace62002-07-17 16:30:39 +00008532static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008533unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8534
Tim Peters6d6c1a32001-08-02 04:15:00 +00008535static PyObject *
8536unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8537{
8538 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008539 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008540 char *encoding = NULL;
8541 char *errors = NULL;
8542
Guido van Rossume023fe02001-08-30 03:12:59 +00008543 if (type != &PyUnicode_Type)
8544 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008545 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8546 kwlist, &x, &encoding, &errors))
8547 return NULL;
8548 if (x == NULL)
8549 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008550 if (encoding == NULL && errors == NULL)
8551 return PyObject_Unicode(x);
8552 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008553 return PyUnicode_FromEncodedObject(x, encoding, errors);
8554}
8555
Guido van Rossume023fe02001-08-30 03:12:59 +00008556static PyObject *
8557unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8558{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008559 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008560 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008561
8562 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8563 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8564 if (tmp == NULL)
8565 return NULL;
8566 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008567 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008568 if (pnew == NULL) {
8569 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008570 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008571 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008572 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8573 if (pnew->str == NULL) {
8574 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008575 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008576 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008577 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008578 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008579 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8580 pnew->length = n;
8581 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008582 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008583 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008584}
8585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008586PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008587"unicode(string [, encoding[, errors]]) -> object\n\
8588\n\
8589Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008590encoding defaults to the current default string encoding.\n\
8591errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008592
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008593static PyObject *unicode_iter(PyObject *seq);
8594
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595PyTypeObject PyUnicode_Type = {
8596 PyObject_HEAD_INIT(&PyType_Type)
8597 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008598 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 sizeof(PyUnicodeObject), /* tp_size */
8600 0, /* tp_itemsize */
8601 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008602 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008604 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008606 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008607 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008608 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008610 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 (hashfunc) unicode_hash, /* tp_hash*/
8612 0, /* tp_call*/
8613 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008614 PyObject_GenericGetAttr, /* tp_getattro */
8615 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008617 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8618 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008619 unicode_doc, /* tp_doc */
8620 0, /* tp_traverse */
8621 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008622 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008623 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008624 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008625 0, /* tp_iternext */
8626 unicode_methods, /* tp_methods */
8627 0, /* tp_members */
8628 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008629 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008630 0, /* tp_dict */
8631 0, /* tp_descr_get */
8632 0, /* tp_descr_set */
8633 0, /* tp_dictoffset */
8634 0, /* tp_init */
8635 0, /* tp_alloc */
8636 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008637 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638};
8639
8640/* Initialize the Unicode implementation */
8641
Thomas Wouters78890102000-07-22 19:25:51 +00008642void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008644 int i;
8645
Thomas Wouters477c8d52006-05-27 19:21:47 +00008646 /* XXX - move this array to unicodectype.c ? */
8647 Py_UNICODE linebreak[] = {
8648 0x000A, /* LINE FEED */
8649 0x000D, /* CARRIAGE RETURN */
8650 0x001C, /* FILE SEPARATOR */
8651 0x001D, /* GROUP SEPARATOR */
8652 0x001E, /* RECORD SEPARATOR */
8653 0x0085, /* NEXT LINE */
8654 0x2028, /* LINE SEPARATOR */
8655 0x2029, /* PARAGRAPH SEPARATOR */
8656 };
8657
Fred Drakee4315f52000-05-09 19:53:39 +00008658 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008659 unicode_freelist = NULL;
8660 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008662 if (!unicode_empty)
8663 return;
8664
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008665 for (i = 0; i < 256; i++)
8666 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008667 if (PyType_Ready(&PyUnicode_Type) < 0)
8668 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008669
8670 /* initialize the linebreak bloom filter */
8671 bloom_linebreak = make_bloom_mask(
8672 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8673 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008674
8675 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676}
8677
8678/* Finalize the Unicode implementation */
8679
8680void
Thomas Wouters78890102000-07-22 19:25:51 +00008681_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008683 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008684 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008686 Py_XDECREF(unicode_empty);
8687 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008688
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008689 for (i = 0; i < 256; i++) {
8690 if (unicode_latin1[i]) {
8691 Py_DECREF(unicode_latin1[i]);
8692 unicode_latin1[i] = NULL;
8693 }
8694 }
8695
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008696 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 PyUnicodeObject *v = u;
8698 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008699 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008700 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008701 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008702 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008704 unicode_freelist = NULL;
8705 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008707
Walter Dörwald16807132007-05-25 13:52:07 +00008708void
8709PyUnicode_InternInPlace(PyObject **p)
8710{
8711 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8712 PyObject *t;
8713 if (s == NULL || !PyUnicode_Check(s))
8714 Py_FatalError(
8715 "PyUnicode_InternInPlace: unicode strings only please!");
8716 /* If it's a subclass, we don't really know what putting
8717 it in the interned dict might do. */
8718 if (!PyUnicode_CheckExact(s))
8719 return;
8720 if (PyUnicode_CHECK_INTERNED(s))
8721 return;
8722 if (interned == NULL) {
8723 interned = PyDict_New();
8724 if (interned == NULL) {
8725 PyErr_Clear(); /* Don't leave an exception */
8726 return;
8727 }
8728 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008729 /* It might be that the GetItem call fails even
8730 though the key is present in the dictionary,
8731 namely when this happens during a stack overflow. */
8732 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008733 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008734 Py_END_ALLOW_RECURSION
8735
Walter Dörwald16807132007-05-25 13:52:07 +00008736 if (t) {
8737 Py_INCREF(t);
8738 Py_DECREF(*p);
8739 *p = t;
8740 return;
8741 }
8742
Martin v. Löwis5b222132007-06-10 09:51:05 +00008743 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008744 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8745 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008746 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008747 return;
8748 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008749 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008750 /* The two references in interned are not counted by refcnt.
8751 The deallocator will take care of this */
8752 s->ob_refcnt -= 2;
8753 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8754}
8755
8756void
8757PyUnicode_InternImmortal(PyObject **p)
8758{
8759 PyUnicode_InternInPlace(p);
8760 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8761 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8762 Py_INCREF(*p);
8763 }
8764}
8765
8766PyObject *
8767PyUnicode_InternFromString(const char *cp)
8768{
8769 PyObject *s = PyUnicode_FromString(cp);
8770 if (s == NULL)
8771 return NULL;
8772 PyUnicode_InternInPlace(&s);
8773 return s;
8774}
8775
8776void _Py_ReleaseInternedUnicodeStrings(void)
8777{
8778 PyObject *keys;
8779 PyUnicodeObject *s;
8780 Py_ssize_t i, n;
8781 Py_ssize_t immortal_size = 0, mortal_size = 0;
8782
8783 if (interned == NULL || !PyDict_Check(interned))
8784 return;
8785 keys = PyDict_Keys(interned);
8786 if (keys == NULL || !PyList_Check(keys)) {
8787 PyErr_Clear();
8788 return;
8789 }
8790
8791 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8792 detector, interned unicode strings are not forcibly deallocated;
8793 rather, we give them their stolen references back, and then clear
8794 and DECREF the interned dict. */
8795
8796 n = PyList_GET_SIZE(keys);
8797 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8798 n);
8799 for (i = 0; i < n; i++) {
8800 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8801 switch (s->state) {
8802 case SSTATE_NOT_INTERNED:
8803 /* XXX Shouldn't happen */
8804 break;
8805 case SSTATE_INTERNED_IMMORTAL:
8806 s->ob_refcnt += 1;
8807 immortal_size += s->length;
8808 break;
8809 case SSTATE_INTERNED_MORTAL:
8810 s->ob_refcnt += 2;
8811 mortal_size += s->length;
8812 break;
8813 default:
8814 Py_FatalError("Inconsistent interned string state.");
8815 }
8816 s->state = SSTATE_NOT_INTERNED;
8817 }
8818 fprintf(stderr, "total size of all interned strings: "
8819 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8820 "mortal/immortal\n", mortal_size, immortal_size);
8821 Py_DECREF(keys);
8822 PyDict_Clear(interned);
8823 Py_DECREF(interned);
8824 interned = NULL;
8825}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008826
8827
8828/********************* Unicode Iterator **************************/
8829
8830typedef struct {
8831 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008832 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008833 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8834} unicodeiterobject;
8835
8836static void
8837unicodeiter_dealloc(unicodeiterobject *it)
8838{
8839 _PyObject_GC_UNTRACK(it);
8840 Py_XDECREF(it->it_seq);
8841 PyObject_GC_Del(it);
8842}
8843
8844static int
8845unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8846{
8847 Py_VISIT(it->it_seq);
8848 return 0;
8849}
8850
8851static PyObject *
8852unicodeiter_next(unicodeiterobject *it)
8853{
8854 PyUnicodeObject *seq;
8855 PyObject *item;
8856
8857 assert(it != NULL);
8858 seq = it->it_seq;
8859 if (seq == NULL)
8860 return NULL;
8861 assert(PyUnicode_Check(seq));
8862
8863 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008864 item = PyUnicode_FromUnicode(
8865 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008866 if (item != NULL)
8867 ++it->it_index;
8868 return item;
8869 }
8870
8871 Py_DECREF(seq);
8872 it->it_seq = NULL;
8873 return NULL;
8874}
8875
8876static PyObject *
8877unicodeiter_len(unicodeiterobject *it)
8878{
8879 Py_ssize_t len = 0;
8880 if (it->it_seq)
8881 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8882 return PyInt_FromSsize_t(len);
8883}
8884
8885PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8886
8887static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008888 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8889 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008890 {NULL, NULL} /* sentinel */
8891};
8892
8893PyTypeObject PyUnicodeIter_Type = {
8894 PyObject_HEAD_INIT(&PyType_Type)
8895 0, /* ob_size */
8896 "unicodeiterator", /* tp_name */
8897 sizeof(unicodeiterobject), /* tp_basicsize */
8898 0, /* tp_itemsize */
8899 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008900 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008901 0, /* tp_print */
8902 0, /* tp_getattr */
8903 0, /* tp_setattr */
8904 0, /* tp_compare */
8905 0, /* tp_repr */
8906 0, /* tp_as_number */
8907 0, /* tp_as_sequence */
8908 0, /* tp_as_mapping */
8909 0, /* tp_hash */
8910 0, /* tp_call */
8911 0, /* tp_str */
8912 PyObject_GenericGetAttr, /* tp_getattro */
8913 0, /* tp_setattro */
8914 0, /* tp_as_buffer */
8915 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8916 0, /* tp_doc */
8917 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8918 0, /* tp_clear */
8919 0, /* tp_richcompare */
8920 0, /* tp_weaklistoffset */
8921 PyObject_SelfIter, /* tp_iter */
8922 (iternextfunc)unicodeiter_next, /* tp_iternext */
8923 unicodeiter_methods, /* tp_methods */
8924 0,
8925};
8926
8927static PyObject *
8928unicode_iter(PyObject *seq)
8929{
8930 unicodeiterobject *it;
8931
8932 if (!PyUnicode_Check(seq)) {
8933 PyErr_BadInternalCall();
8934 return NULL;
8935 }
8936 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8937 if (it == NULL)
8938 return NULL;
8939 it->it_index = 0;
8940 Py_INCREF(seq);
8941 it->it_seq = (PyUnicodeObject *)seq;
8942 _PyObject_GC_TRACK(it);
8943 return (PyObject *)it;
8944}
8945
Martin v. Löwis5b222132007-06-10 09:51:05 +00008946size_t
8947Py_UNICODE_strlen(const Py_UNICODE *u)
8948{
8949 int res = 0;
8950 while(*u++)
8951 res++;
8952 return res;
8953}
8954
8955Py_UNICODE*
8956Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8957{
8958 Py_UNICODE *u = s1;
8959 while ((*u++ = *s2++));
8960 return s1;
8961}
8962
8963Py_UNICODE*
8964Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8965{
8966 Py_UNICODE *u = s1;
8967 while ((*u++ = *s2++))
8968 if (n-- == 0)
8969 break;
8970 return s1;
8971}
8972
8973int
8974Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8975{
8976 while (*s1 && *s2 && *s1 == *s2)
8977 s1++, s2++;
8978 if (*s1 && *s2)
8979 return (*s1 < *s2) ? -1 : +1;
8980 if (*s1)
8981 return 1;
8982 if (*s2)
8983 return -1;
8984 return 0;
8985}
8986
8987Py_UNICODE*
8988Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
8989{
8990 const Py_UNICODE *p;
8991 for (p = s; *p; p++)
8992 if (*p == c)
8993 return (Py_UNICODE*)p;
8994 return NULL;
8995}
8996
8997
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008998#ifdef __cplusplus
8999}
9000#endif
9001
9002
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009003/*
9004Local variables:
9005c-basic-offset: 4
9006indent-tabs-mode: nil
9007End:
9008*/