blob: 3777991e2d5ce2ac10b21eec955fdd316336e6b3 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
*/
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the least
   significant 5 bits of each Unicode character selecting the bit index. */

/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for the line break characters; it is filled in by the
   Unicode initialization code elsewhere in this file. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.  The shifted
   constant is an unsigned long so that bit 31 can be reached without
   overflowing a signed int, and mask is parenthesized so that compound
   expressions such as (a | b) are tested as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch passes the line break bloom filter and
   Py_UNICODE_ISLINEBREAK() confirms it. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Non-zero if chr passes the bloom pre-filter in mask and
   unicode_member() confirms that it is in set.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000341 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
/* Internal API for use in unicodeobject.c only !
   Casts unicodevar to PyObject ** and forwards to PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000441 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
446 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000447 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
Martin v. Löwis5b222132007-06-10 09:51:05 +0000461 while (size--)
462 *p++ = *u++;
463 /* Don't need to write trailing 0 because
464 that's already done by _PyUnicode_New */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
467 return (PyObject *)unicode;
468}
469
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470PyObject *PyUnicode_FromString(const char *u)
471{
472 size_t size = strlen(u);
473 if (size > PY_SSIZE_T_MAX) {
474 PyErr_SetString(PyExc_OverflowError, "input too long");
475 return NULL;
476 }
477
478 return PyUnicode_FromStringAndSize(u, size);
479}
480
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485{
486 PyUnicodeObject *unicode;
487
488 if (w == NULL) {
489 PyErr_BadInternalCall();
490 return NULL;
491 }
492
493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000500#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 {
502 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000505 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 *u++ = *w++;
507 }
508#endif
509
510 return (PyObject *)unicode;
511}
512
Walter Dörwald346737f2007-05-31 10:44:43 +0000513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516 *fmt++ = '%';
517 if (width) {
518 if (zeropad)
519 *fmt++ = '0';
520 fmt += sprintf(fmt, "%d", width);
521 }
522 if (precision)
523 fmt += sprintf(fmt, ".%d", precision);
524 if (longflag)
525 *fmt++ = 'l';
526 else if (size_tflag) {
527 char *f = PY_FORMAT_SIZE_T;
528 while (*f)
529 *fmt++ = *f++;
530 }
531 *fmt++ = c;
532 *fmt = '\0';
533}
534
/* Append the NUL-terminated char string `string` to the output position
   `s`, advancing `s`.  Uses the variables `copy` and `s` of the enclosing
   function.  Wrapped in do { } while (0) so that the macro followed by
   a semicolon is a single statement, which keeps it safe inside
   unbraced if/else bodies. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000541 Py_ssize_t callcount = 0;
542 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000543 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000545 int width = 0;
546 int precision = 0;
547 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 const char* f;
549 Py_UNICODE *s;
550 PyObject *string;
551 /* used by sprintf */
552 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000553 /* use abuffer instead of buffer, if we need more space
554 * (which can happen if there's a format specifier with width). */
555 char *abuffer = NULL;
556 char *realbuffer;
557 Py_ssize_t abuffersize = 0;
558 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559 const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562 Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef __va_copy
565 __va_copy(count, vargs);
566#else
567 count = vargs;
568#endif
569#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 /* step 1: count the number of %S/%R format specifications
571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 ++callcount;
576 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 /* step 2: allocate memory for the results of
578 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 if (callcount) {
580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581 if (!callresults) {
582 PyErr_NoMemory();
583 return NULL;
584 }
585 callresult = callresults;
586 }
587 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 for (f = format; *f; f++) {
589 if (*f == '%') {
590 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000591 width = 0;
592 while (isdigit(Py_CHARMASK(*f)))
593 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595 ;
596
597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598 * they don't affect the amount of space we reserve.
599 */
600 if ((*f == 'l' || *f == 'z') &&
601 (f[1] == 'd' || f[1] == 'u'))
602 ++f;
603
604 switch (*f) {
605 case 'c':
606 (void)va_arg(count, int);
607 /* fall through... */
608 case '%':
609 n++;
610 break;
611 case 'd': case 'u': case 'i': case 'x':
612 (void) va_arg(count, int);
613 /* 20 bytes is enough to hold a 64-bit
614 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000615 This isn't enough for octal.
616 If a width is specified we need more
617 (which we allocate later). */
618 if (width < 20)
619 width = 20;
620 n += width;
621 if (abuffersize < width)
622 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000623 break;
624 case 's':
625 n += strlen(va_arg(count, char*));
626 break;
627 case 'U':
628 {
629 PyObject *obj = va_arg(count, PyObject *);
630 assert(obj && PyUnicode_Check(obj));
631 n += PyUnicode_GET_SIZE(obj);
632 break;
633 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000634 case 'V':
635 {
636 PyObject *obj = va_arg(count, PyObject *);
637 const char *str = va_arg(count, const char *);
638 assert(obj || str);
639 assert(!obj || PyUnicode_Check(obj));
640 if (obj)
641 n += PyUnicode_GET_SIZE(obj);
642 else
643 n += strlen(str);
644 break;
645 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000646 case 'S':
647 {
648 PyObject *obj = va_arg(count, PyObject *);
649 PyObject *str;
650 assert(obj);
651 str = PyObject_Unicode(obj);
652 if (!str)
653 goto fail;
654 n += PyUnicode_GET_SIZE(str);
655 /* Remember the str and switch to the next slot */
656 *callresult++ = str;
657 break;
658 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 case 'R':
660 {
661 PyObject *obj = va_arg(count, PyObject *);
662 PyObject *repr;
663 assert(obj);
664 repr = PyObject_Repr(obj);
665 if (!repr)
666 goto fail;
667 n += PyUnicode_GET_SIZE(repr);
668 /* Remember the repr and switch to the next slot */
669 *callresult++ = repr;
670 break;
671 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000672 case 'p':
673 (void) va_arg(count, int);
674 /* maximum 64-bit pointer representation:
675 * 0xffffffffffffffff
676 * so 19 characters is enough.
677 * XXX I count 18 -- what's the extra for?
678 */
679 n += 19;
680 break;
681 default:
682 /* if we stumble upon an unknown
683 formatting code, copy the rest of
684 the format string to the output
685 string. (we cannot just skip the
686 code, since there's no way to know
687 what's in the argument list) */
688 n += strlen(p);
689 goto expand;
690 }
691 } else
692 n++;
693 }
694 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 if (abuffersize > 20) {
696 abuffer = PyMem_Malloc(abuffersize);
697 if (!abuffer) {
698 PyErr_NoMemory();
699 goto fail;
700 }
701 realbuffer = abuffer;
702 }
703 else
704 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000705 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000707 we don't have to resize the string.
708 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 string = PyUnicode_FromUnicode(NULL, n);
710 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000711 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000714 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f++;
719 int longflag = 0;
720 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 zeropad = (*f == '0');
722 /* parse the width.precision part */
723 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 width = (width*10) + *f++ - '0';
726 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 if (*f == '.') {
728 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000732 /* handle the long flag, but only for %ld and %lu.
733 others can be added when necessary. */
734 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
735 longflag = 1;
736 ++f;
737 }
738 /* handle the size_t flag. */
739 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
740 size_tflag = 1;
741 ++f;
742 }
743
744 switch (*f) {
745 case 'c':
746 *s++ = va_arg(vargs, int);
747 break;
748 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000753 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 sprintf(realbuffer, fmt, va_arg(vargs, int));
756 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 break;
758 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000763 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
766 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 break;
768 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000769 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
770 sprintf(realbuffer, fmt, va_arg(vargs, int));
771 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000772 break;
773 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000774 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
775 sprintf(realbuffer, fmt, va_arg(vargs, int));
776 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000777 break;
778 case 's':
779 p = va_arg(vargs, char*);
780 appendstring(p);
781 break;
782 case 'U':
783 {
784 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000785 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
786 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
787 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 break;
789 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000790 case 'V':
791 {
792 PyObject *obj = va_arg(vargs, PyObject *);
793 const char *str = va_arg(vargs, const char *);
794 if (obj) {
795 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
796 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
797 s += size;
798 } else {
799 appendstring(str);
800 }
801 break;
802 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000803 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000804 case 'R':
805 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000806 Py_UNICODE *ucopy;
807 Py_ssize_t usize;
808 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 /* unused, since we already have the result */
810 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000811 ucopy = PyUnicode_AS_UNICODE(*callresult);
812 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 for (upos = 0; upos<usize;)
814 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000815 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000817 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 ++callresult;
819 break;
820 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821 case 'p':
822 sprintf(buffer, "%p", va_arg(vargs, void*));
823 /* %p is ill-defined: ensure leading 0x. */
824 if (buffer[1] == 'X')
825 buffer[1] = 'x';
826 else if (buffer[1] != 'x') {
827 memmove(buffer+2, buffer, strlen(buffer)+1);
828 buffer[0] = '0';
829 buffer[1] = 'x';
830 }
831 appendstring(buffer);
832 break;
833 case '%':
834 *s++ = '%';
835 break;
836 default:
837 appendstring(p);
838 goto end;
839 }
840 } else
841 *s++ = *f;
842 }
843
844 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000845 if (callresults)
846 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 if (abuffer)
848 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000849 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
850 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 fail:
852 if (callresults) {
853 PyObject **callresult2 = callresults;
854 while (callresult2 <= callresult) {
855 Py_DECREF(*callresult2);
856 ++callresult2;
857 }
858 PyMem_Free(callresults);
859 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 if (abuffer)
861 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000862 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863}
864
865#undef appendstring
866
867PyObject *
868PyUnicode_FromFormat(const char *format, ...)
869{
870 PyObject* ret;
871 va_list vargs;
872
873#ifdef HAVE_STDARG_PROTOTYPES
874 va_start(vargs, format);
875#else
876 va_start(vargs);
877#endif
878 ret = PyUnicode_FromFormatV(format, vargs);
879 va_end(vargs);
880 return ret;
881}
882
Martin v. Löwis18e16552006-02-15 17:27:45 +0000883Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
884 wchar_t *w,
885 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886{
887 if (unicode == NULL) {
888 PyErr_BadInternalCall();
889 return -1;
890 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891
892 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000894 size = PyUnicode_GET_SIZE(unicode) + 1;
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896#ifdef HAVE_USABLE_WCHAR_T
897 memcpy(w, unicode->str, size * sizeof(wchar_t));
898#else
899 {
900 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000901 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000903 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 *w++ = *u++;
905 }
906#endif
907
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000908 if (size > PyUnicode_GET_SIZE(unicode))
909 return PyUnicode_GET_SIZE(unicode);
910 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 return size;
912}
913
914#endif
915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916PyObject *PyUnicode_FromOrdinal(int ordinal)
917{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000918 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919
920#ifdef Py_UNICODE_WIDE
921 if (ordinal < 0 || ordinal > 0x10ffff) {
922 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000923 "chr() arg not in range(0x110000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000924 "(wide Python build)");
925 return NULL;
926 }
927#else
928 if (ordinal < 0 || ordinal > 0xffff) {
929 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000930 "chr() arg not in range(0x10000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000931 "(narrow Python build)");
932 return NULL;
933 }
934#endif
935
Hye-Shik Chang40574832004-04-06 07:24:51 +0000936 s[0] = (Py_UNICODE)ordinal;
937 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000938}
939
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940PyObject *PyUnicode_FromObject(register PyObject *obj)
941{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000942 /* XXX Perhaps we should make this API an alias of
943 PyObject_Unicode() instead ?! */
944 if (PyUnicode_CheckExact(obj)) {
945 Py_INCREF(obj);
946 return obj;
947 }
948 if (PyUnicode_Check(obj)) {
949 /* For a Unicode subtype that's not a Unicode object,
950 return a true Unicode object with the same data. */
951 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
952 PyUnicode_GET_SIZE(obj));
953 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000954 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
955}
956
957PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
958 const char *encoding,
959 const char *errors)
960{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000961 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000962 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000963 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000964
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965 if (obj == NULL) {
966 PyErr_BadInternalCall();
967 return NULL;
968 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000969
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000970#if 0
971 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000972 that no encodings is given and then redirect to
973 PyObject_Unicode() which then applies the additional logic for
974 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000975
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000976 NOTE: This API should really only be used for object which
977 represent *encoded* Unicode !
978
979 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000980 if (PyUnicode_Check(obj)) {
981 if (encoding) {
982 PyErr_SetString(PyExc_TypeError,
983 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000985 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000986 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000987 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000988#else
989 if (PyUnicode_Check(obj)) {
990 PyErr_SetString(PyExc_TypeError,
991 "decoding Unicode is not supported");
992 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000993 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000994#endif
995
996 /* Coerce object */
997 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000998 s = PyString_AS_STRING(obj);
999 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001000 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001001 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1002 /* Overwrite the error message with something more useful in
1003 case of a TypeError. */
1004 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001005 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001006 "coercing to Unicode: need string or buffer, "
1007 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001008 obj->ob_type->tp_name);
1009 goto onError;
1010 }
Tim Petersced69f82003-09-16 20:30:58 +00001011
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001012 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013 if (len == 0) {
1014 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001015 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016 }
Tim Petersced69f82003-09-16 20:30:58 +00001017 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001019
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001020 return v;
1021
1022 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024}
1025
1026PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001027 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 const char *encoding,
1029 const char *errors)
1030{
1031 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001032
1033 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001034 encoding = PyUnicode_GetDefaultEncoding();
1035
1036 /* Shortcuts for common default encodings */
1037 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001039 else if (strcmp(encoding, "latin-1") == 0)
1040 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001041#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1042 else if (strcmp(encoding, "mbcs") == 0)
1043 return PyUnicode_DecodeMBCS(s, size, errors);
1044#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001045 else if (strcmp(encoding, "ascii") == 0)
1046 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047
1048 /* Decode via the codec registry */
1049 buffer = PyBuffer_FromMemory((void *)s, size);
1050 if (buffer == NULL)
1051 goto onError;
1052 unicode = PyCodec_Decode(buffer, encoding, errors);
1053 if (unicode == NULL)
1054 goto onError;
1055 if (!PyUnicode_Check(unicode)) {
1056 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001057 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 unicode->ob_type->tp_name);
1059 Py_DECREF(unicode);
1060 goto onError;
1061 }
1062 Py_DECREF(buffer);
1063 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001064
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065 onError:
1066 Py_XDECREF(buffer);
1067 return NULL;
1068}
1069
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001070PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1071 const char *encoding,
1072 const char *errors)
1073{
1074 PyObject *v;
1075
1076 if (!PyUnicode_Check(unicode)) {
1077 PyErr_BadArgument();
1078 goto onError;
1079 }
1080
1081 if (encoding == NULL)
1082 encoding = PyUnicode_GetDefaultEncoding();
1083
1084 /* Decode via the codec registry */
1085 v = PyCodec_Decode(unicode, encoding, errors);
1086 if (v == NULL)
1087 goto onError;
1088 return v;
1089
1090 onError:
1091 return NULL;
1092}
1093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001095 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 const char *encoding,
1097 const char *errors)
1098{
1099 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001100
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 unicode = PyUnicode_FromUnicode(s, size);
1102 if (unicode == NULL)
1103 return NULL;
1104 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1105 Py_DECREF(unicode);
1106 return v;
1107}
1108
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001109PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1110 const char *encoding,
1111 const char *errors)
1112{
1113 PyObject *v;
1114
1115 if (!PyUnicode_Check(unicode)) {
1116 PyErr_BadArgument();
1117 goto onError;
1118 }
1119
1120 if (encoding == NULL)
1121 encoding = PyUnicode_GetDefaultEncoding();
1122
1123 /* Encode via the codec registry */
1124 v = PyCodec_Encode(unicode, encoding, errors);
1125 if (v == NULL)
1126 goto onError;
1127 return v;
1128
1129 onError:
1130 return NULL;
1131}
1132
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1134 const char *encoding,
1135 const char *errors)
1136{
1137 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001138
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139 if (!PyUnicode_Check(unicode)) {
1140 PyErr_BadArgument();
1141 goto onError;
1142 }
Fred Drakee4315f52000-05-09 19:53:39 +00001143
Tim Petersced69f82003-09-16 20:30:58 +00001144 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001145 encoding = PyUnicode_GetDefaultEncoding();
1146
1147 /* Shortcuts for common default encodings */
1148 if (errors == NULL) {
1149 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001150 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001151 else if (strcmp(encoding, "latin-1") == 0)
1152 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001153#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1154 else if (strcmp(encoding, "mbcs") == 0)
1155 return PyUnicode_AsMBCSString(unicode);
1156#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001157 else if (strcmp(encoding, "ascii") == 0)
1158 return PyUnicode_AsASCIIString(unicode);
1159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160
1161 /* Encode via the codec registry */
1162 v = PyCodec_Encode(unicode, encoding, errors);
1163 if (v == NULL)
1164 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001165 if (!PyBytes_Check(v)) {
1166 if (PyString_Check(v)) {
1167 /* Old codec, turn it into bytes */
1168 PyObject *b = PyBytes_FromObject(v);
1169 Py_DECREF(v);
1170 return b;
1171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001173 "encoder did not return a bytes object "
1174 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1175 v->ob_type->tp_name,
1176 encoding ? encoding : "NULL",
1177 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 Py_DECREF(v);
1179 goto onError;
1180 }
1181 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 onError:
1184 return NULL;
1185}
1186
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001187PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1188 const char *errors)
1189{
1190 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001191 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001192 if (v)
1193 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001194 if (errors != NULL)
1195 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1196 if (errors == NULL) {
1197 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1198 PyUnicode_GET_SIZE(unicode),
1199 NULL);
1200 }
1201 else {
1202 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1203 }
1204 if (!b)
1205 return NULL;
1206 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1207 PyBytes_Size(b));
1208 Py_DECREF(b);
1209 if (!errors) {
1210 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001211 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001212 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001213 return v;
1214}
1215
Martin v. Löwis5b222132007-06-10 09:51:05 +00001216char*
1217PyUnicode_AsString(PyObject *unicode)
1218{
1219 assert(PyUnicode_Check(unicode));
1220 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1221 if (!unicode)
1222 return NULL;
1223 return PyString_AsString(unicode);
1224}
1225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1227{
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232 return PyUnicode_AS_UNICODE(unicode);
1233
1234 onError:
1235 return NULL;
1236}
1237
Martin v. Löwis18e16552006-02-15 17:27:45 +00001238Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239{
1240 if (!PyUnicode_Check(unicode)) {
1241 PyErr_BadArgument();
1242 goto onError;
1243 }
1244 return PyUnicode_GET_SIZE(unicode);
1245
1246 onError:
1247 return -1;
1248}
1249
Thomas Wouters78890102000-07-22 19:25:51 +00001250const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001251{
1252 return unicode_default_encoding;
1253}
1254
1255int PyUnicode_SetDefaultEncoding(const char *encoding)
1256{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001257 if (strcmp(encoding, unicode_default_encoding) != 0) {
1258 PyErr_Format(PyExc_ValueError,
1259 "Can only set default encoding to %s",
1260 unicode_default_encoding);
1261 return -1;
1262 }
Fred Drakee4315f52000-05-09 19:53:39 +00001263 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001264}
1265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001266/* error handling callback helper:
1267 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001268 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 and adjust various state variables.
1270 return 0 on success, -1 on error
1271*/
1272
1273static
1274int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1275 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1277 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001279 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280
1281 PyObject *restuple = NULL;
1282 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001283 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1284 Py_ssize_t requiredsize;
1285 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001287 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 int res = -1;
1289
1290 if (*errorHandler == NULL) {
1291 *errorHandler = PyCodec_LookupError(errors);
1292 if (*errorHandler == NULL)
1293 goto onError;
1294 }
1295
1296 if (*exceptionObject == NULL) {
1297 *exceptionObject = PyUnicodeDecodeError_Create(
1298 encoding, input, insize, *startinpos, *endinpos, reason);
1299 if (*exceptionObject == NULL)
1300 goto onError;
1301 }
1302 else {
1303 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1304 goto onError;
1305 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1306 goto onError;
1307 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1308 goto onError;
1309 }
1310
1311 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1312 if (restuple == NULL)
1313 goto onError;
1314 if (!PyTuple_Check(restuple)) {
1315 PyErr_Format(PyExc_TypeError, &argparse[4]);
1316 goto onError;
1317 }
1318 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1319 goto onError;
1320 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001321 newpos = insize+newpos;
1322 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001323 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001324 goto onError;
1325 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326
1327 /* need more space? (at least enough for what we
1328 have+the replacement+the rest of the string (starting
1329 at the new input position), so we won't have to check space
1330 when there are no errors in the rest of the string) */
1331 repptr = PyUnicode_AS_UNICODE(repunicode);
1332 repsize = PyUnicode_GET_SIZE(repunicode);
1333 requiredsize = *outpos + repsize + insize-newpos;
1334 if (requiredsize > outsize) {
1335 if (requiredsize<2*outsize)
1336 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001337 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338 goto onError;
1339 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1340 }
1341 *endinpos = newpos;
1342 *inptr = input + newpos;
1343 Py_UNICODE_COPY(*outptr, repptr, repsize);
1344 *outptr += repsize;
1345 *outpos += repsize;
1346 /* we made it! */
1347 res = 0;
1348
1349 onError:
1350 Py_XDECREF(restuple);
1351 return res;
1352}
1353
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001354/* --- UTF-7 Codec -------------------------------------------------------- */
1355
1356/* see RFC2152 for details */
1357
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character code.  A character that is "special" cannot be
   directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1376
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be written
   directly in UTF-7, given whether Set O characters (encodeO) and
   whitespace (encodeWS) are to be encoded as well.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 ||                           \
     utf7_special[(c)] == 1 ||                          \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of the base64 character c (inverse of B64). */
#define UB64(c)                                         \
    ((c) == '+' ? 62 :                                  \
     (c) == '/' ? 63 :                                  \
     (c) >= 'a' ? (c) - 71 :                            \
     (c) >= 'A' ? (c) - 65 :                            \
     (c) + 4 )

/* ENCODE(dst, acc, nbits): while the accumulator acc holds at least six
   pending bits, emit the topmost six as one base64 character through dst
   and reduce nbits accordingly.  Expands to a bare while loop. */
#define ENCODE(dst, acc, nbits)                         \
    while (nbits >= 6) {                                \
        *dst++ = B64(acc >> (nbits-6));                 \
        nbits -= 6;                                     \
    }

/* DECODE(dst, acc, nbits, surrogate): while the accumulator acc holds at
   least sixteen pending bits, extract one 16-bit unit and store it
   through dst.  Relies on an `errmsg` variable and a `utf7Error` label
   being available at the point of expansion. */
#define DECODE(dst, acc, nbits, surrogate)                                  \
    while (nbits >= 16) {                                                   \
        Py_UNICODE outCh = (Py_UNICODE) ((acc >> (nbits-16)) & 0xffff);     \
        nbits -= 16;                                                        \
        if (surrogate) {                                                    \
            /* We have already generated an error for the high surrogate    \
               so let's not bother seeing if the low surrogate is correct   \
               or not */                                                    \
            surrogate = 0;                                                  \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            /* This is a surrogate pair. Unfortunately we can't represent   \
               it in a 16-bit character */                                  \
            surrogate = 1;                                                  \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *dst++ = outCh;                                                 \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001419
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001422 const char *errors)
1423{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001424 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001425 Py_ssize_t startinpos;
1426 Py_ssize_t endinpos;
1427 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001428 const char *e;
1429 PyUnicodeObject *unicode;
1430 Py_UNICODE *p;
1431 const char *errmsg = "";
1432 int inShift = 0;
1433 unsigned int bitsleft = 0;
1434 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 int surrogate = 0;
1436 PyObject *errorHandler = NULL;
1437 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001438
1439 unicode = _PyUnicode_New(size);
1440 if (!unicode)
1441 return NULL;
1442 if (size == 0)
1443 return (PyObject *)unicode;
1444
1445 p = unicode->str;
1446 e = s + size;
1447
1448 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 Py_UNICODE ch;
1450 restart:
1451 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001452
1453 if (inShift) {
1454 if ((ch == '-') || !B64CHAR(ch)) {
1455 inShift = 0;
1456 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001457
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001458 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1459 if (bitsleft >= 6) {
1460 /* The shift sequence has a partial character in it. If
1461 bitsleft < 6 then we could just classify it as padding
1462 but that is not the case here */
1463
1464 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001465 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001466 }
1467 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001468 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001469 here so indicate the potential of a misencoded character. */
1470
1471 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1472 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1473 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001474 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001475 }
1476
1477 if (ch == '-') {
1478 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001479 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480 inShift = 1;
1481 }
1482 } else if (SPECIAL(ch,0,0)) {
1483 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001484 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001485 } else {
1486 *p++ = ch;
1487 }
1488 } else {
1489 charsleft = (charsleft << 6) | UB64(ch);
1490 bitsleft += 6;
1491 s++;
1492 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1493 }
1494 }
1495 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001497 s++;
1498 if (s < e && *s == '-') {
1499 s++;
1500 *p++ = '+';
1501 } else
1502 {
1503 inShift = 1;
1504 bitsleft = 0;
1505 }
1506 }
1507 else if (SPECIAL(ch,0,0)) {
1508 errmsg = "unexpected special character";
1509 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001510 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001511 }
1512 else {
1513 *p++ = ch;
1514 s++;
1515 }
1516 continue;
1517 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001518 outpos = p-PyUnicode_AS_UNICODE(unicode);
1519 endinpos = s-starts;
1520 if (unicode_decode_call_errorhandler(
1521 errors, &errorHandler,
1522 "utf7", errmsg,
1523 starts, size, &startinpos, &endinpos, &exc, &s,
1524 (PyObject **)&unicode, &outpos, &p))
1525 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 }
1527
1528 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 outpos = p-PyUnicode_AS_UNICODE(unicode);
1530 endinpos = size;
1531 if (unicode_decode_call_errorhandler(
1532 errors, &errorHandler,
1533 "utf7", "unterminated shift sequence",
1534 starts, size, &startinpos, &endinpos, &exc, &s,
1535 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001537 if (s < e)
1538 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001539 }
1540
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001541 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001542 goto onError;
1543
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 Py_XDECREF(errorHandler);
1545 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546 return (PyObject *)unicode;
1547
1548onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 Py_XDECREF(errorHandler);
1550 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 Py_DECREF(unicode);
1552 return NULL;
1553}
1554
1555
1556PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001557 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 int encodeSetO,
1559 int encodeWhiteSpace,
1560 const char *errors)
1561{
1562 PyObject *v;
1563 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001564 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001566 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567 unsigned int bitsleft = 0;
1568 unsigned long charsleft = 0;
1569 char * out;
1570 char * start;
1571
1572 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001573 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001574
Walter Dörwald51ab4142007-05-05 14:43:36 +00001575 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 if (v == NULL)
1577 return NULL;
1578
Walter Dörwald51ab4142007-05-05 14:43:36 +00001579 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580 for (;i < size; ++i) {
1581 Py_UNICODE ch = s[i];
1582
1583 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001584 if (ch == '+') {
1585 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001586 *out++ = '-';
1587 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1588 charsleft = ch;
1589 bitsleft = 16;
1590 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001591 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001593 } else {
1594 *out++ = (char) ch;
1595 }
1596 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1598 *out++ = B64(charsleft << (6-bitsleft));
1599 charsleft = 0;
1600 bitsleft = 0;
1601 /* Characters not in the BASE64 set implicitly unshift the sequence
1602 so no '-' is required, except if the character is itself a '-' */
1603 if (B64CHAR(ch) || ch == '-') {
1604 *out++ = '-';
1605 }
1606 inShift = 0;
1607 *out++ = (char) ch;
1608 } else {
1609 bitsleft += 16;
1610 charsleft = (charsleft << 16) | ch;
1611 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1612
1613 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001614 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001615 or '-' then the shift sequence will be terminated implicitly and we
1616 don't have to insert a '-'. */
1617
1618 if (bitsleft == 0) {
1619 if (i + 1 < size) {
1620 Py_UNICODE ch2 = s[i+1];
1621
1622 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001623
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624 } else if (B64CHAR(ch2) || ch2 == '-') {
1625 *out++ = '-';
1626 inShift = 0;
1627 } else {
1628 inShift = 0;
1629 }
1630
1631 }
1632 else {
1633 *out++ = '-';
1634 inShift = 0;
1635 }
1636 }
Tim Petersced69f82003-09-16 20:30:58 +00001637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001639 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001640 if (bitsleft) {
1641 *out++= B64(charsleft << (6-bitsleft) );
1642 *out++ = '-';
1643 }
1644
Walter Dörwald51ab4142007-05-05 14:43:36 +00001645 if (PyBytes_Resize(v, out - start)) {
1646 Py_DECREF(v);
1647 return NULL;
1648 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 return v;
1650}
1651
/* Drop the helper macros used by the UTF-7 encoder above so the names do
   not leak into the rest of the file.  NOTE(review): UB64 and DECODE are
   presumably the decoder-side counterparts -- their uses are not visible
   here. */
1652#undef SPECIAL
1653#undef B64
1654#undef B64CHAR
1655#undef UB64
1656#undef ENCODE
1657#undef DECODE
1658
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659/* --- UTF-8 Codec -------------------------------------------------------- */
1660
/* Sequence length implied by a UTF-8 lead byte, indexed by the byte value.
   A zero entry means the byte is not a legal prefix.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1682
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001684 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 const char *errors)
1686{
Walter Dörwald69652032004-09-07 20:24:22 +00001687 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1688}
1689
1690PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001692 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001693 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001694{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001695 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001697 Py_ssize_t startinpos;
1698 Py_ssize_t endinpos;
1699 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 const char *e;
1701 PyUnicodeObject *unicode;
1702 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001703 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 PyObject *errorHandler = NULL;
1705 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706
1707 /* Note: size will always be longer than the resulting Unicode
1708 character count */
1709 unicode = _PyUnicode_New(size);
1710 if (!unicode)
1711 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001712 if (size == 0) {
1713 if (consumed)
1714 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717
1718 /* Unpack UTF-8 encoded data */
1719 p = unicode->str;
1720 e = s + size;
1721
1722 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001723 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
1725 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001726 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 s++;
1728 continue;
1729 }
1730
1731 n = utf8_code_length[ch];
1732
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001733 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001734 if (consumed)
1735 break;
1736 else {
1737 errmsg = "unexpected end of data";
1738 startinpos = s-starts;
1739 endinpos = size;
1740 goto utf8Error;
1741 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743
1744 switch (n) {
1745
1746 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001747 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 startinpos = s-starts;
1749 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001750 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751
1752 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001753 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 startinpos = s-starts;
1755 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001756 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757
1758 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 if ((s[1] & 0xc0) != 0x80) {
1760 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 startinpos = s-starts;
1762 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001763 goto utf8Error;
1764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001766 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001767 startinpos = s-starts;
1768 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001769 errmsg = "illegal encoding";
1770 goto utf8Error;
1771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001773 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 break;
1775
1776 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001777 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 (s[2] & 0xc0) != 0x80) {
1779 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 startinpos = s-starts;
1781 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001782 goto utf8Error;
1783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001785 if (ch < 0x0800) {
1786 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001787 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001788
1789 XXX For wide builds (UCS-4) we should probably try
1790 to recombine the surrogates into a single code
1791 unit.
1792 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001793 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 startinpos = s-starts;
1795 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001796 goto utf8Error;
1797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001799 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001800 break;
1801
1802 case 4:
1803 if ((s[1] & 0xc0) != 0x80 ||
1804 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001805 (s[3] & 0xc0) != 0x80) {
1806 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001807 startinpos = s-starts;
1808 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001809 goto utf8Error;
1810 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001811 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1812 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1813 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001815 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001816 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001817 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001818 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001819 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 startinpos = s-starts;
1821 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001822 goto utf8Error;
1823 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001824#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001825 *p++ = (Py_UNICODE)ch;
1826#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001827 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001828
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001829 /* translate from 10000..10FFFF to 0..FFFF */
1830 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001831
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001832 /* high surrogate = top 10 bits added to D800 */
1833 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001834
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001835 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001836 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001837#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 break;
1839
1840 default:
1841 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001842 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 startinpos = s-starts;
1844 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001845 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
1847 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001849
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001851 outpos = p-PyUnicode_AS_UNICODE(unicode);
1852 if (unicode_decode_call_errorhandler(
1853 errors, &errorHandler,
1854 "utf8", errmsg,
1855 starts, size, &startinpos, &endinpos, &exc, &s,
1856 (PyObject **)&unicode, &outpos, &p))
1857 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 }
Walter Dörwald69652032004-09-07 20:24:22 +00001859 if (consumed)
1860 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861
1862 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001863 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 goto onError;
1865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866 Py_XDECREF(errorHandler);
1867 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 return (PyObject *)unicode;
1869
1870onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 Py_XDECREF(errorHandler);
1872 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 Py_DECREF(unicode);
1874 return NULL;
1875}
1876
Tim Peters602f7402002-04-27 18:03:26 +00001877/* Allocation strategy: if the string is short, convert into a stack buffer
1878 and allocate exactly as much space needed at the end. Else allocate the
1879 maximum possible needed (4 result bytes per Unicode character), and return
1880 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001881*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001882PyObject *
1883PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001884 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001885 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886{
Tim Peters602f7402002-04-27 18:03:26 +00001887#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001888
Martin v. Löwis18e16552006-02-15 17:27:45 +00001889 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001890 PyObject *v; /* result string object */
1891 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001892 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001893 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001894 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001895
Tim Peters602f7402002-04-27 18:03:26 +00001896 assert(s != NULL);
1897 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898
Tim Peters602f7402002-04-27 18:03:26 +00001899 if (size <= MAX_SHORT_UNICHARS) {
1900 /* Write into the stack buffer; nallocated can't overflow.
1901 * At the end, we'll allocate exactly as much heap space as it
1902 * turns out we need.
1903 */
1904 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1905 v = NULL; /* will allocate after we're done */
1906 p = stackbuf;
1907 }
1908 else {
1909 /* Overallocate on the heap, and give the excess back at the end. */
1910 nallocated = size * 4;
1911 if (nallocated / 4 != size) /* overflow! */
1912 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001913 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001914 if (v == NULL)
1915 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001916 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001917 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001918
Tim Peters602f7402002-04-27 18:03:26 +00001919 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001920 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001921
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001922 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001923 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001925
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001927 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001928 *p++ = (char)(0xc0 | (ch >> 6));
1929 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001930 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001931 else {
Tim Peters602f7402002-04-27 18:03:26 +00001932 /* Encode UCS2 Unicode ordinals */
1933 if (ch < 0x10000) {
1934 /* Special case: check for high surrogate */
1935 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1936 Py_UCS4 ch2 = s[i];
1937 /* Check for low surrogate and combine the two to
1938 form a UCS4 value */
1939 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001940 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001941 i++;
1942 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001943 }
Tim Peters602f7402002-04-27 18:03:26 +00001944 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001945 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001946 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001947 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1948 *p++ = (char)(0x80 | (ch & 0x3f));
1949 continue;
1950 }
1951encodeUCS4:
1952 /* Encode UCS4 Unicode ordinals */
1953 *p++ = (char)(0xf0 | (ch >> 18));
1954 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1955 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1956 *p++ = (char)(0x80 | (ch & 0x3f));
1957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001959
Tim Peters602f7402002-04-27 18:03:26 +00001960 if (v == NULL) {
1961 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001962 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001963 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001964 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001965 }
1966 else {
1967 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001968 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001969 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001970 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001973
Tim Peters602f7402002-04-27 18:03:26 +00001974#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975}
1976
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1978{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 if (!PyUnicode_Check(unicode)) {
1980 PyErr_BadArgument();
1981 return NULL;
1982 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001983 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1984 PyUnicode_GET_SIZE(unicode),
1985 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986}
1987
1988/* --- UTF-16 Codec ------------------------------------------------------- */
1989
Tim Peters772747b2001-08-09 22:21:55 +00001990PyObject *
1991PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001992 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001993 const char *errors,
1994 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995{
Walter Dörwald69652032004-09-07 20:24:22 +00001996 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1997}
1998
1999PyObject *
2000PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002001 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002002 const char *errors,
2003 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002004 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002005{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002007 Py_ssize_t startinpos;
2008 Py_ssize_t endinpos;
2009 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 PyUnicodeObject *unicode;
2011 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002012 const unsigned char *q, *e;
2013 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002014 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002015 /* Offsets from q for retrieving byte pairs in the right order. */
2016#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2017 int ihi = 1, ilo = 0;
2018#else
2019 int ihi = 0, ilo = 1;
2020#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002021 PyObject *errorHandler = NULL;
2022 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023
2024 /* Note: size will always be longer than the resulting Unicode
2025 character count */
2026 unicode = _PyUnicode_New(size);
2027 if (!unicode)
2028 return NULL;
2029 if (size == 0)
2030 return (PyObject *)unicode;
2031
2032 /* Unpack UTF-16 encoded data */
2033 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002034 q = (unsigned char *)s;
2035 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036
2037 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002038 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002040 /* Check for BOM marks (U+FEFF) in the input and adjust current
2041 byte order setting accordingly. In native mode, the leading BOM
2042 mark is skipped, in all other modes, it is copied to the output
2043 stream as-is (giving a ZWNBSP character). */
2044 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002045 if (size >= 2) {
2046 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002047#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002048 if (bom == 0xFEFF) {
2049 q += 2;
2050 bo = -1;
2051 }
2052 else if (bom == 0xFFFE) {
2053 q += 2;
2054 bo = 1;
2055 }
Tim Petersced69f82003-09-16 20:30:58 +00002056#else
Walter Dörwald69652032004-09-07 20:24:22 +00002057 if (bom == 0xFEFF) {
2058 q += 2;
2059 bo = 1;
2060 }
2061 else if (bom == 0xFFFE) {
2062 q += 2;
2063 bo = -1;
2064 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002065#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002066 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068
Tim Peters772747b2001-08-09 22:21:55 +00002069 if (bo == -1) {
2070 /* force LE */
2071 ihi = 1;
2072 ilo = 0;
2073 }
2074 else if (bo == 1) {
2075 /* force BE */
2076 ihi = 0;
2077 ilo = 1;
2078 }
2079
2080 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002082 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002084 if (consumed)
2085 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 errmsg = "truncated data";
2087 startinpos = ((const char *)q)-starts;
2088 endinpos = ((const char *)e)-starts;
2089 goto utf16Error;
2090 /* The remaining input chars are ignored if the callback
2091 chooses to skip the input */
2092 }
2093 ch = (q[ihi] << 8) | q[ilo];
2094
Tim Peters772747b2001-08-09 22:21:55 +00002095 q += 2;
2096
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 if (ch < 0xD800 || ch > 0xDFFF) {
2098 *p++ = ch;
2099 continue;
2100 }
2101
2102 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002103 if (q >= e) {
2104 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002105 startinpos = (((const char *)q)-2)-starts;
2106 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002107 goto utf16Error;
2108 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002109 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002110 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2111 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002112 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002113#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002114 *p++ = ch;
2115 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116#else
2117 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002118#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002119 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002120 }
2121 else {
2122 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002123 startinpos = (((const char *)q)-4)-starts;
2124 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002125 goto utf16Error;
2126 }
2127
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002129 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 startinpos = (((const char *)q)-2)-starts;
2131 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002132 /* Fall through to report the error */
2133
2134 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 outpos = p-PyUnicode_AS_UNICODE(unicode);
2136 if (unicode_decode_call_errorhandler(
2137 errors, &errorHandler,
2138 "utf16", errmsg,
2139 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2140 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002141 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 }
2143
2144 if (byteorder)
2145 *byteorder = bo;
2146
Walter Dörwald69652032004-09-07 20:24:22 +00002147 if (consumed)
2148 *consumed = (const char *)q-starts;
2149
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002151 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 goto onError;
2153
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002154 Py_XDECREF(errorHandler);
2155 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 return (PyObject *)unicode;
2157
2158onError:
2159 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002160 Py_XDECREF(errorHandler);
2161 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 return NULL;
2163}
2164
Tim Peters772747b2001-08-09 22:21:55 +00002165PyObject *
2166PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002167 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002168 const char *errors,
2169 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170{
2171 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002172 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002173#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002174 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002175#else
2176 const int pairs = 0;
2177#endif
Tim Peters772747b2001-08-09 22:21:55 +00002178 /* Offsets from p for storing byte pairs in the right order. */
2179#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2180 int ihi = 1, ilo = 0;
2181#else
2182 int ihi = 0, ilo = 1;
2183#endif
2184
2185#define STORECHAR(CH) \
2186 do { \
2187 p[ihi] = ((CH) >> 8) & 0xff; \
2188 p[ilo] = (CH) & 0xff; \
2189 p += 2; \
2190 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002192#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002193 for (i = pairs = 0; i < size; i++)
2194 if (s[i] >= 0x10000)
2195 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002196#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002197 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002198 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 if (v == NULL)
2200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201
Walter Dörwald3cc34522007-05-04 10:48:27 +00002202 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002204 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002205 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002206 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002207
2208 if (byteorder == -1) {
2209 /* force LE */
2210 ihi = 1;
2211 ilo = 0;
2212 }
2213 else if (byteorder == 1) {
2214 /* force BE */
2215 ihi = 0;
2216 ilo = 1;
2217 }
2218
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002219 while (size-- > 0) {
2220 Py_UNICODE ch = *s++;
2221 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002222#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002223 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002224 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2225 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002227#endif
Tim Peters772747b2001-08-09 22:21:55 +00002228 STORECHAR(ch);
2229 if (ch2)
2230 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002233#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234}
2235
2236PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2237{
2238 if (!PyUnicode_Check(unicode)) {
2239 PyErr_BadArgument();
2240 return NULL;
2241 }
2242 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2243 PyUnicode_GET_SIZE(unicode),
2244 NULL,
2245 0);
2246}
2247
2248/* --- Unicode Escape Codec ----------------------------------------------- */
2249
/* File-local pointer to a _PyUnicode_Name_CAPI table; it starts out NULL.
   NOTE(review): presumably filled in on demand by the escape decoder that
   follows -- confirm where it is assigned. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002250static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002251
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002253 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 const char *errors)
2255{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002257 Py_ssize_t startinpos;
2258 Py_ssize_t endinpos;
2259 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002260 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002264 char* message;
2265 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 PyObject *errorHandler = NULL;
2267 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002268
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 /* Escaped strings will always be longer than the resulting
2270 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 length after conversion to the true value.
2272 (but if the error callback returns a long replacement string
2273 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 v = _PyUnicode_New(size);
2275 if (v == NULL)
2276 goto onError;
2277 if (size == 0)
2278 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002279
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002280 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002282
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 while (s < end) {
2284 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002285 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002286 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287
2288 /* Non-escape characters are interpreted as Unicode ordinals */
2289 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002290 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 continue;
2292 }
2293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 /* \ - Escapes */
2296 s++;
2297 switch (*s++) {
2298
2299 /* \x escapes */
2300 case '\n': break;
2301 case '\\': *p++ = '\\'; break;
2302 case '\'': *p++ = '\''; break;
2303 case '\"': *p++ = '\"'; break;
2304 case 'b': *p++ = '\b'; break;
2305 case 'f': *p++ = '\014'; break; /* FF */
2306 case 't': *p++ = '\t'; break;
2307 case 'n': *p++ = '\n'; break;
2308 case 'r': *p++ = '\r'; break;
2309 case 'v': *p++ = '\013'; break; /* VT */
2310 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2311
2312 /* \OOO (octal) escapes */
2313 case '0': case '1': case '2': case '3':
2314 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002315 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002317 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002319 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002321 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 break;
2323
Fredrik Lundhccc74732001-02-18 22:13:49 +00002324 /* hex escapes */
2325 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002327 digits = 2;
2328 message = "truncated \\xXX escape";
2329 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330
Fredrik Lundhccc74732001-02-18 22:13:49 +00002331 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002332 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002333 digits = 4;
2334 message = "truncated \\uXXXX escape";
2335 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336
Fredrik Lundhccc74732001-02-18 22:13:49 +00002337 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002338 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002339 digits = 8;
2340 message = "truncated \\UXXXXXXXX escape";
2341 hexescape:
2342 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002343 outpos = p-PyUnicode_AS_UNICODE(v);
2344 if (s+digits>end) {
2345 endinpos = size;
2346 if (unicode_decode_call_errorhandler(
2347 errors, &errorHandler,
2348 "unicodeescape", "end of string in escape sequence",
2349 starts, size, &startinpos, &endinpos, &exc, &s,
2350 (PyObject **)&v, &outpos, &p))
2351 goto onError;
2352 goto nextByte;
2353 }
2354 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002355 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002356 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002357 endinpos = (s+i+1)-starts;
2358 if (unicode_decode_call_errorhandler(
2359 errors, &errorHandler,
2360 "unicodeescape", message,
2361 starts, size, &startinpos, &endinpos, &exc, &s,
2362 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002363 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002364 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002365 }
2366 chr = (chr<<4) & ~0xF;
2367 if (c >= '0' && c <= '9')
2368 chr += c - '0';
2369 else if (c >= 'a' && c <= 'f')
2370 chr += 10 + c - 'a';
2371 else
2372 chr += 10 + c - 'A';
2373 }
2374 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002375 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002376 /* _decoding_error will have already written into the
2377 target buffer. */
2378 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002379 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002380 /* when we get here, chr is a 32-bit unicode character */
2381 if (chr <= 0xffff)
2382 /* UCS-2 character */
2383 *p++ = (Py_UNICODE) chr;
2384 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002385 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002386 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002387#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002388 *p++ = chr;
2389#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002390 chr -= 0x10000L;
2391 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002392 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002393#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002394 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002395 endinpos = s-starts;
2396 outpos = p-PyUnicode_AS_UNICODE(v);
2397 if (unicode_decode_call_errorhandler(
2398 errors, &errorHandler,
2399 "unicodeescape", "illegal Unicode character",
2400 starts, size, &startinpos, &endinpos, &exc, &s,
2401 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002402 goto onError;
2403 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002404 break;
2405
2406 /* \N{name} */
2407 case 'N':
2408 message = "malformed \\N character escape";
2409 if (ucnhash_CAPI == NULL) {
2410 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002411 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002412 m = PyImport_ImportModule("unicodedata");
2413 if (m == NULL)
2414 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002415 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002416 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002417 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002418 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002419 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002420 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002421 if (ucnhash_CAPI == NULL)
2422 goto ucnhashError;
2423 }
2424 if (*s == '{') {
2425 const char *start = s+1;
2426 /* look for the closing brace */
2427 while (*s != '}' && s < end)
2428 s++;
2429 if (s > start && s < end && *s == '}') {
2430 /* found a name. look it up in the unicode database */
2431 message = "unknown Unicode character name";
2432 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002433 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002434 goto store;
2435 }
2436 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002437 endinpos = s-starts;
2438 outpos = p-PyUnicode_AS_UNICODE(v);
2439 if (unicode_decode_call_errorhandler(
2440 errors, &errorHandler,
2441 "unicodeescape", message,
2442 starts, size, &startinpos, &endinpos, &exc, &s,
2443 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002444 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002445 break;
2446
2447 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002448 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002449 message = "\\ at end of string";
2450 s--;
2451 endinpos = s-starts;
2452 outpos = p-PyUnicode_AS_UNICODE(v);
2453 if (unicode_decode_call_errorhandler(
2454 errors, &errorHandler,
2455 "unicodeescape", message,
2456 starts, size, &startinpos, &endinpos, &exc, &s,
2457 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002458 goto onError;
2459 }
2460 else {
2461 *p++ = '\\';
2462 *p++ = (unsigned char)s[-1];
2463 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002464 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 nextByte:
2467 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002469 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002471 Py_XDECREF(errorHandler);
2472 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002474
Fredrik Lundhccc74732001-02-18 22:13:49 +00002475ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002476 PyErr_SetString(
2477 PyExc_UnicodeError,
2478 "\\N escapes not supported (can't load unicodedata module)"
2479 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002480 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 Py_XDECREF(errorHandler);
2482 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002483 return NULL;
2484
Fredrik Lundhccc74732001-02-18 22:13:49 +00002485onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 Py_XDECREF(errorHandler);
2488 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 return NULL;
2490}
2491
2492/* Return a Unicode-Escape string version of the Unicode object.
2493
2494 If quotes is true, the string is enclosed in u"" or u'' quotes as
2495 appropriate.
2496
2497*/
2498
Thomas Wouters477c8d52006-05-27 19:21:47 +00002499Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2500 Py_ssize_t size,
2501 Py_UNICODE ch)
2502{
2503 /* like wcschr, but doesn't stop at NULL characters */
2504
2505 while (size-- > 0) {
2506 if (*s == ch)
2507 return s;
2508 s++;
2509 }
2510
2511 return NULL;
2512}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002513
/* Lower-case hexadecimal digits indexed by nibble value (0-15); the escape
   encoders in this file use it to emit \xhh, \uxxxx and \Uxxxxxxxx. */
static const char *hexdigits = "0123456789abcdef";
2515
2516PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2517 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518{
2519 PyObject *repr;
2520 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521
Thomas Wouters89f507f2006-12-13 04:49:30 +00002522 /* XXX(nnorwitz): rather than over-allocating, it would be
2523 better to choose a different scheme. Perhaps scan the
2524 first N-chars of the string and allocate based on that size.
2525 */
2526 /* Initial allocation is based on the longest-possible unichr
2527 escape.
2528
2529 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2530 unichr, so in this case it's the longest unichr escape. In
2531 narrow (UTF-16) builds this is five chars per source unichr
2532 since there are two unichrs in the surrogate pair, so in narrow
2533 (UTF-16) builds it's not the longest unichr escape.
2534
2535 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2536 so in the narrow (UTF-16) build case it's the longest unichr
2537 escape.
2538 */
2539
Walter Dörwald79e913e2007-05-12 11:08:06 +00002540 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002541#ifdef Py_UNICODE_WIDE
2542 + 10*size
2543#else
2544 + 6*size
2545#endif
2546 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 if (repr == NULL)
2548 return NULL;
2549
Walter Dörwald79e913e2007-05-12 11:08:06 +00002550 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 while (size-- > 0) {
2553 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002554
Walter Dörwald79e913e2007-05-12 11:08:06 +00002555 /* Escape backslashes */
2556 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 *p++ = '\\';
2558 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002559 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002560 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002561
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002562#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002563 /* Map 21-bit characters to '\U00xxxxxx' */
2564 else if (ch >= 0x10000) {
2565 *p++ = '\\';
2566 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002567 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2568 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2569 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2570 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2571 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2572 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2573 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2574 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002575 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002576 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002577#else
2578 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002579 else if (ch >= 0xD800 && ch < 0xDC00) {
2580 Py_UNICODE ch2;
2581 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002582
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002583 ch2 = *s++;
2584 size--;
2585 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2586 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2587 *p++ = '\\';
2588 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002589 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2590 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2591 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2592 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2593 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2594 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2595 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2596 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002597 continue;
2598 }
2599 /* Fall through: isolated surrogates are copied as-is */
2600 s--;
2601 size++;
2602 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002603#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002604
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002606 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 *p++ = '\\';
2608 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002609 *p++ = hexdigits[(ch >> 12) & 0x000F];
2610 *p++ = hexdigits[(ch >> 8) & 0x000F];
2611 *p++ = hexdigits[(ch >> 4) & 0x000F];
2612 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002614
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002615 /* Map special whitespace to '\t', \n', '\r' */
2616 else if (ch == '\t') {
2617 *p++ = '\\';
2618 *p++ = 't';
2619 }
2620 else if (ch == '\n') {
2621 *p++ = '\\';
2622 *p++ = 'n';
2623 }
2624 else if (ch == '\r') {
2625 *p++ = '\\';
2626 *p++ = 'r';
2627 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002628
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002629 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002630 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002632 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002633 *p++ = hexdigits[(ch >> 4) & 0x000F];
2634 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002635 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002636
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637 /* Copy everything else as-is */
2638 else
2639 *p++ = (char) ch;
2640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641
2642 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002643 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2644 Py_DECREF(repr);
2645 return NULL;
2646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647 return repr;
2648}
2649
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2651{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002652 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 if (!PyUnicode_Check(unicode)) {
2654 PyErr_BadArgument();
2655 return NULL;
2656 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002657 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2658 PyUnicode_GET_SIZE(unicode));
2659
2660 if (!s)
2661 return NULL;
2662 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2663 PyBytes_GET_SIZE(s));
2664 Py_DECREF(s);
2665 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666}
2667
2668/* --- Raw Unicode Escape Codec ------------------------------------------- */
2669
2670PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002671 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 const char *errors)
2673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002675 Py_ssize_t startinpos;
2676 Py_ssize_t endinpos;
2677 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 const char *end;
2681 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 PyObject *errorHandler = NULL;
2683 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002684
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 /* Escaped strings will always be longer than the resulting
2686 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 length after conversion to the true value. (But decoding error
2688 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 v = _PyUnicode_New(size);
2690 if (v == NULL)
2691 goto onError;
2692 if (size == 0)
2693 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002694 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 end = s + size;
2696 while (s < end) {
2697 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002698 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002700 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701
2702 /* Non-escape characters are interpreted as Unicode ordinals */
2703 if (*s != '\\') {
2704 *p++ = (unsigned char)*s++;
2705 continue;
2706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708
2709 /* \u-escapes are only interpreted iff the number of leading
2710 backslashes if odd */
2711 bs = s;
2712 for (;s < end;) {
2713 if (*s != '\\')
2714 break;
2715 *p++ = (unsigned char)*s++;
2716 }
2717 if (((s - bs) & 1) == 0 ||
2718 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002719 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 continue;
2721 }
2722 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002723 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 s++;
2725
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002726 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002728 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 endinpos = s-starts;
2732 if (unicode_decode_call_errorhandler(
2733 errors, &errorHandler,
2734 "rawunicodeescape", "truncated \\uXXXX",
2735 starts, size, &startinpos, &endinpos, &exc, &s,
2736 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 }
2740 x = (x<<4) & ~0xF;
2741 if (c >= '0' && c <= '9')
2742 x += c - '0';
2743 else if (c >= 'a' && c <= 'f')
2744 x += 10 + c - 'a';
2745 else
2746 x += 10 + c - 'A';
2747 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002748#ifndef Py_UNICODE_WIDE
2749 if (x > 0x10000) {
2750 if (unicode_decode_call_errorhandler(
2751 errors, &errorHandler,
2752 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2753 starts, size, &startinpos, &endinpos, &exc, &s,
2754 (PyObject **)&v, &outpos, &p))
2755 goto onError;
2756 }
2757#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 *p++ = x;
2759 nextByte:
2760 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002762 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002763 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 Py_XDECREF(errorHandler);
2765 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002767
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 onError:
2769 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 Py_XDECREF(errorHandler);
2771 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 return NULL;
2773}
2774
2775PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002776 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777{
2778 PyObject *repr;
2779 char *p;
2780 char *q;
2781
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002782#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002783 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002784#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002785 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002786#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 if (repr == NULL)
2788 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002789 if (size == 0)
2790 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791
Walter Dörwald711005d2007-05-12 12:03:26 +00002792 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 while (size-- > 0) {
2794 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002795#ifdef Py_UNICODE_WIDE
2796 /* Map 32-bit characters to '\Uxxxxxxxx' */
2797 if (ch >= 0x10000) {
2798 *p++ = '\\';
2799 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002800 *p++ = hexdigits[(ch >> 28) & 0xf];
2801 *p++ = hexdigits[(ch >> 24) & 0xf];
2802 *p++ = hexdigits[(ch >> 20) & 0xf];
2803 *p++ = hexdigits[(ch >> 16) & 0xf];
2804 *p++ = hexdigits[(ch >> 12) & 0xf];
2805 *p++ = hexdigits[(ch >> 8) & 0xf];
2806 *p++ = hexdigits[(ch >> 4) & 0xf];
2807 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002808 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002809 else
2810#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 /* Map 16-bit characters to '\uxxxx' */
2812 if (ch >= 256) {
2813 *p++ = '\\';
2814 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002815 *p++ = hexdigits[(ch >> 12) & 0xf];
2816 *p++ = hexdigits[(ch >> 8) & 0xf];
2817 *p++ = hexdigits[(ch >> 4) & 0xf];
2818 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 }
2820 /* Copy everything else as-is */
2821 else
2822 *p++ = (char) ch;
2823 }
2824 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002825 if (PyBytes_Resize(repr, p - q)) {
2826 Py_DECREF(repr);
2827 return NULL;
2828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 return repr;
2830}
2831
2832PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2833{
Walter Dörwald711005d2007-05-12 12:03:26 +00002834 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002836 PyErr_BadArgument();
2837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002839 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2840 PyUnicode_GET_SIZE(unicode));
2841
2842 if (!s)
2843 return NULL;
2844 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2845 PyBytes_GET_SIZE(s));
2846 Py_DECREF(s);
2847 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848}
2849
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002850/* --- Unicode Internal Codec ------------------------------------------- */
2851
2852PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002854 const char *errors)
2855{
2856 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002857 Py_ssize_t startinpos;
2858 Py_ssize_t endinpos;
2859 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002860 PyUnicodeObject *v;
2861 Py_UNICODE *p;
2862 const char *end;
2863 const char *reason;
2864 PyObject *errorHandler = NULL;
2865 PyObject *exc = NULL;
2866
Neal Norwitzd43069c2006-01-08 01:12:10 +00002867#ifdef Py_UNICODE_WIDE
2868 Py_UNICODE unimax = PyUnicode_GetMax();
2869#endif
2870
Thomas Wouters89f507f2006-12-13 04:49:30 +00002871 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002872 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2873 if (v == NULL)
2874 goto onError;
2875 if (PyUnicode_GetSize((PyObject *)v) == 0)
2876 return (PyObject *)v;
2877 p = PyUnicode_AS_UNICODE(v);
2878 end = s + size;
2879
2880 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002881 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002882 /* We have to sanity check the raw data, otherwise doom looms for
2883 some malformed UCS-4 data. */
2884 if (
2885 #ifdef Py_UNICODE_WIDE
2886 *p > unimax || *p < 0 ||
2887 #endif
2888 end-s < Py_UNICODE_SIZE
2889 )
2890 {
2891 startinpos = s - starts;
2892 if (end-s < Py_UNICODE_SIZE) {
2893 endinpos = end-starts;
2894 reason = "truncated input";
2895 }
2896 else {
2897 endinpos = s - starts + Py_UNICODE_SIZE;
2898 reason = "illegal code point (> 0x10FFFF)";
2899 }
2900 outpos = p - PyUnicode_AS_UNICODE(v);
2901 if (unicode_decode_call_errorhandler(
2902 errors, &errorHandler,
2903 "unicode_internal", reason,
2904 starts, size, &startinpos, &endinpos, &exc, &s,
2905 (PyObject **)&v, &outpos, &p)) {
2906 goto onError;
2907 }
2908 }
2909 else {
2910 p++;
2911 s += Py_UNICODE_SIZE;
2912 }
2913 }
2914
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002915 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002916 goto onError;
2917 Py_XDECREF(errorHandler);
2918 Py_XDECREF(exc);
2919 return (PyObject *)v;
2920
2921 onError:
2922 Py_XDECREF(v);
2923 Py_XDECREF(errorHandler);
2924 Py_XDECREF(exc);
2925 return NULL;
2926}
2927
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928/* --- Latin-1 Codec ------------------------------------------------------ */
2929
2930PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002931 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 const char *errors)
2933{
2934 PyUnicodeObject *v;
2935 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002936
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002938 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002939 Py_UNICODE r = *(unsigned char*)s;
2940 return PyUnicode_FromUnicode(&r, 1);
2941 }
2942
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 v = _PyUnicode_New(size);
2944 if (v == NULL)
2945 goto onError;
2946 if (size == 0)
2947 return (PyObject *)v;
2948 p = PyUnicode_AS_UNICODE(v);
2949 while (size-- > 0)
2950 *p++ = (unsigned char)*s++;
2951 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002952
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 onError:
2954 Py_XDECREF(v);
2955 return NULL;
2956}
2957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958/* create or adjust a UnicodeEncodeError */
2959static void make_encode_exception(PyObject **exceptionObject,
2960 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002961 const Py_UNICODE *unicode, Py_ssize_t size,
2962 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 if (*exceptionObject == NULL) {
2966 *exceptionObject = PyUnicodeEncodeError_Create(
2967 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968 }
2969 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002970 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2971 goto onError;
2972 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2973 goto onError;
2974 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2975 goto onError;
2976 return;
2977 onError:
2978 Py_DECREF(*exceptionObject);
2979 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 }
2981}
2982
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002983/* raises a UnicodeEncodeError */
2984static void raise_encode_exception(PyObject **exceptionObject,
2985 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002986 const Py_UNICODE *unicode, Py_ssize_t size,
2987 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002988 const char *reason)
2989{
2990 make_encode_exception(exceptionObject,
2991 encoding, unicode, size, startpos, endpos, reason);
2992 if (*exceptionObject != NULL)
2993 PyCodec_StrictErrors(*exceptionObject);
2994}
2995
2996/* error handling callback helper:
2997 build arguments, call the callback and check the arguments,
2998 put the result into newpos and return the replacement string, which
2999 has to be freed by the caller */
3000static PyObject *unicode_encode_call_errorhandler(const char *errors,
3001 PyObject **errorHandler,
3002 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003003 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3004 Py_ssize_t startpos, Py_ssize_t endpos,
3005 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003007 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003008
3009 PyObject *restuple;
3010 PyObject *resunicode;
3011
3012 if (*errorHandler == NULL) {
3013 *errorHandler = PyCodec_LookupError(errors);
3014 if (*errorHandler == NULL)
3015 return NULL;
3016 }
3017
3018 make_encode_exception(exceptionObject,
3019 encoding, unicode, size, startpos, endpos, reason);
3020 if (*exceptionObject == NULL)
3021 return NULL;
3022
3023 restuple = PyObject_CallFunctionObjArgs(
3024 *errorHandler, *exceptionObject, NULL);
3025 if (restuple == NULL)
3026 return NULL;
3027 if (!PyTuple_Check(restuple)) {
3028 PyErr_Format(PyExc_TypeError, &argparse[4]);
3029 Py_DECREF(restuple);
3030 return NULL;
3031 }
3032 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3033 &resunicode, newpos)) {
3034 Py_DECREF(restuple);
3035 return NULL;
3036 }
3037 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003038 *newpos = size+*newpos;
3039 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003040 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003041 Py_DECREF(restuple);
3042 return NULL;
3043 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 Py_INCREF(resunicode);
3045 Py_DECREF(restuple);
3046 return resunicode;
3047}
3048
3049static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003050 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 const char *errors,
3052 int limit)
3053{
3054 /* output object */
3055 PyObject *res;
3056 /* pointers to the beginning and end+1 of input */
3057 const Py_UNICODE *startp = p;
3058 const Py_UNICODE *endp = p + size;
3059 /* pointer to the beginning of the unencodable characters */
3060 /* const Py_UNICODE *badp = NULL; */
3061 /* pointer into the output */
3062 char *str;
3063 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003064 Py_ssize_t respos = 0;
3065 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003066 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3067 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 PyObject *errorHandler = NULL;
3069 PyObject *exc = NULL;
3070 /* the following variable is used for caching string comparisons
3071 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3072 int known_errorHandler = -1;
3073
3074 /* allocate enough for a simple encoding without
3075 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003076 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 if (res == NULL)
3078 goto onError;
3079 if (size == 0)
3080 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003081 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082 ressize = size;
3083
3084 while (p<endp) {
3085 Py_UNICODE c = *p;
3086
3087 /* can we encode this? */
3088 if (c<limit) {
3089 /* no overflow check, because we know that the space is enough */
3090 *str++ = (char)c;
3091 ++p;
3092 }
3093 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003094 Py_ssize_t unicodepos = p-startp;
3095 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003096 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003097 Py_ssize_t repsize;
3098 Py_ssize_t newpos;
3099 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003100 Py_UNICODE *uni2;
3101 /* startpos for collecting unencodable chars */
3102 const Py_UNICODE *collstart = p;
3103 const Py_UNICODE *collend = p;
3104 /* find all unecodable characters */
3105 while ((collend < endp) && ((*collend)>=limit))
3106 ++collend;
3107 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3108 if (known_errorHandler==-1) {
3109 if ((errors==NULL) || (!strcmp(errors, "strict")))
3110 known_errorHandler = 1;
3111 else if (!strcmp(errors, "replace"))
3112 known_errorHandler = 2;
3113 else if (!strcmp(errors, "ignore"))
3114 known_errorHandler = 3;
3115 else if (!strcmp(errors, "xmlcharrefreplace"))
3116 known_errorHandler = 4;
3117 else
3118 known_errorHandler = 0;
3119 }
3120 switch (known_errorHandler) {
3121 case 1: /* strict */
3122 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3123 goto onError;
3124 case 2: /* replace */
3125 while (collstart++<collend)
3126 *str++ = '?'; /* fall through */
3127 case 3: /* ignore */
3128 p = collend;
3129 break;
3130 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003131 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 /* determine replacement size (temporarily (mis)uses p) */
3133 for (p = collstart, repsize = 0; p < collend; ++p) {
3134 if (*p<10)
3135 repsize += 2+1+1;
3136 else if (*p<100)
3137 repsize += 2+2+1;
3138 else if (*p<1000)
3139 repsize += 2+3+1;
3140 else if (*p<10000)
3141 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003142#ifndef Py_UNICODE_WIDE
3143 else
3144 repsize += 2+5+1;
3145#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003146 else if (*p<100000)
3147 repsize += 2+5+1;
3148 else if (*p<1000000)
3149 repsize += 2+6+1;
3150 else
3151 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003152#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003153 }
3154 requiredsize = respos+repsize+(endp-collend);
3155 if (requiredsize > ressize) {
3156 if (requiredsize<2*ressize)
3157 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003158 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003160 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161 ressize = requiredsize;
3162 }
3163 /* generate replacement (temporarily (mis)uses p) */
3164 for (p = collstart; p < collend; ++p) {
3165 str += sprintf(str, "&#%d;", (int)*p);
3166 }
3167 p = collend;
3168 break;
3169 default:
3170 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3171 encoding, reason, startp, size, &exc,
3172 collstart-startp, collend-startp, &newpos);
3173 if (repunicode == NULL)
3174 goto onError;
3175 /* need more space? (at least enough for what we
3176 have+the replacement+the rest of the string, so
3177 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003178 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 repsize = PyUnicode_GET_SIZE(repunicode);
3180 requiredsize = respos+repsize+(endp-collend);
3181 if (requiredsize > ressize) {
3182 if (requiredsize<2*ressize)
3183 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003184 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 Py_DECREF(repunicode);
3186 goto onError;
3187 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003188 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189 ressize = requiredsize;
3190 }
3191 /* check if there is anything unencodable in the replacement
3192 and copy it to the output */
3193 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3194 c = *uni2;
3195 if (c >= limit) {
3196 raise_encode_exception(&exc, encoding, startp, size,
3197 unicodepos, unicodepos+1, reason);
3198 Py_DECREF(repunicode);
3199 goto onError;
3200 }
3201 *str = (char)c;
3202 }
3203 p = startp + newpos;
3204 Py_DECREF(repunicode);
3205 }
3206 }
3207 }
3208 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003209 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003210 if (respos<ressize)
3211 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003212 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003213 Py_XDECREF(errorHandler);
3214 Py_XDECREF(exc);
3215 return res;
3216
3217 onError:
3218 Py_XDECREF(res);
3219 Py_XDECREF(errorHandler);
3220 Py_XDECREF(exc);
3221 return NULL;
3222}
3223
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003225 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 const char *errors)
3227{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229}
3230
3231PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3232{
3233 if (!PyUnicode_Check(unicode)) {
3234 PyErr_BadArgument();
3235 return NULL;
3236 }
3237 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3238 PyUnicode_GET_SIZE(unicode),
3239 NULL);
3240}
3241
3242/* --- 7-bit ASCII Codec -------------------------------------------------- */
3243
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003245 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 const char *errors)
3247{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 PyUnicodeObject *v;
3250 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003251 Py_ssize_t startinpos;
3252 Py_ssize_t endinpos;
3253 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003254 const char *e;
3255 PyObject *errorHandler = NULL;
3256 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003257
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003259 if (size == 1 && *(unsigned char*)s < 128) {
3260 Py_UNICODE r = *(unsigned char*)s;
3261 return PyUnicode_FromUnicode(&r, 1);
3262 }
Tim Petersced69f82003-09-16 20:30:58 +00003263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 v = _PyUnicode_New(size);
3265 if (v == NULL)
3266 goto onError;
3267 if (size == 0)
3268 return (PyObject *)v;
3269 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270 e = s + size;
3271 while (s < e) {
3272 register unsigned char c = (unsigned char)*s;
3273 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 ++s;
3276 }
3277 else {
3278 startinpos = s-starts;
3279 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003280 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 if (unicode_decode_call_errorhandler(
3282 errors, &errorHandler,
3283 "ascii", "ordinal not in range(128)",
3284 starts, size, &startinpos, &endinpos, &exc, &s,
3285 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003289 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003290 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003291 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 Py_XDECREF(errorHandler);
3293 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003295
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 onError:
3297 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 Py_XDECREF(errorHandler);
3299 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 return NULL;
3301}
3302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003304 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 const char *errors)
3306{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308}
3309
3310PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3311{
3312 if (!PyUnicode_Check(unicode)) {
3313 PyErr_BadArgument();
3314 return NULL;
3315 }
3316 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3317 PyUnicode_GET_SIZE(unicode),
3318 NULL);
3319}
3320
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003321#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003322
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003323/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003324
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003325#if SIZEOF_INT < SIZEOF_SSIZE_T
3326#define NEED_RETRY
3327#endif
3328
3329/* XXX This code is limited to "true" double-byte encodings, as
3330 a) it assumes an incomplete character consists of a single byte, and
3331 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3332 encodings, see IsDBCSLeadByteEx documentation. */
3333
/* Return 1 if the byte at s[offset] is reported as a DBCS lead byte and
   is not itself the second half of a preceding two-byte character,
   0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3344
3345/*
3346 * Decode MBCS string into unicode object. If 'final' is set, converts
3347 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3348 */
3349static int decode_mbcs(PyUnicodeObject **v,
3350 const char *s, /* MBCS string */
3351 int size, /* sizeof MBCS string */
3352 int final)
3353{
3354 Py_UNICODE *p;
3355 Py_ssize_t n = 0;
3356 int usize = 0;
3357
3358 assert(size >= 0);
3359
3360 /* Skip trailing lead-byte unless 'final' is set */
3361 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3362 --size;
3363
3364 /* First get the size of the result */
3365 if (size > 0) {
3366 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3367 if (usize == 0) {
3368 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3369 return -1;
3370 }
3371 }
3372
3373 if (*v == NULL) {
3374 /* Create unicode object */
3375 *v = _PyUnicode_New(usize);
3376 if (*v == NULL)
3377 return -1;
3378 }
3379 else {
3380 /* Extend unicode object */
3381 n = PyUnicode_GET_SIZE(*v);
3382 if (_PyUnicode_Resize(v, n + usize) < 0)
3383 return -1;
3384 }
3385
3386 /* Do the conversion */
3387 if (size > 0) {
3388 p = PyUnicode_AS_UNICODE(*v) + n;
3389 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3390 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3391 return -1;
3392 }
3393 }
3394
3395 return size;
3396}
3397
3398PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3399 Py_ssize_t size,
3400 const char *errors,
3401 Py_ssize_t *consumed)
3402{
3403 PyUnicodeObject *v = NULL;
3404 int done;
3405
3406 if (consumed)
3407 *consumed = 0;
3408
3409#ifdef NEED_RETRY
3410 retry:
3411 if (size > INT_MAX)
3412 done = decode_mbcs(&v, s, INT_MAX, 0);
3413 else
3414#endif
3415 done = decode_mbcs(&v, s, (int)size, !consumed);
3416
3417 if (done < 0) {
3418 Py_XDECREF(v);
3419 return NULL;
3420 }
3421
3422 if (consumed)
3423 *consumed += done;
3424
3425#ifdef NEED_RETRY
3426 if (size > INT_MAX) {
3427 s += done;
3428 size -= done;
3429 goto retry;
3430 }
3431#endif
3432
3433 return (PyObject *)v;
3434}
3435
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003436PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003437 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003438 const char *errors)
3439{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003440 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3441}
3442
3443/*
3444 * Convert unicode into string object (MBCS).
3445 * Returns 0 if succeed, -1 otherwise.
3446 */
3447static int encode_mbcs(PyObject **repr,
3448 const Py_UNICODE *p, /* unicode */
3449 int size) /* size of unicode */
3450{
3451 int mbcssize = 0;
3452 Py_ssize_t n = 0;
3453
3454 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003455
3456 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003457 if (size > 0) {
3458 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3459 if (mbcssize == 0) {
3460 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3461 return -1;
3462 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003463 }
3464
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003465 if (*repr == NULL) {
3466 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003467 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003468 if (*repr == NULL)
3469 return -1;
3470 }
3471 else {
3472 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003473 n = PyBytes_Size(*repr);
3474 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003475 return -1;
3476 }
3477
3478 /* Do the conversion */
3479 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003480 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003481 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3482 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3483 return -1;
3484 }
3485 }
3486
3487 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003488}
3489
3490PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003491 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003492 const char *errors)
3493{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003494 PyObject *repr = NULL;
3495 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003496
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003497#ifdef NEED_RETRY
3498 retry:
3499 if (size > INT_MAX)
3500 ret = encode_mbcs(&repr, p, INT_MAX);
3501 else
3502#endif
3503 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003504
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003505 if (ret < 0) {
3506 Py_XDECREF(repr);
3507 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003508 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003509
3510#ifdef NEED_RETRY
3511 if (size > INT_MAX) {
3512 p += INT_MAX;
3513 size -= INT_MAX;
3514 goto retry;
3515 }
3516#endif
3517
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003518 return repr;
3519}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003520
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003521PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3522{
3523 if (!PyUnicode_Check(unicode)) {
3524 PyErr_BadArgument();
3525 return NULL;
3526 }
3527 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3528 PyUnicode_GET_SIZE(unicode),
3529 NULL);
3530}
3531
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003532#undef NEED_RETRY
3533
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003534#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003535
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536/* --- Character Mapping Codec -------------------------------------------- */
3537
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003539 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 PyObject *mapping,
3541 const char *errors)
3542{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003544 Py_ssize_t startinpos;
3545 Py_ssize_t endinpos;
3546 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 PyUnicodeObject *v;
3549 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003550 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 PyObject *errorHandler = NULL;
3552 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003553 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003554 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003555
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 /* Default to Latin-1 */
3557 if (mapping == NULL)
3558 return PyUnicode_DecodeLatin1(s, size, errors);
3559
3560 v = _PyUnicode_New(size);
3561 if (v == NULL)
3562 goto onError;
3563 if (size == 0)
3564 return (PyObject *)v;
3565 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003567 if (PyUnicode_CheckExact(mapping)) {
3568 mapstring = PyUnicode_AS_UNICODE(mapping);
3569 maplen = PyUnicode_GET_SIZE(mapping);
3570 while (s < e) {
3571 unsigned char ch = *s;
3572 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003574 if (ch < maplen)
3575 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003577 if (x == 0xfffe) {
3578 /* undefined mapping */
3579 outpos = p-PyUnicode_AS_UNICODE(v);
3580 startinpos = s-starts;
3581 endinpos = startinpos+1;
3582 if (unicode_decode_call_errorhandler(
3583 errors, &errorHandler,
3584 "charmap", "character maps to <undefined>",
3585 starts, size, &startinpos, &endinpos, &exc, &s,
3586 (PyObject **)&v, &outpos, &p)) {
3587 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003588 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003589 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003590 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003591 *p++ = x;
3592 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003594 }
3595 else {
3596 while (s < e) {
3597 unsigned char ch = *s;
3598 PyObject *w, *x;
3599
3600 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3601 w = PyInt_FromLong((long)ch);
3602 if (w == NULL)
3603 goto onError;
3604 x = PyObject_GetItem(mapping, w);
3605 Py_DECREF(w);
3606 if (x == NULL) {
3607 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3608 /* No mapping found means: mapping is undefined. */
3609 PyErr_Clear();
3610 x = Py_None;
3611 Py_INCREF(x);
3612 } else
3613 goto onError;
3614 }
3615
3616 /* Apply mapping */
3617 if (PyInt_Check(x)) {
3618 long value = PyInt_AS_LONG(x);
3619 if (value < 0 || value > 65535) {
3620 PyErr_SetString(PyExc_TypeError,
3621 "character mapping must be in range(65536)");
3622 Py_DECREF(x);
3623 goto onError;
3624 }
3625 *p++ = (Py_UNICODE)value;
3626 }
3627 else if (x == Py_None) {
3628 /* undefined mapping */
3629 outpos = p-PyUnicode_AS_UNICODE(v);
3630 startinpos = s-starts;
3631 endinpos = startinpos+1;
3632 if (unicode_decode_call_errorhandler(
3633 errors, &errorHandler,
3634 "charmap", "character maps to <undefined>",
3635 starts, size, &startinpos, &endinpos, &exc, &s,
3636 (PyObject **)&v, &outpos, &p)) {
3637 Py_DECREF(x);
3638 goto onError;
3639 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003640 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003641 continue;
3642 }
3643 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003644 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003645
3646 if (targetsize == 1)
3647 /* 1-1 mapping */
3648 *p++ = *PyUnicode_AS_UNICODE(x);
3649
3650 else if (targetsize > 1) {
3651 /* 1-n mapping */
3652 if (targetsize > extrachars) {
3653 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003654 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3655 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003656 (targetsize << 2);
3657 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003658 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003659 if (_PyUnicode_Resize(&v,
3660 PyUnicode_GET_SIZE(v) + needed) < 0) {
3661 Py_DECREF(x);
3662 goto onError;
3663 }
3664 p = PyUnicode_AS_UNICODE(v) + oldpos;
3665 }
3666 Py_UNICODE_COPY(p,
3667 PyUnicode_AS_UNICODE(x),
3668 targetsize);
3669 p += targetsize;
3670 extrachars -= targetsize;
3671 }
3672 /* 1-0 mapping: skip the character */
3673 }
3674 else {
3675 /* wrong return value */
3676 PyErr_SetString(PyExc_TypeError,
3677 "character mapping must return integer, None or unicode");
3678 Py_DECREF(x);
3679 goto onError;
3680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003682 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 }
3685 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003686 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 Py_XDECREF(errorHandler);
3689 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003691
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 Py_XDECREF(errorHandler);
3694 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 Py_XDECREF(v);
3696 return NULL;
3697}
3698
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003699/* Charmap encoding: the lookup table */
3700
3701struct encoding_map{
3702 PyObject_HEAD
3703 unsigned char level1[32];
3704 int count2, count3;
3705 unsigned char level23[1];
3706};
3707
3708static PyObject*
3709encoding_map_size(PyObject *obj, PyObject* args)
3710{
3711 struct encoding_map *map = (struct encoding_map*)obj;
3712 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3713 128*map->count3);
3714}
3715
3716static PyMethodDef encoding_map_methods[] = {
3717 {"size", encoding_map_size, METH_NOARGS,
3718 PyDoc_STR("Return the size (in bytes) of this object") },
3719 { 0 }
3720};
3721
3722static void
3723encoding_map_dealloc(PyObject* o)
3724{
3725 PyObject_FREE(o);
3726}
3727
3728static PyTypeObject EncodingMapType = {
3729 PyObject_HEAD_INIT(NULL)
3730 0, /*ob_size*/
3731 "EncodingMap", /*tp_name*/
3732 sizeof(struct encoding_map), /*tp_basicsize*/
3733 0, /*tp_itemsize*/
3734 /* methods */
3735 encoding_map_dealloc, /*tp_dealloc*/
3736 0, /*tp_print*/
3737 0, /*tp_getattr*/
3738 0, /*tp_setattr*/
3739 0, /*tp_compare*/
3740 0, /*tp_repr*/
3741 0, /*tp_as_number*/
3742 0, /*tp_as_sequence*/
3743 0, /*tp_as_mapping*/
3744 0, /*tp_hash*/
3745 0, /*tp_call*/
3746 0, /*tp_str*/
3747 0, /*tp_getattro*/
3748 0, /*tp_setattro*/
3749 0, /*tp_as_buffer*/
3750 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3751 0, /*tp_doc*/
3752 0, /*tp_traverse*/
3753 0, /*tp_clear*/
3754 0, /*tp_richcompare*/
3755 0, /*tp_weaklistoffset*/
3756 0, /*tp_iter*/
3757 0, /*tp_iternext*/
3758 encoding_map_methods, /*tp_methods*/
3759 0, /*tp_members*/
3760 0, /*tp_getset*/
3761 0, /*tp_base*/
3762 0, /*tp_dict*/
3763 0, /*tp_descr_get*/
3764 0, /*tp_descr_set*/
3765 0, /*tp_dictoffset*/
3766 0, /*tp_init*/
3767 0, /*tp_alloc*/
3768 0, /*tp_new*/
3769 0, /*tp_free*/
3770 0, /*tp_is_gc*/
3771};
3772
3773PyObject*
3774PyUnicode_BuildEncodingMap(PyObject* string)
3775{
3776 Py_UNICODE *decode;
3777 PyObject *result;
3778 struct encoding_map *mresult;
3779 int i;
3780 int need_dict = 0;
3781 unsigned char level1[32];
3782 unsigned char level2[512];
3783 unsigned char *mlevel1, *mlevel2, *mlevel3;
3784 int count2 = 0, count3 = 0;
3785
3786 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3787 PyErr_BadArgument();
3788 return NULL;
3789 }
3790 decode = PyUnicode_AS_UNICODE(string);
3791 memset(level1, 0xFF, sizeof level1);
3792 memset(level2, 0xFF, sizeof level2);
3793
3794 /* If there isn't a one-to-one mapping of NULL to \0,
3795 or if there are non-BMP characters, we need to use
3796 a mapping dictionary. */
3797 if (decode[0] != 0)
3798 need_dict = 1;
3799 for (i = 1; i < 256; i++) {
3800 int l1, l2;
3801 if (decode[i] == 0
3802 #ifdef Py_UNICODE_WIDE
3803 || decode[i] > 0xFFFF
3804 #endif
3805 ) {
3806 need_dict = 1;
3807 break;
3808 }
3809 if (decode[i] == 0xFFFE)
3810 /* unmapped character */
3811 continue;
3812 l1 = decode[i] >> 11;
3813 l2 = decode[i] >> 7;
3814 if (level1[l1] == 0xFF)
3815 level1[l1] = count2++;
3816 if (level2[l2] == 0xFF)
3817 level2[l2] = count3++;
3818 }
3819
3820 if (count2 >= 0xFF || count3 >= 0xFF)
3821 need_dict = 1;
3822
3823 if (need_dict) {
3824 PyObject *result = PyDict_New();
3825 PyObject *key, *value;
3826 if (!result)
3827 return NULL;
3828 for (i = 0; i < 256; i++) {
3829 key = value = NULL;
3830 key = PyInt_FromLong(decode[i]);
3831 value = PyInt_FromLong(i);
3832 if (!key || !value)
3833 goto failed1;
3834 if (PyDict_SetItem(result, key, value) == -1)
3835 goto failed1;
3836 Py_DECREF(key);
3837 Py_DECREF(value);
3838 }
3839 return result;
3840 failed1:
3841 Py_XDECREF(key);
3842 Py_XDECREF(value);
3843 Py_DECREF(result);
3844 return NULL;
3845 }
3846
3847 /* Create a three-level trie */
3848 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3849 16*count2 + 128*count3 - 1);
3850 if (!result)
3851 return PyErr_NoMemory();
3852 PyObject_Init(result, &EncodingMapType);
3853 mresult = (struct encoding_map*)result;
3854 mresult->count2 = count2;
3855 mresult->count3 = count3;
3856 mlevel1 = mresult->level1;
3857 mlevel2 = mresult->level23;
3858 mlevel3 = mresult->level23 + 16*count2;
3859 memcpy(mlevel1, level1, 32);
3860 memset(mlevel2, 0xFF, 16*count2);
3861 memset(mlevel3, 0, 128*count3);
3862 count3 = 0;
3863 for (i = 1; i < 256; i++) {
3864 int o1, o2, o3, i2, i3;
3865 if (decode[i] == 0xFFFE)
3866 /* unmapped character */
3867 continue;
3868 o1 = decode[i]>>11;
3869 o2 = (decode[i]>>7) & 0xF;
3870 i2 = 16*mlevel1[o1] + o2;
3871 if (mlevel2[i2] == 0xFF)
3872 mlevel2[i2] = count3++;
3873 o3 = decode[i] & 0x7F;
3874 i3 = 128*mlevel2[i2] + o3;
3875 mlevel3[i3] = i;
3876 }
3877 return result;
3878}
3879
3880static int
3881encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3882{
3883 struct encoding_map *map = (struct encoding_map*)mapping;
3884 int l1 = c>>11;
3885 int l2 = (c>>7) & 0xF;
3886 int l3 = c & 0x7F;
3887 int i;
3888
3889#ifdef Py_UNICODE_WIDE
3890 if (c > 0xFFFF) {
3891 return -1;
3892 }
3893#endif
3894 if (c == 0)
3895 return 0;
3896 /* level 1*/
3897 i = map->level1[l1];
3898 if (i == 0xFF) {
3899 return -1;
3900 }
3901 /* level 2*/
3902 i = map->level23[16*i+l2];
3903 if (i == 0xFF) {
3904 return -1;
3905 }
3906 /* level 3 */
3907 i = map->level23[16*map->count2 + 128*i + l3];
3908 if (i == 0) {
3909 return -1;
3910 }
3911 return i;
3912}
3913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914/* Lookup the character ch in the mapping. If the character
3915 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003916 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003917static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 PyObject *w = PyInt_FromLong((long)c);
3920 PyObject *x;
3921
3922 if (w == NULL)
3923 return NULL;
3924 x = PyObject_GetItem(mapping, w);
3925 Py_DECREF(w);
3926 if (x == NULL) {
3927 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3928 /* No mapping found means: mapping is undefined. */
3929 PyErr_Clear();
3930 x = Py_None;
3931 Py_INCREF(x);
3932 return x;
3933 } else
3934 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003936 else if (x == Py_None)
3937 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 else if (PyInt_Check(x)) {
3939 long value = PyInt_AS_LONG(x);
3940 if (value < 0 || value > 255) {
3941 PyErr_SetString(PyExc_TypeError,
3942 "character mapping must be in range(256)");
3943 Py_DECREF(x);
3944 return NULL;
3945 }
3946 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 else if (PyString_Check(x))
3949 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003952 PyErr_Format(PyExc_TypeError,
3953 "character mapping must return integer, None or str8, not %.400s",
3954 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 Py_DECREF(x);
3956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 }
3958}
3959
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003960static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003961charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003962{
Walter Dörwald827b0552007-05-12 13:23:53 +00003963 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003964 /* exponentially overallocate to minimize reallocations */
3965 if (requiredsize < 2*outsize)
3966 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003967 if (PyBytes_Resize(outobj, requiredsize)) {
3968 Py_DECREF(outobj);
3969 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003970 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003971 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003972}
3973
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded form was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003978 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 space is available. Return a new reference to the object that
3980 was put in the output buffer, or Py_None, if the mapping was undefined
3981 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003982 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003983static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003984charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003985 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003987 PyObject *rep;
3988 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003989 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003991 if (mapping->ob_type == &EncodingMapType) {
3992 int res = encoding_map_lookup(c, mapping);
3993 Py_ssize_t requiredsize = *outpos+1;
3994 if (res == -1)
3995 return enc_FAILED;
3996 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003997 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003998 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003999 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004000 outstart[(*outpos)++] = (char)res;
4001 return enc_SUCCESS;
4002 }
4003
4004 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004006 return enc_EXCEPTION;
4007 else if (rep==Py_None) {
4008 Py_DECREF(rep);
4009 return enc_FAILED;
4010 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004012 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004013 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004014 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004016 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004018 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004019 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4020 }
4021 else {
4022 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004023 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4024 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004025 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004026 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004028 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004030 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004031 memcpy(outstart + *outpos, repchars, repsize);
4032 *outpos += repsize;
4033 }
4034 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004035 Py_DECREF(rep);
4036 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037}
4038
4039/* handle an error in PyUnicode_EncodeCharmap
4040 Return 0 on success, -1 on error */
4041static
4042int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004043 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004045 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004046 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047{
4048 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004049 Py_ssize_t repsize;
4050 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 Py_UNICODE *uni2;
4052 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004053 Py_ssize_t collstartpos = *inpos;
4054 Py_ssize_t collendpos = *inpos+1;
4055 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 char *encoding = "charmap";
4057 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004058 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 /* find all unencodable characters */
4061 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004062 PyObject *rep;
4063 if (mapping->ob_type == &EncodingMapType) {
4064 int res = encoding_map_lookup(p[collendpos], mapping);
4065 if (res != -1)
4066 break;
4067 ++collendpos;
4068 continue;
4069 }
4070
4071 rep = charmapencode_lookup(p[collendpos], mapping);
4072 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004074 else if (rep!=Py_None) {
4075 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076 break;
4077 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004078 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 ++collendpos;
4080 }
4081 /* cache callback name lookup
4082 * (if not done yet, i.e. it's the first error) */
4083 if (*known_errorHandler==-1) {
4084 if ((errors==NULL) || (!strcmp(errors, "strict")))
4085 *known_errorHandler = 1;
4086 else if (!strcmp(errors, "replace"))
4087 *known_errorHandler = 2;
4088 else if (!strcmp(errors, "ignore"))
4089 *known_errorHandler = 3;
4090 else if (!strcmp(errors, "xmlcharrefreplace"))
4091 *known_errorHandler = 4;
4092 else
4093 *known_errorHandler = 0;
4094 }
4095 switch (*known_errorHandler) {
4096 case 1: /* strict */
4097 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4098 return -1;
4099 case 2: /* replace */
4100 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4101 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004102 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103 return -1;
4104 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004105 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4107 return -1;
4108 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109 }
4110 /* fall through */
4111 case 3: /* ignore */
4112 *inpos = collendpos;
4113 break;
4114 case 4: /* xmlcharrefreplace */
4115 /* generate replacement (temporarily (mis)uses p) */
4116 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4117 char buffer[2+29+1+1];
4118 char *cp;
4119 sprintf(buffer, "&#%d;", (int)p[collpos]);
4120 for (cp = buffer; *cp; ++cp) {
4121 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004122 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004124 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4126 return -1;
4127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 }
4129 }
4130 *inpos = collendpos;
4131 break;
4132 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004133 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 encoding, reason, p, size, exceptionObject,
4135 collstartpos, collendpos, &newpos);
4136 if (repunicode == NULL)
4137 return -1;
4138 /* generate replacement */
4139 repsize = PyUnicode_GET_SIZE(repunicode);
4140 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4141 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004142 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 return -1;
4144 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004145 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4148 return -1;
4149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 }
4151 *inpos = newpos;
4152 Py_DECREF(repunicode);
4153 }
4154 return 0;
4155}
4156
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004158 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 PyObject *mapping,
4160 const char *errors)
4161{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 /* output object */
4163 PyObject *res = NULL;
4164 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004165 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004167 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 PyObject *errorHandler = NULL;
4169 PyObject *exc = NULL;
4170 /* the following variable is used for caching string comparisons
4171 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4172 * 3=ignore, 4=xmlcharrefreplace */
4173 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174
4175 /* Default to Latin-1 */
4176 if (mapping == NULL)
4177 return PyUnicode_EncodeLatin1(p, size, errors);
4178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 /* allocate enough for a simple encoding without
4180 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004181 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 if (res == NULL)
4183 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004184 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 while (inpos<size) {
4188 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004189 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004190 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004192 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 if (charmap_encoding_error(p, size, &inpos, mapping,
4194 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004195 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004196 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004197 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 else
4201 /* done with this character => adjust input position */
4202 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004206 if (respos<PyBytes_GET_SIZE(res)) {
4207 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 goto onError;
4209 }
4210 Py_XDECREF(exc);
4211 Py_XDECREF(errorHandler);
4212 return res;
4213
4214 onError:
4215 Py_XDECREF(res);
4216 Py_XDECREF(exc);
4217 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 return NULL;
4219}
4220
4221PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4222 PyObject *mapping)
4223{
4224 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4225 PyErr_BadArgument();
4226 return NULL;
4227 }
4228 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4229 PyUnicode_GET_SIZE(unicode),
4230 mapping,
4231 NULL);
4232}
4233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234/* create or adjust a UnicodeTranslateError */
4235static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004236 const Py_UNICODE *unicode, Py_ssize_t size,
4237 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 if (*exceptionObject == NULL) {
4241 *exceptionObject = PyUnicodeTranslateError_Create(
4242 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243 }
4244 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4246 goto onError;
4247 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4248 goto onError;
4249 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4250 goto onError;
4251 return;
4252 onError:
4253 Py_DECREF(*exceptionObject);
4254 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 }
4256}
4257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258/* raises a UnicodeTranslateError */
4259static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004260 const Py_UNICODE *unicode, Py_ssize_t size,
4261 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 const char *reason)
4263{
4264 make_translate_exception(exceptionObject,
4265 unicode, size, startpos, endpos, reason);
4266 if (*exceptionObject != NULL)
4267 PyCodec_StrictErrors(*exceptionObject);
4268}
4269
4270/* error handling callback helper:
4271 build arguments, call the callback and check the arguments,
4272 put the result into newpos and return the replacement string, which
4273 has to be freed by the caller */
4274static PyObject *unicode_translate_call_errorhandler(const char *errors,
4275 PyObject **errorHandler,
4276 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004277 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4278 Py_ssize_t startpos, Py_ssize_t endpos,
4279 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004281 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004283 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 PyObject *restuple;
4285 PyObject *resunicode;
4286
4287 if (*errorHandler == NULL) {
4288 *errorHandler = PyCodec_LookupError(errors);
4289 if (*errorHandler == NULL)
4290 return NULL;
4291 }
4292
4293 make_translate_exception(exceptionObject,
4294 unicode, size, startpos, endpos, reason);
4295 if (*exceptionObject == NULL)
4296 return NULL;
4297
4298 restuple = PyObject_CallFunctionObjArgs(
4299 *errorHandler, *exceptionObject, NULL);
4300 if (restuple == NULL)
4301 return NULL;
4302 if (!PyTuple_Check(restuple)) {
4303 PyErr_Format(PyExc_TypeError, &argparse[4]);
4304 Py_DECREF(restuple);
4305 return NULL;
4306 }
4307 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004308 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309 Py_DECREF(restuple);
4310 return NULL;
4311 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004312 if (i_newpos<0)
4313 *newpos = size+i_newpos;
4314 else
4315 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004316 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004317 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004318 Py_DECREF(restuple);
4319 return NULL;
4320 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 Py_INCREF(resunicode);
4322 Py_DECREF(restuple);
4323 return resunicode;
4324}
4325
4326/* Lookup the character ch in the mapping and put the result in result,
4327 which must be decrefed by the caller.
4328 Return 0 on success, -1 on error */
4329static
4330int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4331{
4332 PyObject *w = PyInt_FromLong((long)c);
4333 PyObject *x;
4334
4335 if (w == NULL)
4336 return -1;
4337 x = PyObject_GetItem(mapping, w);
4338 Py_DECREF(w);
4339 if (x == NULL) {
4340 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4341 /* No mapping found means: use 1:1 mapping. */
4342 PyErr_Clear();
4343 *result = NULL;
4344 return 0;
4345 } else
4346 return -1;
4347 }
4348 else if (x == Py_None) {
4349 *result = x;
4350 return 0;
4351 }
4352 else if (PyInt_Check(x)) {
4353 long value = PyInt_AS_LONG(x);
4354 long max = PyUnicode_GetMax();
4355 if (value < 0 || value > max) {
4356 PyErr_Format(PyExc_TypeError,
4357 "character mapping must be in range(0x%lx)", max+1);
4358 Py_DECREF(x);
4359 return -1;
4360 }
4361 *result = x;
4362 return 0;
4363 }
4364 else if (PyUnicode_Check(x)) {
4365 *result = x;
4366 return 0;
4367 }
4368 else {
4369 /* wrong return value */
4370 PyErr_SetString(PyExc_TypeError,
4371 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004372 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373 return -1;
4374 }
4375}
4376/* ensure that *outobj is at least requiredsize characters long,
4377if not reallocate and adjust various state variables.
4378Return 0 on success, -1 on error */
4379static
Walter Dörwald4894c302003-10-24 14:25:28 +00004380int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004383 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004384 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004386 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004388 if (requiredsize < 2 * oldsize)
4389 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004390 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 return -1;
4392 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 }
4394 return 0;
4395}
4396/* lookup the character, put the result in the output string and adjust
4397 various state variables. Return a new reference to the object that
4398 was put in the output buffer in *result, or Py_None, if the mapping was
4399 undefined (in which case no character was written).
4400 The called must decref result.
4401 Return 0 on success, -1 on error. */
4402static
Walter Dörwald4894c302003-10-24 14:25:28 +00004403int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004404 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004405 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406{
Walter Dörwald4894c302003-10-24 14:25:28 +00004407 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 return -1;
4409 if (*res==NULL) {
4410 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004411 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 }
4413 else if (*res==Py_None)
4414 ;
4415 else if (PyInt_Check(*res)) {
4416 /* no overflow check, because we know that the space is enough */
4417 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4418 }
4419 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004420 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 if (repsize==1) {
4422 /* no overflow check, because we know that the space is enough */
4423 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4424 }
4425 else if (repsize!=0) {
4426 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004427 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004428 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004429 repsize - 1;
4430 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 return -1;
4432 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4433 *outp += repsize;
4434 }
4435 }
4436 else
4437 return -1;
4438 return 0;
4439}
4440
4441PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 PyObject *mapping,
4444 const char *errors)
4445{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 /* output object */
4447 PyObject *res = NULL;
4448 /* pointers to the beginning and end+1 of input */
4449 const Py_UNICODE *startp = p;
4450 const Py_UNICODE *endp = p + size;
4451 /* pointer into the output */
4452 Py_UNICODE *str;
4453 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004454 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 char *reason = "character maps to <undefined>";
4456 PyObject *errorHandler = NULL;
4457 PyObject *exc = NULL;
4458 /* the following variable is used for caching string comparisons
4459 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4460 * 3=ignore, 4=xmlcharrefreplace */
4461 int known_errorHandler = -1;
4462
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 if (mapping == NULL) {
4464 PyErr_BadArgument();
4465 return NULL;
4466 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467
4468 /* allocate enough for a simple 1:1 translation without
4469 replacements, if we need more, we'll resize */
4470 res = PyUnicode_FromUnicode(NULL, size);
4471 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004472 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 return res;
4475 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 while (p<endp) {
4478 /* try to encode it */
4479 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004480 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 goto onError;
4483 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004484 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 if (x!=Py_None) /* it worked => adjust input pointer */
4486 ++p;
4487 else { /* untranslatable character */
4488 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004489 Py_ssize_t repsize;
4490 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 Py_UNICODE *uni2;
4492 /* startpos for collecting untranslatable chars */
4493 const Py_UNICODE *collstart = p;
4494 const Py_UNICODE *collend = p+1;
4495 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497 /* find all untranslatable characters */
4498 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004499 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 goto onError;
4501 Py_XDECREF(x);
4502 if (x!=Py_None)
4503 break;
4504 ++collend;
4505 }
4506 /* cache callback name lookup
4507 * (if not done yet, i.e. it's the first error) */
4508 if (known_errorHandler==-1) {
4509 if ((errors==NULL) || (!strcmp(errors, "strict")))
4510 known_errorHandler = 1;
4511 else if (!strcmp(errors, "replace"))
4512 known_errorHandler = 2;
4513 else if (!strcmp(errors, "ignore"))
4514 known_errorHandler = 3;
4515 else if (!strcmp(errors, "xmlcharrefreplace"))
4516 known_errorHandler = 4;
4517 else
4518 known_errorHandler = 0;
4519 }
4520 switch (known_errorHandler) {
4521 case 1: /* strict */
4522 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4523 goto onError;
4524 case 2: /* replace */
4525 /* No need to check for space, this is a 1:1 replacement */
4526 for (coll = collstart; coll<collend; ++coll)
4527 *str++ = '?';
4528 /* fall through */
4529 case 3: /* ignore */
4530 p = collend;
4531 break;
4532 case 4: /* xmlcharrefreplace */
4533 /* generate replacement (temporarily (mis)uses p) */
4534 for (p = collstart; p < collend; ++p) {
4535 char buffer[2+29+1+1];
4536 char *cp;
4537 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004538 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4540 goto onError;
4541 for (cp = buffer; *cp; ++cp)
4542 *str++ = *cp;
4543 }
4544 p = collend;
4545 break;
4546 default:
4547 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4548 reason, startp, size, &exc,
4549 collstart-startp, collend-startp, &newpos);
4550 if (repunicode == NULL)
4551 goto onError;
4552 /* generate replacement */
4553 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004554 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4556 Py_DECREF(repunicode);
4557 goto onError;
4558 }
4559 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4560 *str++ = *uni2;
4561 p = startp + newpos;
4562 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
4564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 /* Resize if we allocated to much */
4567 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004568 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004569 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004570 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 }
4572 Py_XDECREF(exc);
4573 Py_XDECREF(errorHandler);
4574 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 onError:
4577 Py_XDECREF(res);
4578 Py_XDECREF(exc);
4579 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580 return NULL;
4581}
4582
4583PyObject *PyUnicode_Translate(PyObject *str,
4584 PyObject *mapping,
4585 const char *errors)
4586{
4587 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004588
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589 str = PyUnicode_FromObject(str);
4590 if (str == NULL)
4591 goto onError;
4592 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4593 PyUnicode_GET_SIZE(str),
4594 mapping,
4595 errors);
4596 Py_DECREF(str);
4597 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004598
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 onError:
4600 Py_XDECREF(str);
4601 return NULL;
4602}
Tim Petersced69f82003-09-16 20:30:58 +00004603
Guido van Rossum9e896b32000-04-05 20:11:21 +00004604/* --- Decimal Encoder ---------------------------------------------------- */
4605
4606int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004607 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004608 char *output,
4609 const char *errors)
4610{
4611 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 PyObject *errorHandler = NULL;
4613 PyObject *exc = NULL;
4614 const char *encoding = "decimal";
4615 const char *reason = "invalid decimal Unicode string";
4616 /* the following variable is used for caching string comparisons
4617 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4618 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004619
4620 if (output == NULL) {
4621 PyErr_BadArgument();
4622 return -1;
4623 }
4624
4625 p = s;
4626 end = s + length;
4627 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004629 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004631 Py_ssize_t repsize;
4632 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 Py_UNICODE *uni2;
4634 Py_UNICODE *collstart;
4635 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004636
Guido van Rossum9e896b32000-04-05 20:11:21 +00004637 if (Py_UNICODE_ISSPACE(ch)) {
4638 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004640 continue;
4641 }
4642 decimal = Py_UNICODE_TODECIMAL(ch);
4643 if (decimal >= 0) {
4644 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004646 continue;
4647 }
Guido van Rossumba477042000-04-06 18:18:10 +00004648 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004649 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004651 continue;
4652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 /* All other characters are considered unencodable */
4654 collstart = p;
4655 collend = p+1;
4656 while (collend < end) {
4657 if ((0 < *collend && *collend < 256) ||
4658 !Py_UNICODE_ISSPACE(*collend) ||
4659 Py_UNICODE_TODECIMAL(*collend))
4660 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 /* cache callback name lookup
4663 * (if not done yet, i.e. it's the first error) */
4664 if (known_errorHandler==-1) {
4665 if ((errors==NULL) || (!strcmp(errors, "strict")))
4666 known_errorHandler = 1;
4667 else if (!strcmp(errors, "replace"))
4668 known_errorHandler = 2;
4669 else if (!strcmp(errors, "ignore"))
4670 known_errorHandler = 3;
4671 else if (!strcmp(errors, "xmlcharrefreplace"))
4672 known_errorHandler = 4;
4673 else
4674 known_errorHandler = 0;
4675 }
4676 switch (known_errorHandler) {
4677 case 1: /* strict */
4678 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4679 goto onError;
4680 case 2: /* replace */
4681 for (p = collstart; p < collend; ++p)
4682 *output++ = '?';
4683 /* fall through */
4684 case 3: /* ignore */
4685 p = collend;
4686 break;
4687 case 4: /* xmlcharrefreplace */
4688 /* generate replacement (temporarily (mis)uses p) */
4689 for (p = collstart; p < collend; ++p)
4690 output += sprintf(output, "&#%d;", (int)*p);
4691 p = collend;
4692 break;
4693 default:
4694 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4695 encoding, reason, s, length, &exc,
4696 collstart-s, collend-s, &newpos);
4697 if (repunicode == NULL)
4698 goto onError;
4699 /* generate replacement */
4700 repsize = PyUnicode_GET_SIZE(repunicode);
4701 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4702 Py_UNICODE ch = *uni2;
4703 if (Py_UNICODE_ISSPACE(ch))
4704 *output++ = ' ';
4705 else {
4706 decimal = Py_UNICODE_TODECIMAL(ch);
4707 if (decimal >= 0)
4708 *output++ = '0' + decimal;
4709 else if (0 < ch && ch < 256)
4710 *output++ = (char)ch;
4711 else {
4712 Py_DECREF(repunicode);
4713 raise_encode_exception(&exc, encoding,
4714 s, length, collstart-s, collend-s, reason);
4715 goto onError;
4716 }
4717 }
4718 }
4719 p = s + newpos;
4720 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004721 }
4722 }
4723 /* 0-terminate the output string */
4724 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 Py_XDECREF(exc);
4726 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004727 return 0;
4728
4729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 Py_XDECREF(exc);
4731 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004732 return -1;
4733}
4734
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735/* --- Helpers ------------------------------------------------------------ */
4736
Thomas Wouters477c8d52006-05-27 19:21:47 +00004737#define STRINGLIB_CHAR Py_UNICODE
4738
4739#define STRINGLIB_LEN PyUnicode_GET_SIZE
4740#define STRINGLIB_NEW PyUnicode_FromUnicode
4741#define STRINGLIB_STR PyUnicode_AS_UNICODE
4742
4743Py_LOCAL_INLINE(int)
4744STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004746 if (str[0] != other[0])
4747 return 1;
4748 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749}
4750
Thomas Wouters477c8d52006-05-27 19:21:47 +00004751#define STRINGLIB_EMPTY unicode_empty
4752
4753#include "stringlib/fastsearch.h"
4754
4755#include "stringlib/count.h"
4756#include "stringlib/find.h"
4757#include "stringlib/partition.h"
4758
/* helper macro to fixup start/end slice values

   Operates on the variables `start` and `end` that must be in scope at
   the point of use.  A negative value is offset by (obj)->length and then
   clamped to 0; `end` is additionally capped at (obj)->length.

   The body is wrapped in do { } while (0) so the macro expands to exactly
   one statement and is safe as the body of an unbraced if/else.  Invoke it
   with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4771
Martin v. Löwis18e16552006-02-15 17:27:45 +00004772Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004773 PyObject *substr,
4774 Py_ssize_t start,
4775 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004777 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004778 PyUnicodeObject* str_obj;
4779 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004780
Thomas Wouters477c8d52006-05-27 19:21:47 +00004781 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4782 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004784 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4785 if (!sub_obj) {
4786 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787 return -1;
4788 }
Tim Petersced69f82003-09-16 20:30:58 +00004789
Thomas Wouters477c8d52006-05-27 19:21:47 +00004790 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004791
Thomas Wouters477c8d52006-05-27 19:21:47 +00004792 result = stringlib_count(
4793 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4794 );
4795
4796 Py_DECREF(sub_obj);
4797 Py_DECREF(str_obj);
4798
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 return result;
4800}
4801
Martin v. Löwis18e16552006-02-15 17:27:45 +00004802Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004803 PyObject *sub,
4804 Py_ssize_t start,
4805 Py_ssize_t end,
4806 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004808 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004809
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004811 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004812 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004813 sub = PyUnicode_FromObject(sub);
4814 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004815 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004816 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 }
Tim Petersced69f82003-09-16 20:30:58 +00004818
Thomas Wouters477c8d52006-05-27 19:21:47 +00004819 if (direction > 0)
4820 result = stringlib_find_slice(
4821 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4822 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4823 start, end
4824 );
4825 else
4826 result = stringlib_rfind_slice(
4827 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4828 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4829 start, end
4830 );
4831
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004833 Py_DECREF(sub);
4834
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 return result;
4836}
4837
Tim Petersced69f82003-09-16 20:30:58 +00004838static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839int tailmatch(PyUnicodeObject *self,
4840 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004841 Py_ssize_t start,
4842 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 int direction)
4844{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 if (substring->length == 0)
4846 return 1;
4847
Thomas Wouters477c8d52006-05-27 19:21:47 +00004848 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849
4850 end -= substring->length;
4851 if (end < start)
4852 return 0;
4853
4854 if (direction > 0) {
4855 if (Py_UNICODE_MATCH(self, end, substring))
4856 return 1;
4857 } else {
4858 if (Py_UNICODE_MATCH(self, start, substring))
4859 return 1;
4860 }
4861
4862 return 0;
4863}
4864
Martin v. Löwis18e16552006-02-15 17:27:45 +00004865Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004867 Py_ssize_t start,
4868 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 int direction)
4870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004872
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 str = PyUnicode_FromObject(str);
4874 if (str == NULL)
4875 return -1;
4876 substr = PyUnicode_FromObject(substr);
4877 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004878 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 return -1;
4880 }
Tim Petersced69f82003-09-16 20:30:58 +00004881
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 result = tailmatch((PyUnicodeObject *)str,
4883 (PyUnicodeObject *)substr,
4884 start, end, direction);
4885 Py_DECREF(str);
4886 Py_DECREF(substr);
4887 return result;
4888}
4889
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890/* Apply fixfct filter to the Unicode object self and return a
4891 reference to the modified object */
4892
Tim Petersced69f82003-09-16 20:30:58 +00004893static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894PyObject *fixup(PyUnicodeObject *self,
4895 int (*fixfct)(PyUnicodeObject *s))
4896{
4897
4898 PyUnicodeObject *u;
4899
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004900 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 if (u == NULL)
4902 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004903
4904 Py_UNICODE_COPY(u->str, self->str, self->length);
4905
Tim Peters7a29bd52001-09-12 03:03:31 +00004906 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 /* fixfct should return TRUE if it modified the buffer. If
4908 FALSE, return a reference to the original buffer instead
4909 (to save space, not time) */
4910 Py_INCREF(self);
4911 Py_DECREF(u);
4912 return (PyObject*) self;
4913 }
4914 return (PyObject*) u;
4915}
4916
Tim Petersced69f82003-09-16 20:30:58 +00004917static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918int fixupper(PyUnicodeObject *self)
4919{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004920 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 Py_UNICODE *s = self->str;
4922 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004923
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 while (len-- > 0) {
4925 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004926
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 ch = Py_UNICODE_TOUPPER(*s);
4928 if (ch != *s) {
4929 status = 1;
4930 *s = ch;
4931 }
4932 s++;
4933 }
4934
4935 return status;
4936}
4937
Tim Petersced69f82003-09-16 20:30:58 +00004938static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939int fixlower(PyUnicodeObject *self)
4940{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942 Py_UNICODE *s = self->str;
4943 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004944
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 while (len-- > 0) {
4946 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004947
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 ch = Py_UNICODE_TOLOWER(*s);
4949 if (ch != *s) {
4950 status = 1;
4951 *s = ch;
4952 }
4953 s++;
4954 }
4955
4956 return status;
4957}
4958
Tim Petersced69f82003-09-16 20:30:58 +00004959static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960int fixswapcase(PyUnicodeObject *self)
4961{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004962 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 Py_UNICODE *s = self->str;
4964 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004965
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966 while (len-- > 0) {
4967 if (Py_UNICODE_ISUPPER(*s)) {
4968 *s = Py_UNICODE_TOLOWER(*s);
4969 status = 1;
4970 } else if (Py_UNICODE_ISLOWER(*s)) {
4971 *s = Py_UNICODE_TOUPPER(*s);
4972 status = 1;
4973 }
4974 s++;
4975 }
4976
4977 return status;
4978}
4979
Tim Petersced69f82003-09-16 20:30:58 +00004980static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981int fixcapitalize(PyUnicodeObject *self)
4982{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004983 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004984 Py_UNICODE *s = self->str;
4985 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004986
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004987 if (len == 0)
4988 return 0;
4989 if (Py_UNICODE_ISLOWER(*s)) {
4990 *s = Py_UNICODE_TOUPPER(*s);
4991 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004993 s++;
4994 while (--len > 0) {
4995 if (Py_UNICODE_ISUPPER(*s)) {
4996 *s = Py_UNICODE_TOLOWER(*s);
4997 status = 1;
4998 }
4999 s++;
5000 }
5001 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002}
5003
5004static
5005int fixtitle(PyUnicodeObject *self)
5006{
5007 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5008 register Py_UNICODE *e;
5009 int previous_is_cased;
5010
5011 /* Shortcut for single character strings */
5012 if (PyUnicode_GET_SIZE(self) == 1) {
5013 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5014 if (*p != ch) {
5015 *p = ch;
5016 return 1;
5017 }
5018 else
5019 return 0;
5020 }
Tim Petersced69f82003-09-16 20:30:58 +00005021
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 e = p + PyUnicode_GET_SIZE(self);
5023 previous_is_cased = 0;
5024 for (; p < e; p++) {
5025 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005026
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 if (previous_is_cased)
5028 *p = Py_UNICODE_TOLOWER(ch);
5029 else
5030 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005031
5032 if (Py_UNICODE_ISLOWER(ch) ||
5033 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 Py_UNICODE_ISTITLE(ch))
5035 previous_is_cased = 1;
5036 else
5037 previous_is_cased = 0;
5038 }
5039 return 1;
5040}
5041
Tim Peters8ce9f162004-08-27 01:49:32 +00005042PyObject *
5043PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044{
Tim Peters8ce9f162004-08-27 01:49:32 +00005045 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005046 const Py_UNICODE blank = ' ';
5047 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005048 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005049 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005050 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5051 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005052 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5053 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005054 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005055 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005056 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057
Tim Peters05eba1f2004-08-27 21:32:02 +00005058 fseq = PySequence_Fast(seq, "");
5059 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005060 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005061 }
5062
Tim Peters91879ab2004-08-27 22:35:44 +00005063 /* Grrrr. A codec may be invoked to convert str objects to
5064 * Unicode, and so it's possible to call back into Python code
5065 * during PyUnicode_FromObject(), and so it's possible for a sick
5066 * codec to change the size of fseq (if seq is a list). Therefore
5067 * we have to keep refetching the size -- can't assume seqlen
5068 * is invariant.
5069 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005070 seqlen = PySequence_Fast_GET_SIZE(fseq);
5071 /* If empty sequence, return u"". */
5072 if (seqlen == 0) {
5073 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5074 goto Done;
5075 }
5076 /* If singleton sequence with an exact Unicode, return that. */
5077 if (seqlen == 1) {
5078 item = PySequence_Fast_GET_ITEM(fseq, 0);
5079 if (PyUnicode_CheckExact(item)) {
5080 Py_INCREF(item);
5081 res = (PyUnicodeObject *)item;
5082 goto Done;
5083 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005084 }
5085
Tim Peters05eba1f2004-08-27 21:32:02 +00005086 /* At least two items to join, or one that isn't exact Unicode. */
5087 if (seqlen > 1) {
5088 /* Set up sep and seplen -- they're needed. */
5089 if (separator == NULL) {
5090 sep = &blank;
5091 seplen = 1;
5092 }
5093 else {
5094 internal_separator = PyUnicode_FromObject(separator);
5095 if (internal_separator == NULL)
5096 goto onError;
5097 sep = PyUnicode_AS_UNICODE(internal_separator);
5098 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005099 /* In case PyUnicode_FromObject() mutated seq. */
5100 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005101 }
5102 }
5103
5104 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005105 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005106 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005107 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005108 res_p = PyUnicode_AS_UNICODE(res);
5109 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005110
Tim Peters05eba1f2004-08-27 21:32:02 +00005111 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005112 Py_ssize_t itemlen;
5113 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005114
5115 item = PySequence_Fast_GET_ITEM(fseq, i);
5116 /* Convert item to Unicode. */
5117 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5118 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005119 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005120 " %.80s found",
5121 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005122 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005123 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005124 item = PyUnicode_FromObject(item);
5125 if (item == NULL)
5126 goto onError;
5127 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005128
Tim Peters91879ab2004-08-27 22:35:44 +00005129 /* In case PyUnicode_FromObject() mutated seq. */
5130 seqlen = PySequence_Fast_GET_SIZE(fseq);
5131
Tim Peters8ce9f162004-08-27 01:49:32 +00005132 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005134 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005135 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005136 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005137 if (i < seqlen - 1) {
5138 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005139 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005140 goto Overflow;
5141 }
5142 if (new_res_used > res_alloc) {
5143 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005144 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005145 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005146 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005147 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005148 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005149 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005150 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005152 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005153 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005155
5156 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005157 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005158 res_p += itemlen;
5159 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005160 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005161 res_p += seplen;
5162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005164 res_used = new_res_used;
5165 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005166
Tim Peters05eba1f2004-08-27 21:32:02 +00005167 /* Shrink res to match the used area; this probably can't fail,
5168 * but it's cheap to check.
5169 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005170 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005171 goto onError;
5172
5173 Done:
5174 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005175 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 return (PyObject *)res;
5177
Tim Peters8ce9f162004-08-27 01:49:32 +00005178 Overflow:
5179 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005180 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005181 Py_DECREF(item);
5182 /* fall through */
5183
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005185 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005186 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005187 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 return NULL;
5189}
5190
Tim Petersced69f82003-09-16 20:30:58 +00005191static
5192PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 Py_ssize_t left,
5194 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 Py_UNICODE fill)
5196{
5197 PyUnicodeObject *u;
5198
5199 if (left < 0)
5200 left = 0;
5201 if (right < 0)
5202 right = 0;
5203
Tim Peters7a29bd52001-09-12 03:03:31 +00005204 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 Py_INCREF(self);
5206 return self;
5207 }
5208
5209 u = _PyUnicode_New(left + self->length + right);
5210 if (u) {
5211 if (left)
5212 Py_UNICODE_FILL(u->str, fill, left);
5213 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5214 if (right)
5215 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5216 }
5217
5218 return u;
5219}
5220
/* Append data[left:right] as a new unicode object to `list`.

   Relies on three names at the point of use: a PyObject* variable `str`
   (scratch), the target `list`, and a label `onError` that is jumped to
   when creating or appending the slice fails.

   Wrapped in do { } while (0) so it expands to a single statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5231
5232static
5233PyObject *split_whitespace(PyUnicodeObject *self,
5234 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 register Py_ssize_t i;
5238 register Py_ssize_t j;
5239 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 PyObject *str;
5241
5242 for (i = j = 0; i < len; ) {
5243 /* find a token */
5244 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5245 i++;
5246 j = i;
5247 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5248 i++;
5249 if (j < i) {
5250 if (maxcount-- <= 0)
5251 break;
5252 SPLIT_APPEND(self->str, j, i);
5253 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5254 i++;
5255 j = i;
5256 }
5257 }
5258 if (j < len) {
5259 SPLIT_APPEND(self->str, j, len);
5260 }
5261 return list;
5262
5263 onError:
5264 Py_DECREF(list);
5265 return NULL;
5266}
5267
5268PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005269 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005271 register Py_ssize_t i;
5272 register Py_ssize_t j;
5273 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 PyObject *list;
5275 PyObject *str;
5276 Py_UNICODE *data;
5277
5278 string = PyUnicode_FromObject(string);
5279 if (string == NULL)
5280 return NULL;
5281 data = PyUnicode_AS_UNICODE(string);
5282 len = PyUnicode_GET_SIZE(string);
5283
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 list = PyList_New(0);
5285 if (!list)
5286 goto onError;
5287
5288 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005289 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005290
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005292 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294
5295 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005296 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 if (i < len) {
5298 if (data[i] == '\r' && i + 1 < len &&
5299 data[i+1] == '\n')
5300 i += 2;
5301 else
5302 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005303 if (keepends)
5304 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 }
Guido van Rossum86662912000-04-11 15:38:46 +00005306 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 j = i;
5308 }
5309 if (j < len) {
5310 SPLIT_APPEND(data, j, len);
5311 }
5312
5313 Py_DECREF(string);
5314 return list;
5315
5316 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005317 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 Py_DECREF(string);
5319 return NULL;
5320}
5321
Tim Petersced69f82003-09-16 20:30:58 +00005322static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323PyObject *split_char(PyUnicodeObject *self,
5324 PyObject *list,
5325 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005328 register Py_ssize_t i;
5329 register Py_ssize_t j;
5330 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 PyObject *str;
5332
5333 for (i = j = 0; i < len; ) {
5334 if (self->str[i] == ch) {
5335 if (maxcount-- <= 0)
5336 break;
5337 SPLIT_APPEND(self->str, j, i);
5338 i = j = i + 1;
5339 } else
5340 i++;
5341 }
5342 if (j <= len) {
5343 SPLIT_APPEND(self->str, j, len);
5344 }
5345 return list;
5346
5347 onError:
5348 Py_DECREF(list);
5349 return NULL;
5350}
5351
Tim Petersced69f82003-09-16 20:30:58 +00005352static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353PyObject *split_substring(PyUnicodeObject *self,
5354 PyObject *list,
5355 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005356 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005358 register Py_ssize_t i;
5359 register Py_ssize_t j;
5360 Py_ssize_t len = self->length;
5361 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 PyObject *str;
5363
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005364 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 if (Py_UNICODE_MATCH(self, i, substring)) {
5366 if (maxcount-- <= 0)
5367 break;
5368 SPLIT_APPEND(self->str, j, i);
5369 i = j = i + sublen;
5370 } else
5371 i++;
5372 }
5373 if (j <= len) {
5374 SPLIT_APPEND(self->str, j, len);
5375 }
5376 return list;
5377
5378 onError:
5379 Py_DECREF(list);
5380 return NULL;
5381}
5382
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005383static
5384PyObject *rsplit_whitespace(PyUnicodeObject *self,
5385 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005387{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005388 register Py_ssize_t i;
5389 register Py_ssize_t j;
5390 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005391 PyObject *str;
5392
5393 for (i = j = len - 1; i >= 0; ) {
5394 /* find a token */
5395 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5396 i--;
5397 j = i;
5398 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5399 i--;
5400 if (j > i) {
5401 if (maxcount-- <= 0)
5402 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005403 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005404 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5405 i--;
5406 j = i;
5407 }
5408 }
5409 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005411 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005412 if (PyList_Reverse(list) < 0)
5413 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005414 return list;
5415
5416 onError:
5417 Py_DECREF(list);
5418 return NULL;
5419}
5420
5421static
5422PyObject *rsplit_char(PyUnicodeObject *self,
5423 PyObject *list,
5424 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005425 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005426{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005427 register Py_ssize_t i;
5428 register Py_ssize_t j;
5429 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005430 PyObject *str;
5431
5432 for (i = j = len - 1; i >= 0; ) {
5433 if (self->str[i] == ch) {
5434 if (maxcount-- <= 0)
5435 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005436 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005437 j = i = i - 1;
5438 } else
5439 i--;
5440 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005441 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005442 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005443 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005444 if (PyList_Reverse(list) < 0)
5445 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005446 return list;
5447
5448 onError:
5449 Py_DECREF(list);
5450 return NULL;
5451}
5452
5453static
5454PyObject *rsplit_substring(PyUnicodeObject *self,
5455 PyObject *list,
5456 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005457 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005458{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005459 register Py_ssize_t i;
5460 register Py_ssize_t j;
5461 Py_ssize_t len = self->length;
5462 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005463 PyObject *str;
5464
5465 for (i = len - sublen, j = len; i >= 0; ) {
5466 if (Py_UNICODE_MATCH(self, i, substring)) {
5467 if (maxcount-- <= 0)
5468 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005469 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005470 j = i;
5471 i -= sublen;
5472 } else
5473 i--;
5474 }
5475 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005476 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005477 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005478 if (PyList_Reverse(list) < 0)
5479 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005480 return list;
5481
5482 onError:
5483 Py_DECREF(list);
5484 return NULL;
5485}
5486
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487#undef SPLIT_APPEND
5488
5489static
5490PyObject *split(PyUnicodeObject *self,
5491 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005492 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493{
5494 PyObject *list;
5495
5496 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005497 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498
5499 list = PyList_New(0);
5500 if (!list)
5501 return NULL;
5502
5503 if (substring == NULL)
5504 return split_whitespace(self,list,maxcount);
5505
5506 else if (substring->length == 1)
5507 return split_char(self,list,substring->str[0],maxcount);
5508
5509 else if (substring->length == 0) {
5510 Py_DECREF(list);
5511 PyErr_SetString(PyExc_ValueError, "empty separator");
5512 return NULL;
5513 }
5514 else
5515 return split_substring(self,list,substring,maxcount);
5516}
5517
Tim Petersced69f82003-09-16 20:30:58 +00005518static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005519PyObject *rsplit(PyUnicodeObject *self,
5520 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005521 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005522{
5523 PyObject *list;
5524
5525 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005526 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005527
5528 list = PyList_New(0);
5529 if (!list)
5530 return NULL;
5531
5532 if (substring == NULL)
5533 return rsplit_whitespace(self,list,maxcount);
5534
5535 else if (substring->length == 1)
5536 return rsplit_char(self,list,substring->str[0],maxcount);
5537
5538 else if (substring->length == 0) {
5539 Py_DECREF(list);
5540 PyErr_SetString(PyExc_ValueError, "empty separator");
5541 return NULL;
5542 }
5543 else
5544 return rsplit_substring(self,list,substring,maxcount);
5545}
5546
5547static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548PyObject *replace(PyUnicodeObject *self,
5549 PyUnicodeObject *str1,
5550 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005551 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552{
5553 PyUnicodeObject *u;
5554
5555 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005556 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
Thomas Wouters477c8d52006-05-27 19:21:47 +00005558 if (str1->length == str2->length) {
5559 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005560 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005561 if (str1->length == 1) {
5562 /* replace characters */
5563 Py_UNICODE u1, u2;
5564 if (!findchar(self->str, self->length, str1->str[0]))
5565 goto nothing;
5566 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5567 if (!u)
5568 return NULL;
5569 Py_UNICODE_COPY(u->str, self->str, self->length);
5570 u1 = str1->str[0];
5571 u2 = str2->str[0];
5572 for (i = 0; i < u->length; i++)
5573 if (u->str[i] == u1) {
5574 if (--maxcount < 0)
5575 break;
5576 u->str[i] = u2;
5577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005579 i = fastsearch(
5580 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005582 if (i < 0)
5583 goto nothing;
5584 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5585 if (!u)
5586 return NULL;
5587 Py_UNICODE_COPY(u->str, self->str, self->length);
5588 while (i <= self->length - str1->length)
5589 if (Py_UNICODE_MATCH(self, i, str1)) {
5590 if (--maxcount < 0)
5591 break;
5592 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5593 i += str1->length;
5594 } else
5595 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005598
5599 Py_ssize_t n, i, j, e;
5600 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 Py_UNICODE *p;
5602
5603 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005604 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 if (n > maxcount)
5606 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005607 if (n == 0)
5608 goto nothing;
5609 /* new_size = self->length + n * (str2->length - str1->length)); */
5610 delta = (str2->length - str1->length);
5611 if (delta == 0) {
5612 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005614 product = n * (str2->length - str1->length);
5615 if ((product / (str2->length - str1->length)) != n) {
5616 PyErr_SetString(PyExc_OverflowError,
5617 "replace string is too long");
5618 return NULL;
5619 }
5620 new_size = self->length + product;
5621 if (new_size < 0) {
5622 PyErr_SetString(PyExc_OverflowError,
5623 "replace string is too long");
5624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 }
5626 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005627 u = _PyUnicode_New(new_size);
5628 if (!u)
5629 return NULL;
5630 i = 0;
5631 p = u->str;
5632 e = self->length - str1->length;
5633 if (str1->length > 0) {
5634 while (n-- > 0) {
5635 /* look for next match */
5636 j = i;
5637 while (j <= e) {
5638 if (Py_UNICODE_MATCH(self, j, str1))
5639 break;
5640 j++;
5641 }
5642 if (j > i) {
5643 if (j > e)
5644 break;
5645 /* copy unchanged part [i:j] */
5646 Py_UNICODE_COPY(p, self->str+i, j-i);
5647 p += j - i;
5648 }
5649 /* copy substitution string */
5650 if (str2->length > 0) {
5651 Py_UNICODE_COPY(p, str2->str, str2->length);
5652 p += str2->length;
5653 }
5654 i = j + str1->length;
5655 }
5656 if (i < self->length)
5657 /* copy tail [i:] */
5658 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5659 } else {
5660 /* interleave */
5661 while (n > 0) {
5662 Py_UNICODE_COPY(p, str2->str, str2->length);
5663 p += str2->length;
5664 if (--n <= 0)
5665 break;
5666 *p++ = self->str[i++];
5667 }
5668 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005672
5673nothing:
5674 /* nothing to replace; return original string (when possible) */
5675 if (PyUnicode_CheckExact(self)) {
5676 Py_INCREF(self);
5677 return (PyObject *) self;
5678 }
5679 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680}
5681
5682/* --- Unicode Object Methods --------------------------------------------- */
5683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005684PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685"S.title() -> unicode\n\
5686\n\
5687Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005688characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005691unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 return fixup(self, fixtitle);
5694}
5695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005696PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697"S.capitalize() -> unicode\n\
5698\n\
5699Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005700have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701
5702static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005703unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return fixup(self, fixcapitalize);
5706}
5707
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   capitalize every word in place inside the list, then join the list
   via PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Break the string into whitespace-separated words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized form */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *previous = PyList_GET_ITEM(words, k);

        result = fixup((PyUnicodeObject *)previous, fixcapitalize);
        if (result == NULL)
            goto onError;
        /* drop the list's reference to the old word before overwriting */
        Py_DECREF(previous);
        PyList_SET_ITEM(words, k, result);
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

onError:
    /* result is NULL here when a capitalization step failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5745
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005746/* Argument converter. Coerces to a single unicode character */
5747
5748static int
5749convert_uc(PyObject *obj, void *addr)
5750{
5751 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5752 PyObject *uniobj;
5753 Py_UNICODE *unistr;
5754
5755 uniobj = PyUnicode_FromObject(obj);
5756 if (uniobj == NULL) {
5757 PyErr_SetString(PyExc_TypeError,
5758 "The fill character cannot be converted to Unicode");
5759 return 0;
5760 }
5761 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5762 PyErr_SetString(PyExc_TypeError,
5763 "The fill character must be exactly one character long");
5764 Py_DECREF(uniobj);
5765 return 0;
5766 }
5767 unistr = PyUnicode_AS_UNICODE(uniobj);
5768 *fillcharloc = unistr[0];
5769 Py_DECREF(uniobj);
5770 return 1;
5771}
5772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005773PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005774"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005776Return S centered in a Unicode string of length width. Padding is\n\
5777done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778
5779static PyObject *
5780unicode_center(PyUnicodeObject *self, PyObject *args)
5781{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005782 Py_ssize_t marg, left;
5783 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005784 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785
Thomas Woutersde017742006-02-16 19:34:37 +00005786 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 return NULL;
5788
Tim Peters7a29bd52001-09-12 03:03:31 +00005789 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 Py_INCREF(self);
5791 return (PyObject*) self;
5792 }
5793
5794 marg = width - self->length;
5795 left = marg / 2 + (marg & width & 1);
5796
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005797 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798}
5799
Marc-André Lemburge5034372000-08-08 08:04:29 +00005800#if 0
5801
5802/* This code should go into some future Unicode collation support
5803 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005804 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005805
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005806/* speedy UTF-16 code point order comparison */
5807/* gleaned from: */
5808/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5809
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005810static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005811{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005812 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005813 0, 0, 0, 0, 0, 0, 0, 0,
5814 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005815 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005816};
5817
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818static int
5819unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5820{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005821 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005822
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 Py_UNICODE *s1 = str1->str;
5824 Py_UNICODE *s2 = str2->str;
5825
5826 len1 = str1->length;
5827 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005828
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005830 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005831
5832 c1 = *s1++;
5833 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005834
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005835 if (c1 > (1<<11) * 26)
5836 c1 += utf16Fixup[c1>>11];
5837 if (c2 > (1<<11) * 26)
5838 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005839 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005840
5841 if (c1 != c2)
5842 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005843
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005844 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 }
5846
5847 return (len1 < len2) ? -1 : (len1 != len2);
5848}
5849
Marc-André Lemburge5034372000-08-08 08:04:29 +00005850#else
5851
5852static int
5853unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5854{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005855 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005856
5857 Py_UNICODE *s1 = str1->str;
5858 Py_UNICODE *s2 = str2->str;
5859
5860 len1 = str1->length;
5861 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005862
Marc-André Lemburge5034372000-08-08 08:04:29 +00005863 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005864 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005865
Fredrik Lundh45714e92001-06-26 16:39:36 +00005866 c1 = *s1++;
5867 c2 = *s2++;
5868
5869 if (c1 != c2)
5870 return (c1 < c2) ? -1 : 1;
5871
Marc-André Lemburge5034372000-08-08 08:04:29 +00005872 len1--; len2--;
5873 }
5874
5875 return (len1 < len2) ? -1 : (len1 != len2);
5876}
5877
5878#endif
5879
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880int PyUnicode_Compare(PyObject *left,
5881 PyObject *right)
5882{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005883 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5884 return unicode_compare((PyUnicodeObject *)left,
5885 (PyUnicodeObject *)right);
5886 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5887 (PyUnicode_Check(left) && PyString_Check(right))) {
5888 if (PyUnicode_Check(left))
5889 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5890 if (PyUnicode_Check(right))
5891 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5892 assert(PyString_Check(left));
5893 assert(PyString_Check(right));
5894 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005896 PyErr_Format(PyExc_TypeError,
5897 "Can't compare %.100s and %.100s",
5898 left->ob_type->tp_name,
5899 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 return -1;
5901}
5902
Martin v. Löwis5b222132007-06-10 09:51:05 +00005903int
5904PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5905{
5906 int i;
5907 Py_UNICODE *id;
5908 assert(PyUnicode_Check(uni));
5909 id = PyUnicode_AS_UNICODE(uni);
5910 /* Compare Unicode string and source character set string */
5911 for (i = 0; id[i] && str[i]; i++)
5912 if (id[i] != str[i])
5913 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5914 if (id[i])
5915 return 1; /* uni is longer */
5916 if (str[i])
5917 return -1; /* str is longer */
5918 return 0;
5919}
5920
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005921PyObject *PyUnicode_RichCompare(PyObject *left,
5922 PyObject *right,
5923 int op)
5924{
5925 int result;
5926
5927 result = PyUnicode_Compare(left, right);
5928 if (result == -1 && PyErr_Occurred())
5929 goto onError;
5930
5931 /* Convert the return value to a Boolean */
5932 switch (op) {
5933 case Py_EQ:
5934 result = (result == 0);
5935 break;
5936 case Py_NE:
5937 result = (result != 0);
5938 break;
5939 case Py_LE:
5940 result = (result <= 0);
5941 break;
5942 case Py_GE:
5943 result = (result >= 0);
5944 break;
5945 case Py_LT:
5946 result = (result == -1);
5947 break;
5948 case Py_GT:
5949 result = (result == 1);
5950 break;
5951 }
5952 return PyBool_FromLong(result);
5953
5954 onError:
5955
5956 /* Standard case
5957
5958 Type errors mean that PyUnicode_FromObject() could not convert
5959 one of the arguments (usually the right hand side) to Unicode,
5960 ie. we can't handle the comparison request. However, it is
5961 possible that the other object knows a comparison method, which
5962 is why we return Py_NotImplemented to give the other object a
5963 chance.
5964
5965 */
5966 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5967 PyErr_Clear();
5968 Py_INCREF(Py_NotImplemented);
5969 return Py_NotImplemented;
5970 }
5971 if (op != Py_EQ && op != Py_NE)
5972 return NULL;
5973
5974 /* Equality comparison.
5975
5976 This is a special case: we silence any PyExc_UnicodeDecodeError
5977 and instead turn it into a PyErr_UnicodeWarning.
5978
5979 */
5980 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5981 return NULL;
5982 PyErr_Clear();
5983 if (PyErr_Warn(PyExc_UnicodeWarning,
5984 (op == Py_EQ) ?
5985 "Unicode equal comparison "
5986 "failed to convert both arguments to Unicode - "
5987 "interpreting them as being unequal" :
5988 "Unicode unequal comparison "
5989 "failed to convert both arguments to Unicode - "
5990 "interpreting them as being unequal"
5991 ) < 0)
5992 return NULL;
5993 result = (op == Py_NE);
5994 return PyBool_FromLong(result);
5995}
5996
Guido van Rossum403d68b2000-03-13 15:55:09 +00005997int PyUnicode_Contains(PyObject *container,
5998 PyObject *element)
5999{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006000 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006001 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006002
6003 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006004 sub = PyUnicode_FromObject(element);
6005 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006006 PyErr_Format(PyExc_TypeError,
6007 "'in <string>' requires string as left operand, not %s",
6008 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006009 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006010 }
6011
Thomas Wouters477c8d52006-05-27 19:21:47 +00006012 str = PyUnicode_FromObject(container);
6013 if (!str) {
6014 Py_DECREF(sub);
6015 return -1;
6016 }
6017
6018 result = stringlib_contains_obj(str, sub);
6019
6020 Py_DECREF(str);
6021 Py_DECREF(sub);
6022
Guido van Rossum403d68b2000-03-13 15:55:09 +00006023 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006024}
6025
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026/* Concat to string or Unicode object giving a new Unicode object. */
6027
6028PyObject *PyUnicode_Concat(PyObject *left,
6029 PyObject *right)
6030{
6031 PyUnicodeObject *u = NULL, *v = NULL, *w;
6032
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006033 if (PyBytes_Check(left) || PyBytes_Check(right))
6034 return PyBytes_Concat(left, right);
6035
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 /* Coerce the two arguments */
6037 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6038 if (u == NULL)
6039 goto onError;
6040 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6041 if (v == NULL)
6042 goto onError;
6043
6044 /* Shortcuts */
6045 if (v == unicode_empty) {
6046 Py_DECREF(v);
6047 return (PyObject *)u;
6048 }
6049 if (u == unicode_empty) {
6050 Py_DECREF(u);
6051 return (PyObject *)v;
6052 }
6053
6054 /* Concat the two Unicode strings */
6055 w = _PyUnicode_New(u->length + v->length);
6056 if (w == NULL)
6057 goto onError;
6058 Py_UNICODE_COPY(w->str, u->str, u->length);
6059 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6060
6061 Py_DECREF(u);
6062 Py_DECREF(v);
6063 return (PyObject *)w;
6064
6065onError:
6066 Py_XDECREF(u);
6067 Py_XDECREF(v);
6068 return NULL;
6069}
6070
Walter Dörwald1ab83302007-05-18 17:15:44 +00006071void
6072PyUnicode_Append(PyObject **pleft, PyObject *right)
6073{
6074 PyObject *new;
6075 if (*pleft == NULL)
6076 return;
6077 if (right == NULL || !PyUnicode_Check(*pleft)) {
6078 Py_DECREF(*pleft);
6079 *pleft = NULL;
6080 return;
6081 }
6082 new = PyUnicode_Concat(*pleft, right);
6083 Py_DECREF(*pleft);
6084 *pleft = new;
6085}
6086
6087void
6088PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6089{
6090 PyUnicode_Append(pleft, right);
6091 Py_XDECREF(right);
6092}
6093
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006094PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095"S.count(sub[, start[, end]]) -> int\n\
6096\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006097Return the number of non-overlapping occurrences of substring sub in\n\
6098Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006099interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
6101static PyObject *
6102unicode_count(PyUnicodeObject *self, PyObject *args)
6103{
6104 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006105 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006106 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 PyObject *result;
6108
Guido van Rossumb8872e62000-05-09 14:14:27 +00006109 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6110 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 return NULL;
6112
6113 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006114 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 if (substring == NULL)
6116 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006117
Thomas Wouters477c8d52006-05-27 19:21:47 +00006118 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119
Thomas Wouters477c8d52006-05-27 19:21:47 +00006120 result = PyInt_FromSsize_t(
6121 stringlib_count(self->str + start, end - start,
6122 substring->str, substring->length)
6123 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
6125 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006126
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 return result;
6128}
6129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006130PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006131"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006133Encodes S using the codec registered for encoding. encoding defaults\n\
6134to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006135handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6137'xmlcharrefreplace' as well as any other name registered with\n\
6138codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
6140static PyObject *
6141unicode_encode(PyUnicodeObject *self, PyObject *args)
6142{
6143 char *encoding = NULL;
6144 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006145 PyObject *v;
6146
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6148 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006149 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006150 if (v == NULL)
6151 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006152 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006153 if (PyString_Check(v)) {
6154 /* Old codec, turn it into bytes */
6155 PyObject *b = PyBytes_FromObject(v);
6156 Py_DECREF(v);
6157 return b;
6158 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006159 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006160 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006161 "(type=%.400s)",
6162 v->ob_type->tp_name);
6163 Py_DECREF(v);
6164 return NULL;
6165 }
6166 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006167
6168 onError:
6169 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006170}
6171
6172PyDoc_STRVAR(decode__doc__,
6173"S.decode([encoding[,errors]]) -> string or unicode\n\
6174\n\
6175Decodes S using the codec registered for encoding. encoding defaults\n\
6176to the default encoding. errors may be given to set a different error\n\
6177handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6178a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6179as well as any other name registerd with codecs.register_error that is\n\
6180able to handle UnicodeDecodeErrors.");
6181
6182static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006183unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006184{
6185 char *encoding = NULL;
6186 char *errors = NULL;
6187 PyObject *v;
6188
6189 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6190 return NULL;
6191 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006192 if (v == NULL)
6193 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006194 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6195 PyErr_Format(PyExc_TypeError,
6196 "decoder did not return a string/unicode object "
6197 "(type=%.400s)",
6198 v->ob_type->tp_name);
6199 Py_DECREF(v);
6200 return NULL;
6201 }
6202 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006203
6204 onError:
6205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206}
6207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006208PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209"S.expandtabs([tabsize]) -> unicode\n\
6210\n\
6211Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006212If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
6214static PyObject*
6215unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6216{
6217 Py_UNICODE *e;
6218 Py_UNICODE *p;
6219 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006220 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 PyUnicodeObject *u;
6222 int tabsize = 8;
6223
6224 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6225 return NULL;
6226
Thomas Wouters7e474022000-07-16 12:04:32 +00006227 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006228 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 e = self->str + self->length;
6230 for (p = self->str; p < e; p++)
6231 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006232 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006234 if (old_j > j) {
6235 PyErr_SetString(PyExc_OverflowError,
6236 "new string is too long");
6237 return NULL;
6238 }
6239 old_j = j;
6240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 }
6242 else {
6243 j++;
6244 if (*p == '\n' || *p == '\r') {
6245 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006246 old_j = j = 0;
6247 if (i < 0) {
6248 PyErr_SetString(PyExc_OverflowError,
6249 "new string is too long");
6250 return NULL;
6251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 }
6253 }
6254
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006255 if ((i + j) < 0) {
6256 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6257 return NULL;
6258 }
6259
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 /* Second pass: create output string and fill it */
6261 u = _PyUnicode_New(i + j);
6262 if (!u)
6263 return NULL;
6264
6265 j = 0;
6266 q = u->str;
6267
6268 for (p = self->str; p < e; p++)
6269 if (*p == '\t') {
6270 if (tabsize > 0) {
6271 i = tabsize - (j % tabsize);
6272 j += i;
6273 while (i--)
6274 *q++ = ' ';
6275 }
6276 }
6277 else {
6278 j++;
6279 *q++ = *p;
6280 if (*p == '\n' || *p == '\r')
6281 j = 0;
6282 }
6283
6284 return (PyObject*) u;
6285}
6286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006287PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288"S.find(sub [,start [,end]]) -> int\n\
6289\n\
6290Return the lowest index in S where substring sub is found,\n\
6291such that sub is contained within s[start,end]. Optional\n\
6292arguments start and end are interpreted as in slice notation.\n\
6293\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006294Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
6296static PyObject *
6297unicode_find(PyUnicodeObject *self, PyObject *args)
6298{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006299 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006300 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006301 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006302 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303
Guido van Rossumb8872e62000-05-09 14:14:27 +00006304 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6305 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006307 substring = PyUnicode_FromObject(substring);
6308 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 return NULL;
6310
Thomas Wouters477c8d52006-05-27 19:21:47 +00006311 result = stringlib_find_slice(
6312 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6313 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6314 start, end
6315 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316
6317 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006318
6319 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320}
6321
6322static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006323unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324{
6325 if (index < 0 || index >= self->length) {
6326 PyErr_SetString(PyExc_IndexError, "string index out of range");
6327 return NULL;
6328 }
6329
6330 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6331}
6332
6333static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006334unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006336 /* Since Unicode objects compare equal to their UTF-8 string
6337 counterparts, we hash the UTF-8 string. */
6338 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6339 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340}
6341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006342PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343"S.index(sub [,start [,end]]) -> int\n\
6344\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006345Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346
6347static PyObject *
6348unicode_index(PyUnicodeObject *self, PyObject *args)
6349{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006350 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006351 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006352 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006353 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
Guido van Rossumb8872e62000-05-09 14:14:27 +00006355 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6356 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358 substring = PyUnicode_FromObject(substring);
6359 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 return NULL;
6361
Thomas Wouters477c8d52006-05-27 19:21:47 +00006362 result = stringlib_find_slice(
6363 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6364 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6365 start, end
6366 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367
6368 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 if (result < 0) {
6371 PyErr_SetString(PyExc_ValueError, "substring not found");
6372 return NULL;
6373 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006374
Martin v. Löwis18e16552006-02-15 17:27:45 +00006375 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376}
6377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006378PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006379"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006381Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006382at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
6384static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006385unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386{
6387 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6388 register const Py_UNICODE *e;
6389 int cased;
6390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 /* Shortcut for single character strings */
6392 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006393 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006395 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006396 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006397 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006398
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 e = p + PyUnicode_GET_SIZE(self);
6400 cased = 0;
6401 for (; p < e; p++) {
6402 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006403
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 else if (!cased && Py_UNICODE_ISLOWER(ch))
6407 cased = 1;
6408 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006409 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410}
6411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006412PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006413"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006415Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006416at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417
6418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006419unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420{
6421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6422 register const Py_UNICODE *e;
6423 int cased;
6424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 /* Shortcut for single character strings */
6426 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006427 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006429 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006430 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006431 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006432
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 e = p + PyUnicode_GET_SIZE(self);
6434 cased = 0;
6435 for (; p < e; p++) {
6436 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006437
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006439 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 else if (!cased && Py_UNICODE_ISUPPER(ch))
6441 cased = 1;
6442 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006443 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444}
6445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006446PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006447"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006449Return True if S is a titlecased string and there is at least one\n\
6450character in S, i.e. upper- and titlecase characters may only\n\
6451follow uncased characters and lowercase characters only cased ones.\n\
6452Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453
6454static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006455unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456{
6457 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6458 register const Py_UNICODE *e;
6459 int cased, previous_is_cased;
6460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 /* Shortcut for single character strings */
6462 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006463 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6464 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006466 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006467 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006468 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006469
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 e = p + PyUnicode_GET_SIZE(self);
6471 cased = 0;
6472 previous_is_cased = 0;
6473 for (; p < e; p++) {
6474 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006475
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6477 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006478 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 previous_is_cased = 1;
6480 cased = 1;
6481 }
6482 else if (Py_UNICODE_ISLOWER(ch)) {
6483 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006484 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 previous_is_cased = 1;
6486 cased = 1;
6487 }
6488 else
6489 previous_is_cased = 0;
6490 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006491 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492}
6493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006494PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006495"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006497Return True if all characters in S are whitespace\n\
6498and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499
6500static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006501unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
6503 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6504 register const Py_UNICODE *e;
6505
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 /* Shortcut for single character strings */
6507 if (PyUnicode_GET_SIZE(self) == 1 &&
6508 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006509 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006511 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006512 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006513 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006514
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 e = p + PyUnicode_GET_SIZE(self);
6516 for (; p < e; p++) {
6517 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006518 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006520 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521}
6522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006523PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006524"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006525\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006526Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006527and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006528
6529static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006530unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006531{
6532 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6533 register const Py_UNICODE *e;
6534
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006535 /* Shortcut for single character strings */
6536 if (PyUnicode_GET_SIZE(self) == 1 &&
6537 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006538 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006539
6540 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006541 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006542 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006543
6544 e = p + PyUnicode_GET_SIZE(self);
6545 for (; p < e; p++) {
6546 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006547 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006548 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006549 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006550}
6551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006552PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006553"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006554\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006555Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006556and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006557
6558static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006559unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006560{
6561 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6562 register const Py_UNICODE *e;
6563
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006564 /* Shortcut for single character strings */
6565 if (PyUnicode_GET_SIZE(self) == 1 &&
6566 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006567 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006568
6569 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006570 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006571 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006572
6573 e = p + PyUnicode_GET_SIZE(self);
6574 for (; p < e; p++) {
6575 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006576 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006577 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006578 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006579}
6580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006581PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006582"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006584Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
6587static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006588unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
6590 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6591 register const Py_UNICODE *e;
6592
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 /* Shortcut for single character strings */
6594 if (PyUnicode_GET_SIZE(self) == 1 &&
6595 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006596 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006598 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006599 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006600 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 e = p + PyUnicode_GET_SIZE(self);
6603 for (; p < e; p++) {
6604 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006605 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006607 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608}
6609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006610PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006611"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006613Return True if all characters in S are digits\n\
6614and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615
6616static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006617unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618{
6619 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6620 register const Py_UNICODE *e;
6621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 /* Shortcut for single character strings */
6623 if (PyUnicode_GET_SIZE(self) == 1 &&
6624 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006625 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006627 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006628 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006629 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006630
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 e = p + PyUnicode_GET_SIZE(self);
6632 for (; p < e; p++) {
6633 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006634 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006636 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006639PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006640"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006642Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006643False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644
6645static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006646unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647{
6648 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6649 register const Py_UNICODE *e;
6650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 /* Shortcut for single character strings */
6652 if (PyUnicode_GET_SIZE(self) == 1 &&
6653 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006654 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006656 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006657 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006658 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006659
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 e = p + PyUnicode_GET_SIZE(self);
6661 for (; p < e; p++) {
6662 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006663 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006665 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666}
6667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669"S.join(sequence) -> unicode\n\
6670\n\
6671Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006672sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673
6674static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006675unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006677 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678}
6679
Martin v. Löwis18e16552006-02-15 17:27:45 +00006680static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681unicode_length(PyUnicodeObject *self)
6682{
6683 return self->length;
6684}
6685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006686PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006687"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688\n\
6689Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006690done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691
6692static PyObject *
6693unicode_ljust(PyUnicodeObject *self, PyObject *args)
6694{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006695 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006696 Py_UNICODE fillchar = ' ';
6697
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006698 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 return NULL;
6700
Tim Peters7a29bd52001-09-12 03:03:31 +00006701 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 Py_INCREF(self);
6703 return (PyObject*) self;
6704 }
6705
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006706 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707}
6708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710"S.lower() -> unicode\n\
6711\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006715unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 return fixup(self, fixlower);
6718}
6719
/* Which end(s) of the string a strip operation works on.  The values
   double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string with its leading
   three characters ("|O:") skipped.  The argument is parenthesized so
   that an expression can be passed safely. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6728
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006729/* externally visible for str.strip(unicode) */
6730PyObject *
6731_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6732{
6733 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006734 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006735 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006736 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6737 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006738
Thomas Wouters477c8d52006-05-27 19:21:47 +00006739 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6740
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006741 i = 0;
6742 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006743 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6744 i++;
6745 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006746 }
6747
6748 j = len;
6749 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006750 do {
6751 j--;
6752 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6753 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006754 }
6755
6756 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006757 Py_INCREF(self);
6758 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006759 }
6760 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006761 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006762}
6763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764
6765static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006766do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006768 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006769 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006770
6771 i = 0;
6772 if (striptype != RIGHTSTRIP) {
6773 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6774 i++;
6775 }
6776 }
6777
6778 j = len;
6779 if (striptype != LEFTSTRIP) {
6780 do {
6781 j--;
6782 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6783 j++;
6784 }
6785
6786 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6787 Py_INCREF(self);
6788 return (PyObject*)self;
6789 }
6790 else
6791 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792}
6793
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006794
6795static PyObject *
6796do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6797{
6798 PyObject *sep = NULL;
6799
6800 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6801 return NULL;
6802
6803 if (sep != NULL && sep != Py_None) {
6804 if (PyUnicode_Check(sep))
6805 return _PyUnicode_XStrip(self, striptype, sep);
6806 else if (PyString_Check(sep)) {
6807 PyObject *res;
6808 sep = PyUnicode_FromObject(sep);
6809 if (sep==NULL)
6810 return NULL;
6811 res = _PyUnicode_XStrip(self, striptype, sep);
6812 Py_DECREF(sep);
6813 return res;
6814 }
6815 else {
6816 PyErr_Format(PyExc_TypeError,
6817 "%s arg must be None, unicode or str",
6818 STRIPNAME(striptype));
6819 return NULL;
6820 }
6821 }
6822
6823 return do_strip(self, striptype);
6824}
6825
6826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006827PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006828"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006829\n\
6830Return a copy of the string S with leading and trailing\n\
6831whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006832If chars is given and not None, remove characters in chars instead.\n\
6833If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006834
6835static PyObject *
6836unicode_strip(PyUnicodeObject *self, PyObject *args)
6837{
6838 if (PyTuple_GET_SIZE(args) == 0)
6839 return do_strip(self, BOTHSTRIP); /* Common case */
6840 else
6841 return do_argstrip(self, BOTHSTRIP, args);
6842}
6843
6844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006845PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006846"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006847\n\
6848Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006849If chars is given and not None, remove characters in chars instead.\n\
6850If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006851
6852static PyObject *
6853unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6854{
6855 if (PyTuple_GET_SIZE(args) == 0)
6856 return do_strip(self, LEFTSTRIP); /* Common case */
6857 else
6858 return do_argstrip(self, LEFTSTRIP, args);
6859}
6860
6861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006862PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006863"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006864\n\
6865Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006866If chars is given and not None, remove characters in chars instead.\n\
6867If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006868
6869static PyObject *
6870unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6871{
6872 if (PyTuple_GET_SIZE(args) == 0)
6873 return do_strip(self, RIGHTSTRIP); /* Common case */
6874 else
6875 return do_argstrip(self, RIGHTSTRIP, args);
6876}
6877
6878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006880unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881{
6882 PyUnicodeObject *u;
6883 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006884 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006885 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886
6887 if (len < 0)
6888 len = 0;
6889
Tim Peters7a29bd52001-09-12 03:03:31 +00006890 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 /* no repeat, return original string */
6892 Py_INCREF(str);
6893 return (PyObject*) str;
6894 }
Tim Peters8f422462000-09-09 06:13:41 +00006895
6896 /* ensure # of chars needed doesn't overflow int and # of bytes
6897 * needed doesn't overflow size_t
6898 */
6899 nchars = len * str->length;
6900 if (len && nchars / len != str->length) {
6901 PyErr_SetString(PyExc_OverflowError,
6902 "repeated string is too long");
6903 return NULL;
6904 }
6905 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6906 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6907 PyErr_SetString(PyExc_OverflowError,
6908 "repeated string is too long");
6909 return NULL;
6910 }
6911 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 if (!u)
6913 return NULL;
6914
6915 p = u->str;
6916
Thomas Wouters477c8d52006-05-27 19:21:47 +00006917 if (str->length == 1 && len > 0) {
6918 Py_UNICODE_FILL(p, str->str[0], len);
6919 } else {
6920 Py_ssize_t done = 0; /* number of characters copied this far */
6921 if (done < nchars) {
6922 Py_UNICODE_COPY(p, str->str, str->length);
6923 done = str->length;
6924 }
6925 while (done < nchars) {
6926 int n = (done <= nchars-done) ? done : nchars-done;
6927 Py_UNICODE_COPY(p+done, p, n);
6928 done += n;
6929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 }
6931
6932 return (PyObject*) u;
6933}
6934
6935PyObject *PyUnicode_Replace(PyObject *obj,
6936 PyObject *subobj,
6937 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006938 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
6940 PyObject *self;
6941 PyObject *str1;
6942 PyObject *str2;
6943 PyObject *result;
6944
6945 self = PyUnicode_FromObject(obj);
6946 if (self == NULL)
6947 return NULL;
6948 str1 = PyUnicode_FromObject(subobj);
6949 if (str1 == NULL) {
6950 Py_DECREF(self);
6951 return NULL;
6952 }
6953 str2 = PyUnicode_FromObject(replobj);
6954 if (str2 == NULL) {
6955 Py_DECREF(self);
6956 Py_DECREF(str1);
6957 return NULL;
6958 }
Tim Petersced69f82003-09-16 20:30:58 +00006959 result = replace((PyUnicodeObject *)self,
6960 (PyUnicodeObject *)str1,
6961 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 maxcount);
6963 Py_DECREF(self);
6964 Py_DECREF(str1);
6965 Py_DECREF(str2);
6966 return result;
6967}
6968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006969PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970"S.replace (old, new[, maxsplit]) -> unicode\n\
6971\n\
6972Return a copy of S with all occurrences of substring\n\
6973old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006974given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
6976static PyObject*
6977unicode_replace(PyUnicodeObject *self, PyObject *args)
6978{
6979 PyUnicodeObject *str1;
6980 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006981 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 PyObject *result;
6983
Martin v. Löwis18e16552006-02-15 17:27:45 +00006984 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 return NULL;
6986 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6987 if (str1 == NULL)
6988 return NULL;
6989 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006990 if (str2 == NULL) {
6991 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
6995 result = replace(self, str1, str2, maxcount);
6996
6997 Py_DECREF(str1);
6998 Py_DECREF(str2);
6999 return result;
7000}
7001
7002static
7003PyObject *unicode_repr(PyObject *unicode)
7004{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007005 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007006 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007007 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7008 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7009
7010 /* XXX(nnorwitz): rather than over-allocating, it would be
7011 better to choose a different scheme. Perhaps scan the
7012 first N-chars of the string and allocate based on that size.
7013 */
7014 /* Initial allocation is based on the longest-possible unichr
7015 escape.
7016
7017 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7018 unichr, so in this case it's the longest unichr escape. In
7019 narrow (UTF-16) builds this is five chars per source unichr
7020 since there are two unichrs in the surrogate pair, so in narrow
7021 (UTF-16) builds it's not the longest unichr escape.
7022
7023 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7024 so in the narrow (UTF-16) build case it's the longest unichr
7025 escape.
7026 */
7027
Walter Dörwald1ab83302007-05-18 17:15:44 +00007028 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007029 2 /* quotes */
7030#ifdef Py_UNICODE_WIDE
7031 + 10*size
7032#else
7033 + 6*size
7034#endif
7035 + 1);
7036 if (repr == NULL)
7037 return NULL;
7038
Walter Dörwald1ab83302007-05-18 17:15:44 +00007039 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007040
7041 /* Add quote */
7042 *p++ = (findchar(s, size, '\'') &&
7043 !findchar(s, size, '"')) ? '"' : '\'';
7044 while (size-- > 0) {
7045 Py_UNICODE ch = *s++;
7046
7047 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007048 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007049 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007050 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007051 continue;
7052 }
7053
7054#ifdef Py_UNICODE_WIDE
7055 /* Map 21-bit characters to '\U00xxxxxx' */
7056 else if (ch >= 0x10000) {
7057 *p++ = '\\';
7058 *p++ = 'U';
7059 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7060 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7061 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7062 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7063 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7064 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7065 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7066 *p++ = hexdigits[ch & 0x0000000F];
7067 continue;
7068 }
7069#else
7070 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7071 else if (ch >= 0xD800 && ch < 0xDC00) {
7072 Py_UNICODE ch2;
7073 Py_UCS4 ucs;
7074
7075 ch2 = *s++;
7076 size--;
7077 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7078 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7079 *p++ = '\\';
7080 *p++ = 'U';
7081 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7082 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7083 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7084 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7085 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7086 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7087 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7088 *p++ = hexdigits[ucs & 0x0000000F];
7089 continue;
7090 }
7091 /* Fall through: isolated surrogates are copied as-is */
7092 s--;
7093 size++;
7094 }
7095#endif
7096
7097 /* Map 16-bit characters to '\uxxxx' */
7098 if (ch >= 256) {
7099 *p++ = '\\';
7100 *p++ = 'u';
7101 *p++ = hexdigits[(ch >> 12) & 0x000F];
7102 *p++ = hexdigits[(ch >> 8) & 0x000F];
7103 *p++ = hexdigits[(ch >> 4) & 0x000F];
7104 *p++ = hexdigits[ch & 0x000F];
7105 }
7106
7107 /* Map special whitespace to '\t', \n', '\r' */
7108 else if (ch == '\t') {
7109 *p++ = '\\';
7110 *p++ = 't';
7111 }
7112 else if (ch == '\n') {
7113 *p++ = '\\';
7114 *p++ = 'n';
7115 }
7116 else if (ch == '\r') {
7117 *p++ = '\\';
7118 *p++ = 'r';
7119 }
7120
7121 /* Map non-printable US ASCII to '\xhh' */
7122 else if (ch < ' ' || ch >= 0x7F) {
7123 *p++ = '\\';
7124 *p++ = 'x';
7125 *p++ = hexdigits[(ch >> 4) & 0x000F];
7126 *p++ = hexdigits[ch & 0x000F];
7127 }
7128
7129 /* Copy everything else as-is */
7130 else
7131 *p++ = (char) ch;
7132 }
7133 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007134 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007135
7136 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007137 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007138 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142"S.rfind(sub [,start [,end]]) -> int\n\
7143\n\
7144Return the highest index in S where substring sub is found,\n\
7145such that sub is contained within s[start,end]. Optional\n\
7146arguments start and end are interpreted as in slice notation.\n\
7147\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149
7150static PyObject *
7151unicode_rfind(PyUnicodeObject *self, PyObject *args)
7152{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007153 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007154 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007155 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007156 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
Guido van Rossumb8872e62000-05-09 14:14:27 +00007158 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7159 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 substring = PyUnicode_FromObject(substring);
7162 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 return NULL;
7164
Thomas Wouters477c8d52006-05-27 19:21:47 +00007165 result = stringlib_rfind_slice(
7166 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7167 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7168 start, end
7169 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170
7171 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007172
7173 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174}
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177"S.rindex(sub [,start [,end]]) -> int\n\
7178\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007179Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180
7181static PyObject *
7182unicode_rindex(PyUnicodeObject *self, PyObject *args)
7183{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007184 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007185 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007186 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007187 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188
Guido van Rossumb8872e62000-05-09 14:14:27 +00007189 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7190 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007192 substring = PyUnicode_FromObject(substring);
7193 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 return NULL;
7195
Thomas Wouters477c8d52006-05-27 19:21:47 +00007196 result = stringlib_rfind_slice(
7197 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7198 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7199 start, end
7200 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201
7202 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007203
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 if (result < 0) {
7205 PyErr_SetString(PyExc_ValueError, "substring not found");
7206 return NULL;
7207 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007208 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209}
7210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007211PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007212"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213\n\
7214Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007215done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216
7217static PyObject *
7218unicode_rjust(PyUnicodeObject *self, PyObject *args)
7219{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007220 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007221 Py_UNICODE fillchar = ' ';
7222
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007223 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 return NULL;
7225
Tim Peters7a29bd52001-09-12 03:03:31 +00007226 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 Py_INCREF(self);
7228 return (PyObject*) self;
7229 }
7230
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007231 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232}
7233
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007235unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236{
7237 /* standard clamping */
7238 if (start < 0)
7239 start = 0;
7240 if (end < 0)
7241 end = 0;
7242 if (end > self->length)
7243 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007244 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 /* full slice, return original string */
7246 Py_INCREF(self);
7247 return (PyObject*) self;
7248 }
7249 if (start > end)
7250 start = end;
7251 /* copy slice */
7252 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7253 end - start);
7254}
7255
7256PyObject *PyUnicode_Split(PyObject *s,
7257 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259{
7260 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007261
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 s = PyUnicode_FromObject(s);
7263 if (s == NULL)
7264 return NULL;
7265 if (sep != NULL) {
7266 sep = PyUnicode_FromObject(sep);
7267 if (sep == NULL) {
7268 Py_DECREF(s);
7269 return NULL;
7270 }
7271 }
7272
7273 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7274
7275 Py_DECREF(s);
7276 Py_XDECREF(sep);
7277 return result;
7278}
7279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007280PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281"S.split([sep [,maxsplit]]) -> list of strings\n\
7282\n\
7283Return a list of the words in S, using sep as the\n\
7284delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007285splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007286any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287
7288static PyObject*
7289unicode_split(PyUnicodeObject *self, PyObject *args)
7290{
7291 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293
Martin v. Löwis18e16552006-02-15 17:27:45 +00007294 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295 return NULL;
7296
7297 if (substring == Py_None)
7298 return split(self, NULL, maxcount);
7299 else if (PyUnicode_Check(substring))
7300 return split(self, (PyUnicodeObject *)substring, maxcount);
7301 else
7302 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7303}
7304
Thomas Wouters477c8d52006-05-27 19:21:47 +00007305PyObject *
7306PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7307{
7308 PyObject* str_obj;
7309 PyObject* sep_obj;
7310 PyObject* out;
7311
7312 str_obj = PyUnicode_FromObject(str_in);
7313 if (!str_obj)
7314 return NULL;
7315 sep_obj = PyUnicode_FromObject(sep_in);
7316 if (!sep_obj) {
7317 Py_DECREF(str_obj);
7318 return NULL;
7319 }
7320
7321 out = stringlib_partition(
7322 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7323 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7324 );
7325
7326 Py_DECREF(sep_obj);
7327 Py_DECREF(str_obj);
7328
7329 return out;
7330}
7331
7332
7333PyObject *
7334PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7335{
7336 PyObject* str_obj;
7337 PyObject* sep_obj;
7338 PyObject* out;
7339
7340 str_obj = PyUnicode_FromObject(str_in);
7341 if (!str_obj)
7342 return NULL;
7343 sep_obj = PyUnicode_FromObject(sep_in);
7344 if (!sep_obj) {
7345 Py_DECREF(str_obj);
7346 return NULL;
7347 }
7348
7349 out = stringlib_rpartition(
7350 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7351 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7352 );
7353
7354 Py_DECREF(sep_obj);
7355 Py_DECREF(str_obj);
7356
7357 return out;
7358}
7359
7360PyDoc_STRVAR(partition__doc__,
7361"S.partition(sep) -> (head, sep, tail)\n\
7362\n\
7363Searches for the separator sep in S, and returns the part before it,\n\
7364the separator itself, and the part after it. If the separator is not\n\
7365found, returns S and two empty strings.");
7366
7367static PyObject*
7368unicode_partition(PyUnicodeObject *self, PyObject *separator)
7369{
7370 return PyUnicode_Partition((PyObject *)self, separator);
7371}
7372
7373PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007374"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007375\n\
7376Searches for the separator sep in S, starting at the end of S, and returns\n\
7377the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007378separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007379
7380static PyObject*
7381unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7382{
7383 return PyUnicode_RPartition((PyObject *)self, separator);
7384}
7385
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007386PyObject *PyUnicode_RSplit(PyObject *s,
7387 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007388 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007389{
7390 PyObject *result;
7391
7392 s = PyUnicode_FromObject(s);
7393 if (s == NULL)
7394 return NULL;
7395 if (sep != NULL) {
7396 sep = PyUnicode_FromObject(sep);
7397 if (sep == NULL) {
7398 Py_DECREF(s);
7399 return NULL;
7400 }
7401 }
7402
7403 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7404
7405 Py_DECREF(s);
7406 Py_XDECREF(sep);
7407 return result;
7408}
7409
7410PyDoc_STRVAR(rsplit__doc__,
7411"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7412\n\
7413Return a list of the words in S, using sep as the\n\
7414delimiter string, starting at the end of the string and\n\
7415working to the front. If maxsplit is given, at most maxsplit\n\
7416splits are done. If sep is not specified, any whitespace string\n\
7417is a separator.");
7418
7419static PyObject*
7420unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7421{
7422 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007423 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007424
Martin v. Löwis18e16552006-02-15 17:27:45 +00007425 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007426 return NULL;
7427
7428 if (substring == Py_None)
7429 return rsplit(self, NULL, maxcount);
7430 else if (PyUnicode_Check(substring))
7431 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7432 else
7433 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7434}
7435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007436PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007437"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438\n\
7439Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007440Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
7443static PyObject*
7444unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7445{
Guido van Rossum86662912000-04-11 15:38:46 +00007446 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447
Guido van Rossum86662912000-04-11 15:38:46 +00007448 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 return NULL;
7450
Guido van Rossum86662912000-04-11 15:38:46 +00007451 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452}
7453
7454static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007455PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456{
Walter Dörwald346737f2007-05-31 10:44:43 +00007457 if (PyUnicode_CheckExact(self)) {
7458 Py_INCREF(self);
7459 return self;
7460 } else
7461 /* Subtype -- return genuine unicode string with the same value. */
7462 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7463 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464}
7465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467"S.swapcase() -> unicode\n\
7468\n\
7469Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007473unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 return fixup(self, fixswapcase);
7476}
7477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007478PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479"S.translate(table) -> unicode\n\
7480\n\
7481Return a copy of the string S, where all characters have been mapped\n\
7482through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007483Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7484Unmapped characters are left untouched. Characters mapped to None\n\
7485are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486
7487static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007488unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489{
Tim Petersced69f82003-09-16 20:30:58 +00007490 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007492 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 "ignore");
7494}
7495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497"S.upper() -> unicode\n\
7498\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007499Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500
7501static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007502unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 return fixup(self, fixupper);
7505}
7506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007507PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508"S.zfill(width) -> unicode\n\
7509\n\
7510Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007511of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512
7513static PyObject *
7514unicode_zfill(PyUnicodeObject *self, PyObject *args)
7515{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007516 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517 PyUnicodeObject *u;
7518
Martin v. Löwis18e16552006-02-15 17:27:45 +00007519 Py_ssize_t width;
7520 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 return NULL;
7522
7523 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007524 if (PyUnicode_CheckExact(self)) {
7525 Py_INCREF(self);
7526 return (PyObject*) self;
7527 }
7528 else
7529 return PyUnicode_FromUnicode(
7530 PyUnicode_AS_UNICODE(self),
7531 PyUnicode_GET_SIZE(self)
7532 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 }
7534
7535 fill = width - self->length;
7536
7537 u = pad(self, fill, 0, '0');
7538
Walter Dörwald068325e2002-04-15 13:36:47 +00007539 if (u == NULL)
7540 return NULL;
7541
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 if (u->str[fill] == '+' || u->str[fill] == '-') {
7543 /* move sign to beginning of string */
7544 u->str[0] = u->str[fill];
7545 u->str[fill] = '0';
7546 }
7547
7548 return (PyObject*) u;
7549}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
#if 0
/* Disabled helper: returns the current value of unicode_freelist_size
   as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long freelist_len = unicode_freelist_size;

    return PyInt_FromLong(freelist_len);
}
#endif
7558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007560"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007562Return True if S starts with the specified prefix, False otherwise.\n\
7563With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007564With optional end, stop comparing S at that position.\n\
7565prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566
7567static PyObject *
7568unicode_startswith(PyUnicodeObject *self,
7569 PyObject *args)
7570{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007571 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007573 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007574 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007575 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007577 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007578 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007580 if (PyTuple_Check(subobj)) {
7581 Py_ssize_t i;
7582 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7583 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7584 PyTuple_GET_ITEM(subobj, i));
7585 if (substring == NULL)
7586 return NULL;
7587 result = tailmatch(self, substring, start, end, -1);
7588 Py_DECREF(substring);
7589 if (result) {
7590 Py_RETURN_TRUE;
7591 }
7592 }
7593 /* nothing matched */
7594 Py_RETURN_FALSE;
7595 }
7596 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007598 return NULL;
7599 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007601 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602}
7603
7604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007605PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007606"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007608Return True if S ends with the specified suffix, False otherwise.\n\
7609With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007610With optional end, stop comparing S at that position.\n\
7611suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
7613static PyObject *
7614unicode_endswith(PyUnicodeObject *self,
7615 PyObject *args)
7616{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007617 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007619 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007620 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007621 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007623 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7624 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007626 if (PyTuple_Check(subobj)) {
7627 Py_ssize_t i;
7628 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7629 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7630 PyTuple_GET_ITEM(subobj, i));
7631 if (substring == NULL)
7632 return NULL;
7633 result = tailmatch(self, substring, start, end, +1);
7634 Py_DECREF(substring);
7635 if (result) {
7636 Py_RETURN_TRUE;
7637 }
7638 }
7639 Py_RETURN_FALSE;
7640 }
7641 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007645 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007647 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648}
7649
7650
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007651
7652static PyObject *
7653unicode_getnewargs(PyUnicodeObject *v)
7654{
7655 return Py_BuildValue("(u#)", v->str, v->length);
7656}
7657
7658
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659static PyMethodDef unicode_methods[] = {
7660
7661 /* Order is according to common usage: often used methods should
7662 appear first, since lookup is done sequentially. */
7663
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007664 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7665 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7666 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007667 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007668 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7669 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7670 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7671 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7672 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7673 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7674 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007675 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007676 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7677 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7678 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007679 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007680 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007681/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7682 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7683 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7684 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007685 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007686 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007687 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007688 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007689 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7690 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7691 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7692 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7693 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7694 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7695 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7696 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7697 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7698 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7699 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7700 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7701 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7702 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007703 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007704#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007705 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706#endif
7707
7708#if 0
7709 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007710 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711#endif
7712
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007713 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714 {NULL, NULL}
7715};
7716
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007717static PyObject *
7718unicode_mod(PyObject *v, PyObject *w)
7719{
7720 if (!PyUnicode_Check(v)) {
7721 Py_INCREF(Py_NotImplemented);
7722 return Py_NotImplemented;
7723 }
7724 return PyUnicode_Format(v, w);
7725}
7726
7727static PyNumberMethods unicode_as_number = {
7728 0, /*nb_add*/
7729 0, /*nb_subtract*/
7730 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007731 unicode_mod, /*nb_remainder*/
7732};
7733
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007735 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007736 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007737 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7738 (ssizeargfunc) unicode_getitem, /* sq_item */
7739 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 0, /* sq_ass_item */
7741 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007742 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743};
7744
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007745static PyObject*
7746unicode_subscript(PyUnicodeObject* self, PyObject* item)
7747{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007748 if (PyIndex_Check(item)) {
7749 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007750 if (i == -1 && PyErr_Occurred())
7751 return NULL;
7752 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007753 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007754 return unicode_getitem(self, i);
7755 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007756 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007757 Py_UNICODE* source_buf;
7758 Py_UNICODE* result_buf;
7759 PyObject* result;
7760
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007761 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007762 &start, &stop, &step, &slicelength) < 0) {
7763 return NULL;
7764 }
7765
7766 if (slicelength <= 0) {
7767 return PyUnicode_FromUnicode(NULL, 0);
7768 } else {
7769 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007770 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7771 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007772
7773 if (result_buf == NULL)
7774 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007775
7776 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7777 result_buf[i] = source_buf[cur];
7778 }
Tim Petersced69f82003-09-16 20:30:58 +00007779
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007780 result = PyUnicode_FromUnicode(result_buf, slicelength);
7781 PyMem_FREE(result_buf);
7782 return result;
7783 }
7784 } else {
7785 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7786 return NULL;
7787 }
7788}
7789
7790static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007791 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007792 (binaryfunc)unicode_subscript, /* mp_subscript */
7793 (objobjargproc)0, /* mp_ass_subscript */
7794};
7795
Martin v. Löwis18e16552006-02-15 17:27:45 +00007796static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007798 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 const void **ptr)
7800{
7801 if (index != 0) {
7802 PyErr_SetString(PyExc_SystemError,
7803 "accessing non-existent unicode segment");
7804 return -1;
7805 }
7806 *ptr = (void *) self->str;
7807 return PyUnicode_GET_DATA_SIZE(self);
7808}
7809
Martin v. Löwis18e16552006-02-15 17:27:45 +00007810static Py_ssize_t
7811unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 const void **ptr)
7813{
7814 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007815 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 return -1;
7817}
7818
7819static int
7820unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007821 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822{
7823 if (lenp)
7824 *lenp = PyUnicode_GET_DATA_SIZE(self);
7825 return 1;
7826}
7827
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007828static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007830 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 const void **ptr)
7832{
7833 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007834
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 if (index != 0) {
7836 PyErr_SetString(PyExc_SystemError,
7837 "accessing non-existent unicode segment");
7838 return -1;
7839 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007840 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 if (str == NULL)
7842 return -1;
7843 *ptr = (void *) PyString_AS_STRING(str);
7844 return PyString_GET_SIZE(str);
7845}
7846
7847/* Helpers for PyUnicode_Format() */
7848
7849static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007850getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007852 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 if (argidx < arglen) {
7854 (*p_argidx)++;
7855 if (arglen < 0)
7856 return args;
7857 else
7858 return PyTuple_GetItem(args, argidx);
7859 }
7860 PyErr_SetString(PyExc_TypeError,
7861 "not enough arguments for format string");
7862 return NULL;
7863}
7864
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7870
Martin v. Löwis18e16552006-02-15 17:27:45 +00007871static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007872strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007874 register Py_ssize_t i;
7875 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 for (i = len - 1; i >= 0; i--)
7877 buffer[i] = (Py_UNICODE) charbuffer[i];
7878
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 return len;
7880}
7881
Neal Norwitzfc76d632006-01-10 06:03:13 +00007882static int
7883doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7884{
Tim Peters15231542006-02-16 01:08:01 +00007885 Py_ssize_t result;
7886
Neal Norwitzfc76d632006-01-10 06:03:13 +00007887 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007888 result = strtounicode(buffer, (char *)buffer);
7889 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007890}
7891
7892static int
7893longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7894{
Tim Peters15231542006-02-16 01:08:01 +00007895 Py_ssize_t result;
7896
Neal Norwitzfc76d632006-01-10 06:03:13 +00007897 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007898 result = strtounicode(buffer, (char *)buffer);
7899 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007900}
7901
Guido van Rossum078151d2002-08-11 04:24:12 +00007902/* XXX To save some code duplication, formatfloat/long/int could have been
7903 shared with stringobject.c, converting from 8-bit to Unicode after the
7904 formatting is done. */
7905
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906static int
7907formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007908 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 int flags,
7910 int prec,
7911 int type,
7912 PyObject *v)
7913{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007914 /* fmt = '%#.' + `prec` + `type`
7915 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916 char fmt[20];
7917 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007918
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 x = PyFloat_AsDouble(v);
7920 if (x == -1.0 && PyErr_Occurred())
7921 return -1;
7922 if (prec < 0)
7923 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7925 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007926 /* Worst case length calc to ensure no buffer overrun:
7927
7928 'g' formats:
7929 fmt = %#.<prec>g
7930 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7931 for any double rep.)
7932 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7933
7934 'f' formats:
7935 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7936 len = 1 + 50 + 1 + prec = 52 + prec
7937
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007938 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007939 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007940
7941 */
7942 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7943 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007944 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007945 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007946 return -1;
7947 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007948 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7949 (flags&F_ALT) ? "#" : "",
7950 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007951 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952}
7953
Tim Peters38fd5b62000-09-21 05:43:11 +00007954static PyObject*
7955formatlong(PyObject *val, int flags, int prec, int type)
7956{
7957 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007958 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007959 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007960 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007961
7962 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7963 if (!str)
7964 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007965 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007966 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007967 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007968}
7969
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970static int
7971formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007972 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 int flags,
7974 int prec,
7975 int type,
7976 PyObject *v)
7977{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007978 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007979 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7980 * + 1 + 1
7981 * = 24
7982 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007983 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007984 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 long x;
7986
7987 x = PyInt_AsLong(v);
7988 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007989 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007990 if (x < 0 && type == 'u') {
7991 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007992 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007993 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7994 sign = "-";
7995 else
7996 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007998 prec = 1;
7999
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008000 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8001 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008002 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008003 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008004 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008005 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008006 return -1;
8007 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008008
8009 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008010 (type == 'x' || type == 'X' || type == 'o')) {
8011 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008012 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008013 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008014 * - when 0 is being converted, the C standard leaves off
8015 * the '0x' or '0X', which is inconsistent with other
8016 * %#x/%#X conversions and inconsistent with Python's
8017 * hex() function
8018 * - there are platforms that violate the standard and
8019 * convert 0 with the '0x' or '0X'
8020 * (Metrowerks, Compaq Tru64)
8021 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008022 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008023 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008024 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008025 * We can achieve the desired consistency by inserting our
8026 * own '0x' or '0X' prefix, and substituting %x/%X in place
8027 * of %#x/%#X.
8028 *
8029 * Note that this is the same approach as used in
8030 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008031 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008032 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8033 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008034 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008035 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008036 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8037 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008038 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008039 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008040 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008041 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008042 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008043 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044}
8045
8046static int
8047formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008048 size_t buflen,
8049 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008051 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008052 if (PyUnicode_Check(v)) {
8053 if (PyUnicode_GET_SIZE(v) != 1)
8054 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008058 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008059 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008060 goto onError;
8061 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063
8064 else {
8065 /* Integer input truncated to a character */
8066 long x;
8067 x = PyInt_AsLong(v);
8068 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008069 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008070#ifdef Py_UNICODE_WIDE
8071 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008072 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008073 "%c arg not in range(0x110000) "
8074 "(wide Python build)");
8075 return -1;
8076 }
8077#else
8078 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008079 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008080 "%c arg not in range(0x10000) "
8081 "(narrow Python build)");
8082 return -1;
8083 }
8084#endif
8085 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 }
8087 buf[1] = '\0';
8088 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008089
8090 onError:
8091 PyErr_SetString(PyExc_TypeError,
8092 "%c requires int or char");
8093 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094}
8095
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
8105
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106PyObject *PyUnicode_Format(PyObject *format,
8107 PyObject *args)
8108{
8109 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 int args_owned = 0;
8112 PyUnicodeObject *result = NULL;
8113 PyObject *dict = NULL;
8114 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008115
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 if (format == NULL || args == NULL) {
8117 PyErr_BadInternalCall();
8118 return NULL;
8119 }
8120 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008121 if (uformat == NULL)
8122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 fmt = PyUnicode_AS_UNICODE(uformat);
8124 fmtcnt = PyUnicode_GET_SIZE(uformat);
8125
8126 reslen = rescnt = fmtcnt + 100;
8127 result = _PyUnicode_New(reslen);
8128 if (result == NULL)
8129 goto onError;
8130 res = PyUnicode_AS_UNICODE(result);
8131
8132 if (PyTuple_Check(args)) {
8133 arglen = PyTuple_Size(args);
8134 argidx = 0;
8135 }
8136 else {
8137 arglen = -1;
8138 argidx = -2;
8139 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008140 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8141 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 dict = args;
8143
8144 while (--fmtcnt >= 0) {
8145 if (*fmt != '%') {
8146 if (--rescnt < 0) {
8147 rescnt = fmtcnt + 100;
8148 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008149 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008150 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8152 --rescnt;
8153 }
8154 *res++ = *fmt++;
8155 }
8156 else {
8157 /* Got a format specifier */
8158 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 Py_UNICODE c = '\0';
8162 Py_UNICODE fill;
8163 PyObject *v = NULL;
8164 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008165 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008167 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008168 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
8170 fmt++;
8171 if (*fmt == '(') {
8172 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008173 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 PyObject *key;
8175 int pcount = 1;
8176
8177 if (dict == NULL) {
8178 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008179 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 goto onError;
8181 }
8182 ++fmt;
8183 --fmtcnt;
8184 keystart = fmt;
8185 /* Skip over balanced parentheses */
8186 while (pcount > 0 && --fmtcnt >= 0) {
8187 if (*fmt == ')')
8188 --pcount;
8189 else if (*fmt == '(')
8190 ++pcount;
8191 fmt++;
8192 }
8193 keylen = fmt - keystart - 1;
8194 if (fmtcnt < 0 || pcount > 0) {
8195 PyErr_SetString(PyExc_ValueError,
8196 "incomplete format key");
8197 goto onError;
8198 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008199#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008200 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 then looked up since Python uses strings to hold
8202 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008203 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204 key = PyUnicode_EncodeUTF8(keystart,
8205 keylen,
8206 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008207#else
8208 key = PyUnicode_FromUnicode(keystart, keylen);
8209#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 if (key == NULL)
8211 goto onError;
8212 if (args_owned) {
8213 Py_DECREF(args);
8214 args_owned = 0;
8215 }
8216 args = PyObject_GetItem(dict, key);
8217 Py_DECREF(key);
8218 if (args == NULL) {
8219 goto onError;
8220 }
8221 args_owned = 1;
8222 arglen = -1;
8223 argidx = -2;
8224 }
8225 while (--fmtcnt >= 0) {
8226 switch (c = *fmt++) {
8227 case '-': flags |= F_LJUST; continue;
8228 case '+': flags |= F_SIGN; continue;
8229 case ' ': flags |= F_BLANK; continue;
8230 case '#': flags |= F_ALT; continue;
8231 case '0': flags |= F_ZERO; continue;
8232 }
8233 break;
8234 }
8235 if (c == '*') {
8236 v = getnextarg(args, arglen, &argidx);
8237 if (v == NULL)
8238 goto onError;
8239 if (!PyInt_Check(v)) {
8240 PyErr_SetString(PyExc_TypeError,
8241 "* wants int");
8242 goto onError;
8243 }
8244 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008245 if (width == -1 && PyErr_Occurred())
8246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 if (width < 0) {
8248 flags |= F_LJUST;
8249 width = -width;
8250 }
8251 if (--fmtcnt >= 0)
8252 c = *fmt++;
8253 }
8254 else if (c >= '0' && c <= '9') {
8255 width = c - '0';
8256 while (--fmtcnt >= 0) {
8257 c = *fmt++;
8258 if (c < '0' || c > '9')
8259 break;
8260 if ((width*10) / 10 != width) {
8261 PyErr_SetString(PyExc_ValueError,
8262 "width too big");
8263 goto onError;
8264 }
8265 width = width*10 + (c - '0');
8266 }
8267 }
8268 if (c == '.') {
8269 prec = 0;
8270 if (--fmtcnt >= 0)
8271 c = *fmt++;
8272 if (c == '*') {
8273 v = getnextarg(args, arglen, &argidx);
8274 if (v == NULL)
8275 goto onError;
8276 if (!PyInt_Check(v)) {
8277 PyErr_SetString(PyExc_TypeError,
8278 "* wants int");
8279 goto onError;
8280 }
8281 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008282 if (prec == -1 && PyErr_Occurred())
8283 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 if (prec < 0)
8285 prec = 0;
8286 if (--fmtcnt >= 0)
8287 c = *fmt++;
8288 }
8289 else if (c >= '0' && c <= '9') {
8290 prec = c - '0';
8291 while (--fmtcnt >= 0) {
8292 c = Py_CHARMASK(*fmt++);
8293 if (c < '0' || c > '9')
8294 break;
8295 if ((prec*10) / 10 != prec) {
8296 PyErr_SetString(PyExc_ValueError,
8297 "prec too big");
8298 goto onError;
8299 }
8300 prec = prec*10 + (c - '0');
8301 }
8302 }
8303 } /* prec */
8304 if (fmtcnt >= 0) {
8305 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 if (--fmtcnt >= 0)
8307 c = *fmt++;
8308 }
8309 }
8310 if (fmtcnt < 0) {
8311 PyErr_SetString(PyExc_ValueError,
8312 "incomplete format");
8313 goto onError;
8314 }
8315 if (c != '%') {
8316 v = getnextarg(args, arglen, &argidx);
8317 if (v == NULL)
8318 goto onError;
8319 }
8320 sign = 0;
8321 fill = ' ';
8322 switch (c) {
8323
8324 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008325 pbuf = formatbuf;
8326 /* presume that buffer length is at least 1 */
8327 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 len = 1;
8329 break;
8330
8331 case 's':
8332 case 'r':
8333 if (PyUnicode_Check(v) && c == 's') {
8334 temp = v;
8335 Py_INCREF(temp);
8336 }
8337 else {
8338 PyObject *unicode;
8339 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008340 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 else
8342 temp = PyObject_Repr(v);
8343 if (temp == NULL)
8344 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008345 if (PyUnicode_Check(temp))
8346 /* nothing to do */;
8347 else if (PyString_Check(temp)) {
8348 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008349 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008351 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008353 Py_DECREF(temp);
8354 temp = unicode;
8355 if (temp == NULL)
8356 goto onError;
8357 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008358 else {
8359 Py_DECREF(temp);
8360 PyErr_SetString(PyExc_TypeError,
8361 "%s argument has non-string str()");
8362 goto onError;
8363 }
8364 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008365 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 len = PyUnicode_GET_SIZE(temp);
8367 if (prec >= 0 && len > prec)
8368 len = prec;
8369 break;
8370
8371 case 'i':
8372 case 'd':
8373 case 'u':
8374 case 'o':
8375 case 'x':
8376 case 'X':
8377 if (c == 'i')
8378 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008379 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008380 temp = formatlong(v, flags, prec, c);
8381 if (!temp)
8382 goto onError;
8383 pbuf = PyUnicode_AS_UNICODE(temp);
8384 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008385 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008387 else {
8388 pbuf = formatbuf;
8389 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8390 flags, prec, c, v);
8391 if (len < 0)
8392 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008393 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008394 }
8395 if (flags & F_ZERO)
8396 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 break;
8398
8399 case 'e':
8400 case 'E':
8401 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008402 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 case 'g':
8404 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008405 if (c == 'F')
8406 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008407 pbuf = formatbuf;
8408 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8409 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 if (len < 0)
8411 goto onError;
8412 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008413 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 fill = '0';
8415 break;
8416
8417 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008418 pbuf = formatbuf;
8419 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 if (len < 0)
8421 goto onError;
8422 break;
8423
8424 default:
8425 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008426 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008427 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008428 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008429 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008430 (Py_ssize_t)(fmt - 1 -
8431 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 goto onError;
8433 }
8434 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008435 if (*pbuf == '-' || *pbuf == '+') {
8436 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 len--;
8438 }
8439 else if (flags & F_SIGN)
8440 sign = '+';
8441 else if (flags & F_BLANK)
8442 sign = ' ';
8443 else
8444 sign = 0;
8445 }
8446 if (width < len)
8447 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008448 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 reslen -= rescnt;
8450 rescnt = width + fmtcnt + 100;
8451 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008452 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008453 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008454 PyErr_NoMemory();
8455 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008456 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008457 if (_PyUnicode_Resize(&result, reslen) < 0) {
8458 Py_XDECREF(temp);
8459 goto onError;
8460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 res = PyUnicode_AS_UNICODE(result)
8462 + reslen - rescnt;
8463 }
8464 if (sign) {
8465 if (fill != ' ')
8466 *res++ = sign;
8467 rescnt--;
8468 if (width > len)
8469 width--;
8470 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008471 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008472 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008473 assert(pbuf[1] == c);
8474 if (fill != ' ') {
8475 *res++ = *pbuf++;
8476 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008477 }
Tim Petersfff53252001-04-12 18:38:48 +00008478 rescnt -= 2;
8479 width -= 2;
8480 if (width < 0)
8481 width = 0;
8482 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484 if (width > len && !(flags & F_LJUST)) {
8485 do {
8486 --rescnt;
8487 *res++ = fill;
8488 } while (--width > len);
8489 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008490 if (fill == ' ') {
8491 if (sign)
8492 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008493 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008494 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008495 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008496 *res++ = *pbuf++;
8497 *res++ = *pbuf++;
8498 }
8499 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008500 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501 res += len;
8502 rescnt -= len;
8503 while (--width >= len) {
8504 --rescnt;
8505 *res++ = ' ';
8506 }
8507 if (dict && (argidx < arglen) && c != '%') {
8508 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008509 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008510 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 goto onError;
8512 }
8513 Py_XDECREF(temp);
8514 } /* '%' */
8515 } /* until end */
8516 if (argidx < arglen && !dict) {
8517 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008518 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 goto onError;
8520 }
8521
Thomas Woutersa96affe2006-03-12 00:29:36 +00008522 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8523 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 if (args_owned) {
8525 Py_DECREF(args);
8526 }
8527 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 return (PyObject *)result;
8529
8530 onError:
8531 Py_XDECREF(result);
8532 Py_DECREF(uformat);
8533 if (args_owned) {
8534 Py_DECREF(args);
8535 }
8536 return NULL;
8537}
8538
8539static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 (readbufferproc) unicode_buffer_getreadbuf,
8541 (writebufferproc) unicode_buffer_getwritebuf,
8542 (segcountproc) unicode_buffer_getsegcount,
8543 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544};
8545
Jeremy Hylton938ace62002-07-17 16:30:39 +00008546static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008547unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8548
Tim Peters6d6c1a32001-08-02 04:15:00 +00008549static PyObject *
8550unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8551{
8552 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008553 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008554 char *encoding = NULL;
8555 char *errors = NULL;
8556
Guido van Rossume023fe02001-08-30 03:12:59 +00008557 if (type != &PyUnicode_Type)
8558 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008559 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8560 kwlist, &x, &encoding, &errors))
8561 return NULL;
8562 if (x == NULL)
8563 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008564 if (encoding == NULL && errors == NULL)
8565 return PyObject_Unicode(x);
8566 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008567 return PyUnicode_FromEncodedObject(x, encoding, errors);
8568}
8569
Guido van Rossume023fe02001-08-30 03:12:59 +00008570static PyObject *
8571unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8572{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008573 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008574 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008575
8576 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8577 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8578 if (tmp == NULL)
8579 return NULL;
8580 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008581 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008582 if (pnew == NULL) {
8583 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008584 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008585 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008586 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8587 if (pnew->str == NULL) {
8588 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008589 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008590 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008591 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008592 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008593 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8594 pnew->length = n;
8595 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008596 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008597 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008598}
8599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008601"unicode(string [, encoding[, errors]]) -> object\n\
8602\n\
8603Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008604encoding defaults to the current default string encoding.\n\
8605errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008606
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008607static PyObject *unicode_iter(PyObject *seq);
8608
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609PyTypeObject PyUnicode_Type = {
8610 PyObject_HEAD_INIT(&PyType_Type)
8611 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008612 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 sizeof(PyUnicodeObject), /* tp_size */
8614 0, /* tp_itemsize */
8615 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008616 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008618 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008620 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008621 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008622 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008624 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 (hashfunc) unicode_hash, /* tp_hash*/
8626 0, /* tp_call*/
8627 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008628 PyObject_GenericGetAttr, /* tp_getattro */
8629 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008631 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8632 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008633 unicode_doc, /* tp_doc */
8634 0, /* tp_traverse */
8635 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008636 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008637 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008638 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008639 0, /* tp_iternext */
8640 unicode_methods, /* tp_methods */
8641 0, /* tp_members */
8642 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008643 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008644 0, /* tp_dict */
8645 0, /* tp_descr_get */
8646 0, /* tp_descr_set */
8647 0, /* tp_dictoffset */
8648 0, /* tp_init */
8649 0, /* tp_alloc */
8650 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008651 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652};
8653
8654/* Initialize the Unicode implementation */
8655
Thomas Wouters78890102000-07-22 19:25:51 +00008656void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008658 int i;
8659
Thomas Wouters477c8d52006-05-27 19:21:47 +00008660 /* XXX - move this array to unicodectype.c ? */
8661 Py_UNICODE linebreak[] = {
8662 0x000A, /* LINE FEED */
8663 0x000D, /* CARRIAGE RETURN */
8664 0x001C, /* FILE SEPARATOR */
8665 0x001D, /* GROUP SEPARATOR */
8666 0x001E, /* RECORD SEPARATOR */
8667 0x0085, /* NEXT LINE */
8668 0x2028, /* LINE SEPARATOR */
8669 0x2029, /* PARAGRAPH SEPARATOR */
8670 };
8671
Fred Drakee4315f52000-05-09 19:53:39 +00008672 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008673 unicode_freelist = NULL;
8674 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008676 if (!unicode_empty)
8677 return;
8678
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008679 for (i = 0; i < 256; i++)
8680 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008681 if (PyType_Ready(&PyUnicode_Type) < 0)
8682 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008683
8684 /* initialize the linebreak bloom filter */
8685 bloom_linebreak = make_bloom_mask(
8686 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8687 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008688
8689 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690}
8691
8692/* Finalize the Unicode implementation */
8693
8694void
Thomas Wouters78890102000-07-22 19:25:51 +00008695_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008697 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008698 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008700 Py_XDECREF(unicode_empty);
8701 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008702
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008703 for (i = 0; i < 256; i++) {
8704 if (unicode_latin1[i]) {
8705 Py_DECREF(unicode_latin1[i]);
8706 unicode_latin1[i] = NULL;
8707 }
8708 }
8709
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008710 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 PyUnicodeObject *v = u;
8712 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008713 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008714 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008715 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008716 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008718 unicode_freelist = NULL;
8719 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008721
Walter Dörwald16807132007-05-25 13:52:07 +00008722void
8723PyUnicode_InternInPlace(PyObject **p)
8724{
8725 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8726 PyObject *t;
8727 if (s == NULL || !PyUnicode_Check(s))
8728 Py_FatalError(
8729 "PyUnicode_InternInPlace: unicode strings only please!");
8730 /* If it's a subclass, we don't really know what putting
8731 it in the interned dict might do. */
8732 if (!PyUnicode_CheckExact(s))
8733 return;
8734 if (PyUnicode_CHECK_INTERNED(s))
8735 return;
8736 if (interned == NULL) {
8737 interned = PyDict_New();
8738 if (interned == NULL) {
8739 PyErr_Clear(); /* Don't leave an exception */
8740 return;
8741 }
8742 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008743 /* It might be that the GetItem call fails even
8744 though the key is present in the dictionary,
8745 namely when this happens during a stack overflow. */
8746 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008747 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008748 Py_END_ALLOW_RECURSION
8749
Walter Dörwald16807132007-05-25 13:52:07 +00008750 if (t) {
8751 Py_INCREF(t);
8752 Py_DECREF(*p);
8753 *p = t;
8754 return;
8755 }
8756
Martin v. Löwis5b222132007-06-10 09:51:05 +00008757 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008758 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8759 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008760 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008761 return;
8762 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008763 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008764 /* The two references in interned are not counted by refcnt.
8765 The deallocator will take care of this */
8766 s->ob_refcnt -= 2;
8767 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8768}
8769
8770void
8771PyUnicode_InternImmortal(PyObject **p)
8772{
8773 PyUnicode_InternInPlace(p);
8774 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8775 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8776 Py_INCREF(*p);
8777 }
8778}
8779
8780PyObject *
8781PyUnicode_InternFromString(const char *cp)
8782{
8783 PyObject *s = PyUnicode_FromString(cp);
8784 if (s == NULL)
8785 return NULL;
8786 PyUnicode_InternInPlace(&s);
8787 return s;
8788}
8789
8790void _Py_ReleaseInternedUnicodeStrings(void)
8791{
8792 PyObject *keys;
8793 PyUnicodeObject *s;
8794 Py_ssize_t i, n;
8795 Py_ssize_t immortal_size = 0, mortal_size = 0;
8796
8797 if (interned == NULL || !PyDict_Check(interned))
8798 return;
8799 keys = PyDict_Keys(interned);
8800 if (keys == NULL || !PyList_Check(keys)) {
8801 PyErr_Clear();
8802 return;
8803 }
8804
8805 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8806 detector, interned unicode strings are not forcibly deallocated;
8807 rather, we give them their stolen references back, and then clear
8808 and DECREF the interned dict. */
8809
8810 n = PyList_GET_SIZE(keys);
8811 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8812 n);
8813 for (i = 0; i < n; i++) {
8814 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8815 switch (s->state) {
8816 case SSTATE_NOT_INTERNED:
8817 /* XXX Shouldn't happen */
8818 break;
8819 case SSTATE_INTERNED_IMMORTAL:
8820 s->ob_refcnt += 1;
8821 immortal_size += s->length;
8822 break;
8823 case SSTATE_INTERNED_MORTAL:
8824 s->ob_refcnt += 2;
8825 mortal_size += s->length;
8826 break;
8827 default:
8828 Py_FatalError("Inconsistent interned string state.");
8829 }
8830 s->state = SSTATE_NOT_INTERNED;
8831 }
8832 fprintf(stderr, "total size of all interned strings: "
8833 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8834 "mortal/immortal\n", mortal_size, immortal_size);
8835 Py_DECREF(keys);
8836 PyDict_Clear(interned);
8837 Py_DECREF(interned);
8838 interned = NULL;
8839}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008840
8841
8842/********************* Unicode Iterator **************************/
8843
8844typedef struct {
8845 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008846 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008847 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8848} unicodeiterobject;
8849
8850static void
8851unicodeiter_dealloc(unicodeiterobject *it)
8852{
8853 _PyObject_GC_UNTRACK(it);
8854 Py_XDECREF(it->it_seq);
8855 PyObject_GC_Del(it);
8856}
8857
8858static int
8859unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8860{
8861 Py_VISIT(it->it_seq);
8862 return 0;
8863}
8864
8865static PyObject *
8866unicodeiter_next(unicodeiterobject *it)
8867{
8868 PyUnicodeObject *seq;
8869 PyObject *item;
8870
8871 assert(it != NULL);
8872 seq = it->it_seq;
8873 if (seq == NULL)
8874 return NULL;
8875 assert(PyUnicode_Check(seq));
8876
8877 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008878 item = PyUnicode_FromUnicode(
8879 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008880 if (item != NULL)
8881 ++it->it_index;
8882 return item;
8883 }
8884
8885 Py_DECREF(seq);
8886 it->it_seq = NULL;
8887 return NULL;
8888}
8889
8890static PyObject *
8891unicodeiter_len(unicodeiterobject *it)
8892{
8893 Py_ssize_t len = 0;
8894 if (it->it_seq)
8895 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8896 return PyInt_FromSsize_t(len);
8897}
8898
8899PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8900
8901static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008902 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8903 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008904 {NULL, NULL} /* sentinel */
8905};
8906
8907PyTypeObject PyUnicodeIter_Type = {
8908 PyObject_HEAD_INIT(&PyType_Type)
8909 0, /* ob_size */
8910 "unicodeiterator", /* tp_name */
8911 sizeof(unicodeiterobject), /* tp_basicsize */
8912 0, /* tp_itemsize */
8913 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008914 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008915 0, /* tp_print */
8916 0, /* tp_getattr */
8917 0, /* tp_setattr */
8918 0, /* tp_compare */
8919 0, /* tp_repr */
8920 0, /* tp_as_number */
8921 0, /* tp_as_sequence */
8922 0, /* tp_as_mapping */
8923 0, /* tp_hash */
8924 0, /* tp_call */
8925 0, /* tp_str */
8926 PyObject_GenericGetAttr, /* tp_getattro */
8927 0, /* tp_setattro */
8928 0, /* tp_as_buffer */
8929 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8930 0, /* tp_doc */
8931 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8932 0, /* tp_clear */
8933 0, /* tp_richcompare */
8934 0, /* tp_weaklistoffset */
8935 PyObject_SelfIter, /* tp_iter */
8936 (iternextfunc)unicodeiter_next, /* tp_iternext */
8937 unicodeiter_methods, /* tp_methods */
8938 0,
8939};
8940
8941static PyObject *
8942unicode_iter(PyObject *seq)
8943{
8944 unicodeiterobject *it;
8945
8946 if (!PyUnicode_Check(seq)) {
8947 PyErr_BadInternalCall();
8948 return NULL;
8949 }
8950 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8951 if (it == NULL)
8952 return NULL;
8953 it->it_index = 0;
8954 Py_INCREF(seq);
8955 it->it_seq = (PyUnicodeObject *)seq;
8956 _PyObject_GC_TRACK(it);
8957 return (PyObject *)it;
8958}
8959
Martin v. Löwis5b222132007-06-10 09:51:05 +00008960size_t
8961Py_UNICODE_strlen(const Py_UNICODE *u)
8962{
8963 int res = 0;
8964 while(*u++)
8965 res++;
8966 return res;
8967}
8968
8969Py_UNICODE*
8970Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8971{
8972 Py_UNICODE *u = s1;
8973 while ((*u++ = *s2++));
8974 return s1;
8975}
8976
8977Py_UNICODE*
8978Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8979{
8980 Py_UNICODE *u = s1;
8981 while ((*u++ = *s2++))
8982 if (n-- == 0)
8983 break;
8984 return s1;
8985}
8986
8987int
8988Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8989{
8990 while (*s1 && *s2 && *s1 == *s2)
8991 s1++, s2++;
8992 if (*s1 && *s2)
8993 return (*s1 < *s2) ? -1 : +1;
8994 if (*s1)
8995 return 1;
8996 if (*s2)
8997 return -1;
8998 return 0;
8999}
9000
9001Py_UNICODE*
9002Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9003{
9004 const Py_UNICODE *p;
9005 for (p = s; *p; p++)
9006 if (*p == c)
9007 return (Py_UNICODE*)p;
9008 return NULL;
9009}
9010
9011
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009012#ifdef __cplusplus
9013}
9014#endif
9015
9016
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009017/*
9018Local variables:
9019c-basic-offset: 4
9020indent-tabs-mode: nil
9021End:
9022*/