blob: 87c5c997281bf8df3f5a05409a2e8545ab570506 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low 5
   bits of each Unicode character as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Integer type used to hold a bloom bitmask (only bits 0..31 are used,
   since characters are reduced to their low 5 bits). */
#define BLOOM_MASK unsigned long

/* Bitmask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL rather than 1: shifting a plain int into
   bit 31 overflows a signed int, and the sign-extended result would
   also match bits above 31 when unsigned long is 64 bits wide. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check first, exact Py_UNICODE_ISLINEBREAK() test only
   when the pre-check cannot rule the character out. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* True if chr is in set: cheap BLOOM() pre-check first, exact
   unicode_member() scan only if the pre-check cannot rule chr out.
   The whole expansion is parenthesized so the macro stays a single
   operand when negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000341 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
380/* Internal API for use in unicodeobject.c only ! */
381#define _PyUnicode_Resize(unicodevar, length) \
382 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000441 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
446 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000447 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
Martin v. Löwis5b222132007-06-10 09:51:05 +0000461 while (size--)
462 *p++ = *u++;
463 /* Don't need to write trailing 0 because
464 that's already done by _PyUnicode_New */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
467 return (PyObject *)unicode;
468}
469
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470PyObject *PyUnicode_FromString(const char *u)
471{
472 size_t size = strlen(u);
473 if (size > PY_SSIZE_T_MAX) {
474 PyErr_SetString(PyExc_OverflowError, "input too long");
475 return NULL;
476 }
477
478 return PyUnicode_FromStringAndSize(u, size);
479}
480
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485{
486 PyUnicodeObject *unicode;
487
488 if (w == NULL) {
489 PyErr_BadInternalCall();
490 return NULL;
491 }
492
493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000500#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 {
502 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000505 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 *u++ = *w++;
507 }
508#endif
509
510 return (PyObject *)unicode;
511}
512
Walter Dörwald346737f2007-05-31 10:44:43 +0000513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516 *fmt++ = '%';
517 if (width) {
518 if (zeropad)
519 *fmt++ = '0';
520 fmt += sprintf(fmt, "%d", width);
521 }
522 if (precision)
523 fmt += sprintf(fmt, ".%d", precision);
524 if (longflag)
525 *fmt++ = 'l';
526 else if (size_tflag) {
527 char *f = PY_FORMAT_SIZE_T;
528 while (*f)
529 *fmt++ = *f++;
530 }
531 *fmt++ = c;
532 *fmt = '\0';
533}
534
/* Append the NUL-terminated char string `string` to the output position
   `s`, advancing `s`.  Relies on the locals `copy` (const char *) and
   `s` of the enclosing function.  Each byte is converted through
   unsigned char so that bytes >= 0x80 are not sign-extended on platforms
   where plain char is signed. */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = (unsigned char)*copy++;}
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000541 Py_ssize_t callcount = 0;
542 PyObject **callresults = NULL;
543 PyObject **callresult;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000545 int width = 0;
546 int precision = 0;
547 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 const char* f;
549 Py_UNICODE *s;
550 PyObject *string;
551 /* used by sprintf */
552 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000553 /* use abuffer instead of buffer, if we need more space
554 * (which can happen if there's a format specifier with width). */
555 char *abuffer = NULL;
556 char *realbuffer;
557 Py_ssize_t abuffersize = 0;
558 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559 const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562 Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef __va_copy
565 __va_copy(count, vargs);
566#else
567 count = vargs;
568#endif
569#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 /* step 1: count the number of %S/%R format specifications
571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 ++callcount;
576 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 /* step 2: allocate memory for the results of
578 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 if (callcount) {
580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581 if (!callresults) {
582 PyErr_NoMemory();
583 return NULL;
584 }
585 callresult = callresults;
586 }
587 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 for (f = format; *f; f++) {
589 if (*f == '%') {
590 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000591 width = 0;
592 while (isdigit(Py_CHARMASK(*f)))
593 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595 ;
596
597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598 * they don't affect the amount of space we reserve.
599 */
600 if ((*f == 'l' || *f == 'z') &&
601 (f[1] == 'd' || f[1] == 'u'))
602 ++f;
603
604 switch (*f) {
605 case 'c':
606 (void)va_arg(count, int);
607 /* fall through... */
608 case '%':
609 n++;
610 break;
611 case 'd': case 'u': case 'i': case 'x':
612 (void) va_arg(count, int);
613 /* 20 bytes is enough to hold a 64-bit
614 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000615 This isn't enough for octal.
616 If a width is specified we need more
617 (which we allocate later). */
618 if (width < 20)
619 width = 20;
620 n += width;
621 if (abuffersize < width)
622 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000623 break;
624 case 's':
625 n += strlen(va_arg(count, char*));
626 break;
627 case 'U':
628 {
629 PyObject *obj = va_arg(count, PyObject *);
630 assert(obj && PyUnicode_Check(obj));
631 n += PyUnicode_GET_SIZE(obj);
632 break;
633 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000634 case 'S':
635 {
636 PyObject *obj = va_arg(count, PyObject *);
637 PyObject *str;
638 assert(obj);
639 str = PyObject_Unicode(obj);
640 if (!str)
641 goto fail;
642 n += PyUnicode_GET_SIZE(str);
643 /* Remember the str and switch to the next slot */
644 *callresult++ = str;
645 break;
646 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000647 case 'R':
648 {
649 PyObject *obj = va_arg(count, PyObject *);
650 PyObject *repr;
651 assert(obj);
652 repr = PyObject_Repr(obj);
653 if (!repr)
654 goto fail;
655 n += PyUnicode_GET_SIZE(repr);
656 /* Remember the repr and switch to the next slot */
657 *callresult++ = repr;
658 break;
659 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000660 case 'p':
661 (void) va_arg(count, int);
662 /* maximum 64-bit pointer representation:
663 * 0xffffffffffffffff
664 * so 19 characters is enough.
665 * XXX I count 18 -- what's the extra for?
666 */
667 n += 19;
668 break;
669 default:
670 /* if we stumble upon an unknown
671 formatting code, copy the rest of
672 the format string to the output
673 string. (we cannot just skip the
674 code, since there's no way to know
675 what's in the argument list) */
676 n += strlen(p);
677 goto expand;
678 }
679 } else
680 n++;
681 }
682 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000683 if (abuffersize > 20) {
684 abuffer = PyMem_Malloc(abuffersize);
685 if (!abuffer) {
686 PyErr_NoMemory();
687 goto fail;
688 }
689 realbuffer = abuffer;
690 }
691 else
692 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000693 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000694 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000695 we don't have to resize the string.
696 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000697 string = PyUnicode_FromUnicode(NULL, n);
698 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000699 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000700
701 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000702 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000703
704 for (f = format; *f; f++) {
705 if (*f == '%') {
706 const char* p = f++;
707 int longflag = 0;
708 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000709 zeropad = (*f == '0');
710 /* parse the width.precision part */
711 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000713 width = (width*10) + *f++ - '0';
714 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715 if (*f == '.') {
716 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000717 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000718 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000720 /* handle the long flag, but only for %ld and %lu.
721 others can be added when necessary. */
722 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
723 longflag = 1;
724 ++f;
725 }
726 /* handle the size_t flag. */
727 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
728 size_tflag = 1;
729 ++f;
730 }
731
732 switch (*f) {
733 case 'c':
734 *s++ = va_arg(vargs, int);
735 break;
736 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000737 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000738 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000739 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000740 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000741 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000742 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000743 sprintf(realbuffer, fmt, va_arg(vargs, int));
744 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745 break;
746 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000747 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000753 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
754 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000755 break;
756 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000757 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
758 sprintf(realbuffer, fmt, va_arg(vargs, int));
759 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 break;
761 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
763 sprintf(realbuffer, fmt, va_arg(vargs, int));
764 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000765 break;
766 case 's':
767 p = va_arg(vargs, char*);
768 appendstring(p);
769 break;
770 case 'U':
771 {
772 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000773 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
774 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
775 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776 break;
777 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000778 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000779 case 'R':
780 {
781 /* unused, since we already have the result */
782 (void) va_arg(vargs, PyObject *);
783 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(*callresult);
784 Py_ssize_t usize = PyUnicode_GET_SIZE(*callresult);
785 Py_ssize_t upos;
786 for (upos = 0; upos<usize;)
787 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000788 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000789 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000790 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000791 ++callresult;
792 break;
793 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794 case 'p':
795 sprintf(buffer, "%p", va_arg(vargs, void*));
796 /* %p is ill-defined: ensure leading 0x. */
797 if (buffer[1] == 'X')
798 buffer[1] = 'x';
799 else if (buffer[1] != 'x') {
800 memmove(buffer+2, buffer, strlen(buffer)+1);
801 buffer[0] = '0';
802 buffer[1] = 'x';
803 }
804 appendstring(buffer);
805 break;
806 case '%':
807 *s++ = '%';
808 break;
809 default:
810 appendstring(p);
811 goto end;
812 }
813 } else
814 *s++ = *f;
815 }
816
817 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 if (callresults)
819 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000820 if (abuffer)
821 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000822 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
823 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000824 fail:
825 if (callresults) {
826 PyObject **callresult2 = callresults;
827 while (callresult2 <= callresult) {
828 Py_DECREF(*callresult2);
829 ++callresult2;
830 }
831 PyMem_Free(callresults);
832 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000833 if (abuffer)
834 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000835 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836}
837
838#undef appendstring
839
840PyObject *
841PyUnicode_FromFormat(const char *format, ...)
842{
843 PyObject* ret;
844 va_list vargs;
845
846#ifdef HAVE_STDARG_PROTOTYPES
847 va_start(vargs, format);
848#else
849 va_start(vargs);
850#endif
851 ret = PyUnicode_FromFormatV(format, vargs);
852 va_end(vargs);
853 return ret;
854}
855
Martin v. Löwis18e16552006-02-15 17:27:45 +0000856Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
857 wchar_t *w,
858 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859{
860 if (unicode == NULL) {
861 PyErr_BadInternalCall();
862 return -1;
863 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000864
865 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000866 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000867 size = PyUnicode_GET_SIZE(unicode) + 1;
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869#ifdef HAVE_USABLE_WCHAR_T
870 memcpy(w, unicode->str, size * sizeof(wchar_t));
871#else
872 {
873 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000874 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000876 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877 *w++ = *u++;
878 }
879#endif
880
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000881 if (size > PyUnicode_GET_SIZE(unicode))
882 return PyUnicode_GET_SIZE(unicode);
883 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 return size;
885}
886
887#endif
888
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000889PyObject *PyUnicode_FromOrdinal(int ordinal)
890{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000891 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000892
893#ifdef Py_UNICODE_WIDE
894 if (ordinal < 0 || ordinal > 0x10ffff) {
895 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000896 "chr() arg not in range(0x110000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000897 "(wide Python build)");
898 return NULL;
899 }
900#else
901 if (ordinal < 0 || ordinal > 0xffff) {
902 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000903 "chr() arg not in range(0x10000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000904 "(narrow Python build)");
905 return NULL;
906 }
907#endif
908
Hye-Shik Chang40574832004-04-06 07:24:51 +0000909 s[0] = (Py_UNICODE)ordinal;
910 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000911}
912
Guido van Rossumd57fd912000-03-10 22:53:23 +0000913PyObject *PyUnicode_FromObject(register PyObject *obj)
914{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000915 /* XXX Perhaps we should make this API an alias of
916 PyObject_Unicode() instead ?! */
917 if (PyUnicode_CheckExact(obj)) {
918 Py_INCREF(obj);
919 return obj;
920 }
921 if (PyUnicode_Check(obj)) {
922 /* For a Unicode subtype that's not a Unicode object,
923 return a true Unicode object with the same data. */
924 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
925 PyUnicode_GET_SIZE(obj));
926 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000927 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
928}
929
930PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
931 const char *encoding,
932 const char *errors)
933{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000934 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000935 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000936 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938 if (obj == NULL) {
939 PyErr_BadInternalCall();
940 return NULL;
941 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000942
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000943#if 0
944 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000945 that no encodings is given and then redirect to
946 PyObject_Unicode() which then applies the additional logic for
947 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000948
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000949 NOTE: This API should really only be used for object which
950 represent *encoded* Unicode !
951
952 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000953 if (PyUnicode_Check(obj)) {
954 if (encoding) {
955 PyErr_SetString(PyExc_TypeError,
956 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000957 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000958 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000959 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000960 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000961#else
962 if (PyUnicode_Check(obj)) {
963 PyErr_SetString(PyExc_TypeError,
964 "decoding Unicode is not supported");
965 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000966 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000967#endif
968
969 /* Coerce object */
970 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000971 s = PyString_AS_STRING(obj);
972 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000973 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000974 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
975 /* Overwrite the error message with something more useful in
976 case of a TypeError. */
977 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 "coercing to Unicode: need string or buffer, "
980 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000981 obj->ob_type->tp_name);
982 goto onError;
983 }
Tim Petersced69f82003-09-16 20:30:58 +0000984
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000985 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986 if (len == 0) {
987 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000988 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 }
Tim Petersced69f82003-09-16 20:30:58 +0000990 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000991 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000992
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000993 return v;
994
995 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000997}
998
999PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001000 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001001 const char *encoding,
1002 const char *errors)
1003{
1004 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001005
1006 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001007 encoding = PyUnicode_GetDefaultEncoding();
1008
1009 /* Shortcuts for common default encodings */
1010 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001012 else if (strcmp(encoding, "latin-1") == 0)
1013 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001014#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1015 else if (strcmp(encoding, "mbcs") == 0)
1016 return PyUnicode_DecodeMBCS(s, size, errors);
1017#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001018 else if (strcmp(encoding, "ascii") == 0)
1019 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020
1021 /* Decode via the codec registry */
1022 buffer = PyBuffer_FromMemory((void *)s, size);
1023 if (buffer == NULL)
1024 goto onError;
1025 unicode = PyCodec_Decode(buffer, encoding, errors);
1026 if (unicode == NULL)
1027 goto onError;
1028 if (!PyUnicode_Check(unicode)) {
1029 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001030 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031 unicode->ob_type->tp_name);
1032 Py_DECREF(unicode);
1033 goto onError;
1034 }
1035 Py_DECREF(buffer);
1036 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001037
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 onError:
1039 Py_XDECREF(buffer);
1040 return NULL;
1041}
1042
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001043PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1044 const char *encoding,
1045 const char *errors)
1046{
1047 PyObject *v;
1048
1049 if (!PyUnicode_Check(unicode)) {
1050 PyErr_BadArgument();
1051 goto onError;
1052 }
1053
1054 if (encoding == NULL)
1055 encoding = PyUnicode_GetDefaultEncoding();
1056
1057 /* Decode via the codec registry */
1058 v = PyCodec_Decode(unicode, encoding, errors);
1059 if (v == NULL)
1060 goto onError;
1061 return v;
1062
1063 onError:
1064 return NULL;
1065}
1066
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001068 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069 const char *encoding,
1070 const char *errors)
1071{
1072 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001073
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 unicode = PyUnicode_FromUnicode(s, size);
1075 if (unicode == NULL)
1076 return NULL;
1077 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1078 Py_DECREF(unicode);
1079 return v;
1080}
1081
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001082PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1083 const char *encoding,
1084 const char *errors)
1085{
1086 PyObject *v;
1087
1088 if (!PyUnicode_Check(unicode)) {
1089 PyErr_BadArgument();
1090 goto onError;
1091 }
1092
1093 if (encoding == NULL)
1094 encoding = PyUnicode_GetDefaultEncoding();
1095
1096 /* Encode via the codec registry */
1097 v = PyCodec_Encode(unicode, encoding, errors);
1098 if (v == NULL)
1099 goto onError;
1100 return v;
1101
1102 onError:
1103 return NULL;
1104}
1105
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1107 const char *encoding,
1108 const char *errors)
1109{
1110 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001111
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (!PyUnicode_Check(unicode)) {
1113 PyErr_BadArgument();
1114 goto onError;
1115 }
Fred Drakee4315f52000-05-09 19:53:39 +00001116
Tim Petersced69f82003-09-16 20:30:58 +00001117 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001118 encoding = PyUnicode_GetDefaultEncoding();
1119
1120 /* Shortcuts for common default encodings */
1121 if (errors == NULL) {
1122 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001123 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001124 else if (strcmp(encoding, "latin-1") == 0)
1125 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001126#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1127 else if (strcmp(encoding, "mbcs") == 0)
1128 return PyUnicode_AsMBCSString(unicode);
1129#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001130 else if (strcmp(encoding, "ascii") == 0)
1131 return PyUnicode_AsASCIIString(unicode);
1132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 /* Encode via the codec registry */
1135 v = PyCodec_Encode(unicode, encoding, errors);
1136 if (v == NULL)
1137 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001138 if (!PyBytes_Check(v)) {
1139 if (PyString_Check(v)) {
1140 /* Old codec, turn it into bytes */
1141 PyObject *b = PyBytes_FromObject(v);
1142 Py_DECREF(v);
1143 return b;
1144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001146 "encoder did not return a bytes object "
1147 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1148 v->ob_type->tp_name,
1149 encoding ? encoding : "NULL",
1150 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 Py_DECREF(v);
1152 goto onError;
1153 }
1154 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001155
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 onError:
1157 return NULL;
1158}
1159
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001160PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1161 const char *errors)
1162{
1163 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001164 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001165 if (v)
1166 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001167 if (errors != NULL)
1168 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1169 if (errors == NULL) {
1170 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1171 PyUnicode_GET_SIZE(unicode),
1172 NULL);
1173 }
1174 else {
1175 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1176 }
1177 if (!b)
1178 return NULL;
1179 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1180 PyBytes_Size(b));
1181 Py_DECREF(b);
1182 if (!errors) {
1183 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001184 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001185 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001186 return v;
1187}
1188
Martin v. Löwis5b222132007-06-10 09:51:05 +00001189char*
1190PyUnicode_AsString(PyObject *unicode)
1191{
1192 assert(PyUnicode_Check(unicode));
1193 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1194 if (!unicode)
1195 return NULL;
1196 return PyString_AsString(unicode);
1197}
1198
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1200{
1201 if (!PyUnicode_Check(unicode)) {
1202 PyErr_BadArgument();
1203 goto onError;
1204 }
1205 return PyUnicode_AS_UNICODE(unicode);
1206
1207 onError:
1208 return NULL;
1209}
1210
Martin v. Löwis18e16552006-02-15 17:27:45 +00001211Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212{
1213 if (!PyUnicode_Check(unicode)) {
1214 PyErr_BadArgument();
1215 goto onError;
1216 }
1217 return PyUnicode_GET_SIZE(unicode);
1218
1219 onError:
1220 return -1;
1221}
1222
Thomas Wouters78890102000-07-22 19:25:51 +00001223const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001224{
1225 return unicode_default_encoding;
1226}
1227
1228int PyUnicode_SetDefaultEncoding(const char *encoding)
1229{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001230 if (strcmp(encoding, unicode_default_encoding) != 0) {
1231 PyErr_Format(PyExc_ValueError,
1232 "Can only set default encoding to %s",
1233 unicode_default_encoding);
1234 return -1;
1235 }
Fred Drakee4315f52000-05-09 19:53:39 +00001236 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001237}
1238
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001239/* error handling callback helper:
1240 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001241 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001242 and adjust various state variables.
1243 return 0 on success, -1 on error
1244*/
1245
1246static
1247int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1248 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001249 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1250 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001251{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001252 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001253
1254 PyObject *restuple = NULL;
1255 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001256 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1257 Py_ssize_t requiredsize;
1258 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001259 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001260 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261 int res = -1;
1262
1263 if (*errorHandler == NULL) {
1264 *errorHandler = PyCodec_LookupError(errors);
1265 if (*errorHandler == NULL)
1266 goto onError;
1267 }
1268
1269 if (*exceptionObject == NULL) {
1270 *exceptionObject = PyUnicodeDecodeError_Create(
1271 encoding, input, insize, *startinpos, *endinpos, reason);
1272 if (*exceptionObject == NULL)
1273 goto onError;
1274 }
1275 else {
1276 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1277 goto onError;
1278 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1279 goto onError;
1280 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1281 goto onError;
1282 }
1283
1284 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1285 if (restuple == NULL)
1286 goto onError;
1287 if (!PyTuple_Check(restuple)) {
1288 PyErr_Format(PyExc_TypeError, &argparse[4]);
1289 goto onError;
1290 }
1291 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1292 goto onError;
1293 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001294 newpos = insize+newpos;
1295 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001296 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001297 goto onError;
1298 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299
1300 /* need more space? (at least enough for what we
1301 have+the replacement+the rest of the string (starting
1302 at the new input position), so we won't have to check space
1303 when there are no errors in the rest of the string) */
1304 repptr = PyUnicode_AS_UNICODE(repunicode);
1305 repsize = PyUnicode_GET_SIZE(repunicode);
1306 requiredsize = *outpos + repsize + insize-newpos;
1307 if (requiredsize > outsize) {
1308 if (requiredsize<2*outsize)
1309 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001310 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001311 goto onError;
1312 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1313 }
1314 *endinpos = newpos;
1315 *inptr = input + newpos;
1316 Py_UNICODE_COPY(*outptr, repptr, repsize);
1317 *outptr += repsize;
1318 *outpos += repsize;
1319 /* we made it! */
1320 res = 0;
1321
1322 onError:
1323 Py_XDECREF(restuple);
1324 return res;
1325}
1326
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

/* Per-character classification for the UTF-7 codec, indexed by the
   character code (0..127).  Each entry indicates whether the character
   is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1349
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c may not appear directly in UTF-7 output: it lies
   outside 1..127, is marked special (1) in utf7_special, or is
   whitespace (2) / Set O (3) while the matching encodeWS / encodeO flag
   is set.  Evaluates c more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Written with explicit ASCII
   ranges instead of isalnum(): callers pass Py_UNICODE values, and
   isalnum() is undefined for arguments that do not fit an unsigned char
   and locale dependent for 128..255.  Evaluates c more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit c; only meaningful when B64CHAR(c)
   is true.  Relies on ASCII character codes. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out while at least six bits are pending in ch;
   bits is the number of pending bits and is reduced by six per digit. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units to out while at least sixteen bits are pending in
   ch.  A unit in 0xDC00..0xDFFF sets errmsg and jumps to the utf7Error
   label of the enclosing function, so the macro can only be used where
   `errmsg` and `utf7Error` exist. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001392
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001393PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001394 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001395 const char *errors)
1396{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001397 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001398 Py_ssize_t startinpos;
1399 Py_ssize_t endinpos;
1400 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001401 const char *e;
1402 PyUnicodeObject *unicode;
1403 Py_UNICODE *p;
1404 const char *errmsg = "";
1405 int inShift = 0;
1406 unsigned int bitsleft = 0;
1407 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408 int surrogate = 0;
1409 PyObject *errorHandler = NULL;
1410 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001411
1412 unicode = _PyUnicode_New(size);
1413 if (!unicode)
1414 return NULL;
1415 if (size == 0)
1416 return (PyObject *)unicode;
1417
1418 p = unicode->str;
1419 e = s + size;
1420
1421 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001422 Py_UNICODE ch;
1423 restart:
1424 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001425
1426 if (inShift) {
1427 if ((ch == '-') || !B64CHAR(ch)) {
1428 inShift = 0;
1429 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001430
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001431 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1432 if (bitsleft >= 6) {
1433 /* The shift sequence has a partial character in it. If
1434 bitsleft < 6 then we could just classify it as padding
1435 but that is not the case here */
1436
1437 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001438 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001439 }
1440 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001441 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001442 here so indicate the potential of a misencoded character. */
1443
1444 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1445 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1446 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001447 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001448 }
1449
1450 if (ch == '-') {
1451 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001452 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001453 inShift = 1;
1454 }
1455 } else if (SPECIAL(ch,0,0)) {
1456 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001457 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001458 } else {
1459 *p++ = ch;
1460 }
1461 } else {
1462 charsleft = (charsleft << 6) | UB64(ch);
1463 bitsleft += 6;
1464 s++;
1465 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1466 }
1467 }
1468 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001469 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001470 s++;
1471 if (s < e && *s == '-') {
1472 s++;
1473 *p++ = '+';
1474 } else
1475 {
1476 inShift = 1;
1477 bitsleft = 0;
1478 }
1479 }
1480 else if (SPECIAL(ch,0,0)) {
1481 errmsg = "unexpected special character";
1482 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001483 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001484 }
1485 else {
1486 *p++ = ch;
1487 s++;
1488 }
1489 continue;
1490 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001491 outpos = p-PyUnicode_AS_UNICODE(unicode);
1492 endinpos = s-starts;
1493 if (unicode_decode_call_errorhandler(
1494 errors, &errorHandler,
1495 "utf7", errmsg,
1496 starts, size, &startinpos, &endinpos, &exc, &s,
1497 (PyObject **)&unicode, &outpos, &p))
1498 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001499 }
1500
1501 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001502 outpos = p-PyUnicode_AS_UNICODE(unicode);
1503 endinpos = size;
1504 if (unicode_decode_call_errorhandler(
1505 errors, &errorHandler,
1506 "utf7", "unterminated shift sequence",
1507 starts, size, &startinpos, &endinpos, &exc, &s,
1508 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510 if (s < e)
1511 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512 }
1513
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001514 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001515 goto onError;
1516
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001517 Py_XDECREF(errorHandler);
1518 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519 return (PyObject *)unicode;
1520
1521onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001522 Py_XDECREF(errorHandler);
1523 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001524 Py_DECREF(unicode);
1525 return NULL;
1526}
1527
1528
1529PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001530 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001531 int encodeSetO,
1532 int encodeWhiteSpace,
1533 const char *errors)
1534{
1535 PyObject *v;
1536 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001537 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001539 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001540 unsigned int bitsleft = 0;
1541 unsigned long charsleft = 0;
1542 char * out;
1543 char * start;
1544
1545 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001546 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547
Walter Dörwald51ab4142007-05-05 14:43:36 +00001548 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 if (v == NULL)
1550 return NULL;
1551
Walter Dörwald51ab4142007-05-05 14:43:36 +00001552 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 for (;i < size; ++i) {
1554 Py_UNICODE ch = s[i];
1555
1556 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001557 if (ch == '+') {
1558 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559 *out++ = '-';
1560 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1561 charsleft = ch;
1562 bitsleft = 16;
1563 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001564 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001566 } else {
1567 *out++ = (char) ch;
1568 }
1569 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1571 *out++ = B64(charsleft << (6-bitsleft));
1572 charsleft = 0;
1573 bitsleft = 0;
1574 /* Characters not in the BASE64 set implicitly unshift the sequence
1575 so no '-' is required, except if the character is itself a '-' */
1576 if (B64CHAR(ch) || ch == '-') {
1577 *out++ = '-';
1578 }
1579 inShift = 0;
1580 *out++ = (char) ch;
1581 } else {
1582 bitsleft += 16;
1583 charsleft = (charsleft << 16) | ch;
1584 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1585
1586 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001587 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 or '-' then the shift sequence will be terminated implicitly and we
1589 don't have to insert a '-'. */
1590
1591 if (bitsleft == 0) {
1592 if (i + 1 < size) {
1593 Py_UNICODE ch2 = s[i+1];
1594
1595 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001596
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 } else if (B64CHAR(ch2) || ch2 == '-') {
1598 *out++ = '-';
1599 inShift = 0;
1600 } else {
1601 inShift = 0;
1602 }
1603
1604 }
1605 else {
1606 *out++ = '-';
1607 inShift = 0;
1608 }
1609 }
Tim Petersced69f82003-09-16 20:30:58 +00001610 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001612 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001613 if (bitsleft) {
1614 *out++= B64(charsleft << (6-bitsleft) );
1615 *out++ = '-';
1616 }
1617
Walter Dörwald51ab4142007-05-05 14:43:36 +00001618 if (PyBytes_Resize(v, out - start)) {
1619 Py_DECREF(v);
1620 return NULL;
1621 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 return v;
1623}
1624
1625#undef SPECIAL
1626#undef B64
1627#undef B64CHAR
1628#undef UB64
1629#undef ENCODE
1630#undef DECODE
1631
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632/* --- UTF-8 Codec -------------------------------------------------------- */
1633
/* Map a UTF-8 encoded prefix byte to the length in bytes of the sequence it
   introduces.  Zero means illegal prefix.  See RFC 2279 for details.
   This is a pure lookup table, so it is declared const to keep it from
   being modified by accident. */
static const
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four bytes; 0xF8-0xFB: five; 0xFC-0xFD: six;
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1655
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001657 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658 const char *errors)
1659{
Walter Dörwald69652032004-09-07 20:24:22 +00001660 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1661}
1662
1663PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001664 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001665 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001666 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001667{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001668 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001670 Py_ssize_t startinpos;
1671 Py_ssize_t endinpos;
1672 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 const char *e;
1674 PyUnicodeObject *unicode;
1675 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001676 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001677 PyObject *errorHandler = NULL;
1678 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679
1680 /* Note: size will always be longer than the resulting Unicode
1681 character count */
1682 unicode = _PyUnicode_New(size);
1683 if (!unicode)
1684 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001685 if (size == 0) {
1686 if (consumed)
1687 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690
1691 /* Unpack UTF-8 encoded data */
1692 p = unicode->str;
1693 e = s + size;
1694
1695 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001696 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697
1698 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001699 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 s++;
1701 continue;
1702 }
1703
1704 n = utf8_code_length[ch];
1705
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001706 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001707 if (consumed)
1708 break;
1709 else {
1710 errmsg = "unexpected end of data";
1711 startinpos = s-starts;
1712 endinpos = size;
1713 goto utf8Error;
1714 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716
1717 switch (n) {
1718
1719 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001720 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 startinpos = s-starts;
1722 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001723 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
1725 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001726 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001727 startinpos = s-starts;
1728 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001729 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730
1731 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001732 if ((s[1] & 0xc0) != 0x80) {
1733 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001734 startinpos = s-starts;
1735 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001736 goto utf8Error;
1737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001739 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001740 startinpos = s-starts;
1741 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001742 errmsg = "illegal encoding";
1743 goto utf8Error;
1744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001746 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 break;
1748
1749 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001750 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001751 (s[2] & 0xc0) != 0x80) {
1752 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 startinpos = s-starts;
1754 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001755 goto utf8Error;
1756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001758 if (ch < 0x0800) {
1759 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001760 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001761
1762 XXX For wide builds (UCS-4) we should probably try
1763 to recombine the surrogates into a single code
1764 unit.
1765 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001766 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001767 startinpos = s-starts;
1768 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001769 goto utf8Error;
1770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001772 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001773 break;
1774
1775 case 4:
1776 if ((s[1] & 0xc0) != 0x80 ||
1777 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 (s[3] & 0xc0) != 0x80) {
1779 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 startinpos = s-starts;
1781 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001782 goto utf8Error;
1783 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001784 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1785 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1786 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001787 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001788 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001789 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001790 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001791 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001792 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001793 startinpos = s-starts;
1794 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001795 goto utf8Error;
1796 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001797#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001798 *p++ = (Py_UNICODE)ch;
1799#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001800 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001801
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001802 /* translate from 10000..10FFFF to 0..FFFF */
1803 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001804
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001805 /* high surrogate = top 10 bits added to D800 */
1806 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001807
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001808 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001809 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001810#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 break;
1812
1813 default:
1814 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001815 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 startinpos = s-starts;
1817 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001818 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 }
1820 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001822
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001823 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 outpos = p-PyUnicode_AS_UNICODE(unicode);
1825 if (unicode_decode_call_errorhandler(
1826 errors, &errorHandler,
1827 "utf8", errmsg,
1828 starts, size, &startinpos, &endinpos, &exc, &s,
1829 (PyObject **)&unicode, &outpos, &p))
1830 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 }
Walter Dörwald69652032004-09-07 20:24:22 +00001832 if (consumed)
1833 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834
1835 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001836 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 goto onError;
1838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 Py_XDECREF(errorHandler);
1840 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 return (PyObject *)unicode;
1842
1843onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 Py_XDECREF(errorHandler);
1845 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 Py_DECREF(unicode);
1847 return NULL;
1848}
1849
Tim Peters602f7402002-04-27 18:03:26 +00001850/* Allocation strategy: if the string is short, convert into a stack buffer
1851 and allocate exactly as much space needed at the end. Else allocate the
1852 maximum possible needed (4 result bytes per Unicode character), and return
1853 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001854*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001855PyObject *
1856PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001857 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859{
Tim Peters602f7402002-04-27 18:03:26 +00001860#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001861
Martin v. Löwis18e16552006-02-15 17:27:45 +00001862 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001863 PyObject *v; /* result string object */
1864 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001865 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001866 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001867 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001868
Tim Peters602f7402002-04-27 18:03:26 +00001869 assert(s != NULL);
1870 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871
Tim Peters602f7402002-04-27 18:03:26 +00001872 if (size <= MAX_SHORT_UNICHARS) {
1873 /* Write into the stack buffer; nallocated can't overflow.
1874 * At the end, we'll allocate exactly as much heap space as it
1875 * turns out we need.
1876 */
1877 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1878 v = NULL; /* will allocate after we're done */
1879 p = stackbuf;
1880 }
1881 else {
1882 /* Overallocate on the heap, and give the excess back at the end. */
1883 nallocated = size * 4;
1884 if (nallocated / 4 != size) /* overflow! */
1885 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001886 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001887 if (v == NULL)
1888 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001889 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001890 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001891
Tim Peters602f7402002-04-27 18:03:26 +00001892 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001893 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001894
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001895 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001896 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001898
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001900 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001901 *p++ = (char)(0xc0 | (ch >> 6));
1902 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001903 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001904 else {
Tim Peters602f7402002-04-27 18:03:26 +00001905 /* Encode UCS2 Unicode ordinals */
1906 if (ch < 0x10000) {
1907 /* Special case: check for high surrogate */
1908 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1909 Py_UCS4 ch2 = s[i];
1910 /* Check for low surrogate and combine the two to
1911 form a UCS4 value */
1912 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001913 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001914 i++;
1915 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 }
Tim Peters602f7402002-04-27 18:03:26 +00001917 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001918 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001919 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001920 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1921 *p++ = (char)(0x80 | (ch & 0x3f));
1922 continue;
1923 }
1924encodeUCS4:
1925 /* Encode UCS4 Unicode ordinals */
1926 *p++ = (char)(0xf0 | (ch >> 18));
1927 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1928 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1929 *p++ = (char)(0x80 | (ch & 0x3f));
1930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001932
Tim Peters602f7402002-04-27 18:03:26 +00001933 if (v == NULL) {
1934 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001935 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001936 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001937 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001938 }
1939 else {
1940 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001941 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001942 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001943 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001944 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001946
Tim Peters602f7402002-04-27 18:03:26 +00001947#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948}
1949
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1951{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 if (!PyUnicode_Check(unicode)) {
1953 PyErr_BadArgument();
1954 return NULL;
1955 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001956 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1957 PyUnicode_GET_SIZE(unicode),
1958 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959}
1960
1961/* --- UTF-16 Codec ------------------------------------------------------- */
1962
Tim Peters772747b2001-08-09 22:21:55 +00001963PyObject *
1964PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001965 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001966 const char *errors,
1967 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968{
Walter Dörwald69652032004-09-07 20:24:22 +00001969 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1970}
1971
1972PyObject *
1973PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001974 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001975 const char *errors,
1976 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001977 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001978{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001979 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001980 Py_ssize_t startinpos;
1981 Py_ssize_t endinpos;
1982 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 PyUnicodeObject *unicode;
1984 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001985 const unsigned char *q, *e;
1986 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001987 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001988 /* Offsets from q for retrieving byte pairs in the right order. */
1989#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1990 int ihi = 1, ilo = 0;
1991#else
1992 int ihi = 0, ilo = 1;
1993#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001994 PyObject *errorHandler = NULL;
1995 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996
1997 /* Note: size will always be longer than the resulting Unicode
1998 character count */
1999 unicode = _PyUnicode_New(size);
2000 if (!unicode)
2001 return NULL;
2002 if (size == 0)
2003 return (PyObject *)unicode;
2004
2005 /* Unpack UTF-16 encoded data */
2006 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002007 q = (unsigned char *)s;
2008 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009
2010 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002011 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002013 /* Check for BOM marks (U+FEFF) in the input and adjust current
2014 byte order setting accordingly. In native mode, the leading BOM
2015 mark is skipped, in all other modes, it is copied to the output
2016 stream as-is (giving a ZWNBSP character). */
2017 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002018 if (size >= 2) {
2019 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002020#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002021 if (bom == 0xFEFF) {
2022 q += 2;
2023 bo = -1;
2024 }
2025 else if (bom == 0xFFFE) {
2026 q += 2;
2027 bo = 1;
2028 }
Tim Petersced69f82003-09-16 20:30:58 +00002029#else
Walter Dörwald69652032004-09-07 20:24:22 +00002030 if (bom == 0xFEFF) {
2031 q += 2;
2032 bo = 1;
2033 }
2034 else if (bom == 0xFFFE) {
2035 q += 2;
2036 bo = -1;
2037 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002038#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002039 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041
Tim Peters772747b2001-08-09 22:21:55 +00002042 if (bo == -1) {
2043 /* force LE */
2044 ihi = 1;
2045 ilo = 0;
2046 }
2047 else if (bo == 1) {
2048 /* force BE */
2049 ihi = 0;
2050 ilo = 1;
2051 }
2052
2053 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002055 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002057 if (consumed)
2058 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002059 errmsg = "truncated data";
2060 startinpos = ((const char *)q)-starts;
2061 endinpos = ((const char *)e)-starts;
2062 goto utf16Error;
2063 /* The remaining input chars are ignored if the callback
2064 chooses to skip the input */
2065 }
2066 ch = (q[ihi] << 8) | q[ilo];
2067
Tim Peters772747b2001-08-09 22:21:55 +00002068 q += 2;
2069
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 if (ch < 0xD800 || ch > 0xDFFF) {
2071 *p++ = ch;
2072 continue;
2073 }
2074
2075 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002076 if (q >= e) {
2077 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078 startinpos = (((const char *)q)-2)-starts;
2079 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002080 goto utf16Error;
2081 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002082 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002083 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2084 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002085 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002086#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002087 *p++ = ch;
2088 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002089#else
2090 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002091#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002092 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002093 }
2094 else {
2095 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002096 startinpos = (((const char *)q)-4)-starts;
2097 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002098 goto utf16Error;
2099 }
2100
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002102 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 startinpos = (((const char *)q)-2)-starts;
2104 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002105 /* Fall through to report the error */
2106
2107 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002108 outpos = p-PyUnicode_AS_UNICODE(unicode);
2109 if (unicode_decode_call_errorhandler(
2110 errors, &errorHandler,
2111 "utf16", errmsg,
2112 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2113 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002114 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 }
2116
2117 if (byteorder)
2118 *byteorder = bo;
2119
Walter Dörwald69652032004-09-07 20:24:22 +00002120 if (consumed)
2121 *consumed = (const char *)q-starts;
2122
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002124 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 goto onError;
2126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 Py_XDECREF(errorHandler);
2128 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 return (PyObject *)unicode;
2130
2131onError:
2132 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 Py_XDECREF(errorHandler);
2134 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 return NULL;
2136}
2137
Tim Peters772747b2001-08-09 22:21:55 +00002138PyObject *
2139PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002140 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002141 const char *errors,
2142 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143{
2144 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002145 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002146#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002147 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002148#else
2149 const int pairs = 0;
2150#endif
Tim Peters772747b2001-08-09 22:21:55 +00002151 /* Offsets from p for storing byte pairs in the right order. */
2152#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2153 int ihi = 1, ilo = 0;
2154#else
2155 int ihi = 0, ilo = 1;
2156#endif
2157
2158#define STORECHAR(CH) \
2159 do { \
2160 p[ihi] = ((CH) >> 8) & 0xff; \
2161 p[ilo] = (CH) & 0xff; \
2162 p += 2; \
2163 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002165#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002166 for (i = pairs = 0; i < size; i++)
2167 if (s[i] >= 0x10000)
2168 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002169#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002170 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002171 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 if (v == NULL)
2173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174
Walter Dörwald3cc34522007-05-04 10:48:27 +00002175 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002177 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002178 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002179 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002180
2181 if (byteorder == -1) {
2182 /* force LE */
2183 ihi = 1;
2184 ilo = 0;
2185 }
2186 else if (byteorder == 1) {
2187 /* force BE */
2188 ihi = 0;
2189 ilo = 1;
2190 }
2191
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002192 while (size-- > 0) {
2193 Py_UNICODE ch = *s++;
2194 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002195#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002196 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002197 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2198 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002200#endif
Tim Peters772747b2001-08-09 22:21:55 +00002201 STORECHAR(ch);
2202 if (ch2)
2203 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002206#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207}
2208
2209PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2210{
2211 if (!PyUnicode_Check(unicode)) {
2212 PyErr_BadArgument();
2213 return NULL;
2214 }
2215 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2216 PyUnicode_GET_SIZE(unicode),
2217 NULL,
2218 0);
2219}
2220
2221/* --- Unicode Escape Codec ----------------------------------------------- */
2222
/* File-scope pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on demand by the escape decoder when a
   named-character escape is first seen -- confirm against the code that
   assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002224
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002226 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 const char *errors)
2228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002230 Py_ssize_t startinpos;
2231 Py_ssize_t endinpos;
2232 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002237 char* message;
2238 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239 PyObject *errorHandler = NULL;
2240 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002241
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 /* Escaped strings will always be longer than the resulting
2243 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 length after conversion to the true value.
2245 (but if the error callback returns a long replacement string
2246 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 v = _PyUnicode_New(size);
2248 if (v == NULL)
2249 goto onError;
2250 if (size == 0)
2251 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002253 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002255
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 while (s < end) {
2257 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002258 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260
2261 /* Non-escape characters are interpreted as Unicode ordinals */
2262 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002263 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264 continue;
2265 }
2266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 /* \ - Escapes */
2269 s++;
2270 switch (*s++) {
2271
2272 /* \x escapes */
2273 case '\n': break;
2274 case '\\': *p++ = '\\'; break;
2275 case '\'': *p++ = '\''; break;
2276 case '\"': *p++ = '\"'; break;
2277 case 'b': *p++ = '\b'; break;
2278 case 'f': *p++ = '\014'; break; /* FF */
2279 case 't': *p++ = '\t'; break;
2280 case 'n': *p++ = '\n'; break;
2281 case 'r': *p++ = '\r'; break;
2282 case 'v': *p++ = '\013'; break; /* VT */
2283 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2284
2285 /* \OOO (octal) escapes */
2286 case '0': case '1': case '2': case '3':
2287 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002288 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002290 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002292 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002294 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 break;
2296
Fredrik Lundhccc74732001-02-18 22:13:49 +00002297 /* hex escapes */
2298 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002300 digits = 2;
2301 message = "truncated \\xXX escape";
2302 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303
Fredrik Lundhccc74732001-02-18 22:13:49 +00002304 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002306 digits = 4;
2307 message = "truncated \\uXXXX escape";
2308 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309
Fredrik Lundhccc74732001-02-18 22:13:49 +00002310 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002311 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002312 digits = 8;
2313 message = "truncated \\UXXXXXXXX escape";
2314 hexescape:
2315 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002316 outpos = p-PyUnicode_AS_UNICODE(v);
2317 if (s+digits>end) {
2318 endinpos = size;
2319 if (unicode_decode_call_errorhandler(
2320 errors, &errorHandler,
2321 "unicodeescape", "end of string in escape sequence",
2322 starts, size, &startinpos, &endinpos, &exc, &s,
2323 (PyObject **)&v, &outpos, &p))
2324 goto onError;
2325 goto nextByte;
2326 }
2327 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002328 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002329 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002330 endinpos = (s+i+1)-starts;
2331 if (unicode_decode_call_errorhandler(
2332 errors, &errorHandler,
2333 "unicodeescape", message,
2334 starts, size, &startinpos, &endinpos, &exc, &s,
2335 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002336 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002337 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002338 }
2339 chr = (chr<<4) & ~0xF;
2340 if (c >= '0' && c <= '9')
2341 chr += c - '0';
2342 else if (c >= 'a' && c <= 'f')
2343 chr += 10 + c - 'a';
2344 else
2345 chr += 10 + c - 'A';
2346 }
2347 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002348 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 /* _decoding_error will have already written into the
2350 target buffer. */
2351 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002352 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002353 /* when we get here, chr is a 32-bit unicode character */
2354 if (chr <= 0xffff)
2355 /* UCS-2 character */
2356 *p++ = (Py_UNICODE) chr;
2357 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002358 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002359 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002360#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002361 *p++ = chr;
2362#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002363 chr -= 0x10000L;
2364 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002365 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002366#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002367 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002368 endinpos = s-starts;
2369 outpos = p-PyUnicode_AS_UNICODE(v);
2370 if (unicode_decode_call_errorhandler(
2371 errors, &errorHandler,
2372 "unicodeescape", "illegal Unicode character",
2373 starts, size, &startinpos, &endinpos, &exc, &s,
2374 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002375 goto onError;
2376 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002377 break;
2378
2379 /* \N{name} */
2380 case 'N':
2381 message = "malformed \\N character escape";
2382 if (ucnhash_CAPI == NULL) {
2383 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002384 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002385 m = PyImport_ImportModule("unicodedata");
2386 if (m == NULL)
2387 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002388 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002389 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002390 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002391 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002392 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002393 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002394 if (ucnhash_CAPI == NULL)
2395 goto ucnhashError;
2396 }
2397 if (*s == '{') {
2398 const char *start = s+1;
2399 /* look for the closing brace */
2400 while (*s != '}' && s < end)
2401 s++;
2402 if (s > start && s < end && *s == '}') {
2403 /* found a name. look it up in the unicode database */
2404 message = "unknown Unicode character name";
2405 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002406 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002407 goto store;
2408 }
2409 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002410 endinpos = s-starts;
2411 outpos = p-PyUnicode_AS_UNICODE(v);
2412 if (unicode_decode_call_errorhandler(
2413 errors, &errorHandler,
2414 "unicodeescape", message,
2415 starts, size, &startinpos, &endinpos, &exc, &s,
2416 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002417 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002418 break;
2419
2420 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002421 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002422 message = "\\ at end of string";
2423 s--;
2424 endinpos = s-starts;
2425 outpos = p-PyUnicode_AS_UNICODE(v);
2426 if (unicode_decode_call_errorhandler(
2427 errors, &errorHandler,
2428 "unicodeescape", message,
2429 starts, size, &startinpos, &endinpos, &exc, &s,
2430 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002431 goto onError;
2432 }
2433 else {
2434 *p++ = '\\';
2435 *p++ = (unsigned char)s[-1];
2436 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002437 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002439 nextByte:
2440 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002442 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002443 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002444 Py_XDECREF(errorHandler);
2445 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002447
Fredrik Lundhccc74732001-02-18 22:13:49 +00002448ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002449 PyErr_SetString(
2450 PyExc_UnicodeError,
2451 "\\N escapes not supported (can't load unicodedata module)"
2452 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002453 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002454 Py_XDECREF(errorHandler);
2455 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002456 return NULL;
2457
Fredrik Lundhccc74732001-02-18 22:13:49 +00002458onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002460 Py_XDECREF(errorHandler);
2461 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 return NULL;
2463}
2464
2465/* Return a Unicode-Escape string version of the Unicode object.
2466
2467 If quotes is true, the string is enclosed in u"" or u'' quotes as
2468 appropriate.
2469
2470*/
2471
Thomas Wouters477c8d52006-05-27 19:21:47 +00002472Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2473 Py_ssize_t size,
2474 Py_UNICODE ch)
2475{
2476 /* like wcschr, but doesn't stop at NULL characters */
2477
2478 while (size-- > 0) {
2479 if (*s == ch)
2480 return s;
2481 s++;
2482 }
2483
2484 return NULL;
2485}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002486
Walter Dörwald79e913e2007-05-12 11:08:06 +00002487static const char *hexdigits = "0123456789abcdef";
2488
2489PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2490 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491{
2492 PyObject *repr;
2493 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494
Thomas Wouters89f507f2006-12-13 04:49:30 +00002495 /* XXX(nnorwitz): rather than over-allocating, it would be
2496 better to choose a different scheme. Perhaps scan the
2497 first N-chars of the string and allocate based on that size.
2498 */
2499 /* Initial allocation is based on the longest-possible unichr
2500 escape.
2501
2502 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2503 unichr, so in this case it's the longest unichr escape. In
2504 narrow (UTF-16) builds this is five chars per source unichr
2505 since there are two unichrs in the surrogate pair, so in narrow
2506 (UTF-16) builds it's not the longest unichr escape.
2507
2508 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2509 so in the narrow (UTF-16) build case it's the longest unichr
2510 escape.
2511 */
2512
Walter Dörwald79e913e2007-05-12 11:08:06 +00002513 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002514#ifdef Py_UNICODE_WIDE
2515 + 10*size
2516#else
2517 + 6*size
2518#endif
2519 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 if (repr == NULL)
2521 return NULL;
2522
Walter Dörwald79e913e2007-05-12 11:08:06 +00002523 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 while (size-- > 0) {
2526 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002527
Walter Dörwald79e913e2007-05-12 11:08:06 +00002528 /* Escape backslashes */
2529 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 *p++ = '\\';
2531 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002532 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002533 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002534
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002535#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002536 /* Map 21-bit characters to '\U00xxxxxx' */
2537 else if (ch >= 0x10000) {
2538 *p++ = '\\';
2539 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002540 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2541 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2542 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2543 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2544 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2545 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2546 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2547 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002548 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002549 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002550#else
2551 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002552 else if (ch >= 0xD800 && ch < 0xDC00) {
2553 Py_UNICODE ch2;
2554 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002555
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002556 ch2 = *s++;
2557 size--;
2558 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2559 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2560 *p++ = '\\';
2561 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002562 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2563 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2564 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2565 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2566 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2567 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2568 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2569 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002570 continue;
2571 }
2572 /* Fall through: isolated surrogates are copied as-is */
2573 s--;
2574 size++;
2575 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002576#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002577
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002579 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 *p++ = '\\';
2581 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002582 *p++ = hexdigits[(ch >> 12) & 0x000F];
2583 *p++ = hexdigits[(ch >> 8) & 0x000F];
2584 *p++ = hexdigits[(ch >> 4) & 0x000F];
2585 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002587
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002588 /* Map special whitespace to '\t', \n', '\r' */
2589 else if (ch == '\t') {
2590 *p++ = '\\';
2591 *p++ = 't';
2592 }
2593 else if (ch == '\n') {
2594 *p++ = '\\';
2595 *p++ = 'n';
2596 }
2597 else if (ch == '\r') {
2598 *p++ = '\\';
2599 *p++ = 'r';
2600 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002601
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002602 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002603 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002605 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002606 *p++ = hexdigits[(ch >> 4) & 0x000F];
2607 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002608 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002609
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 /* Copy everything else as-is */
2611 else
2612 *p++ = (char) ch;
2613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614
2615 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002616 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2617 Py_DECREF(repr);
2618 return NULL;
2619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 return repr;
2621}
2622
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2624{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002625 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 if (!PyUnicode_Check(unicode)) {
2627 PyErr_BadArgument();
2628 return NULL;
2629 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002630 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2631 PyUnicode_GET_SIZE(unicode));
2632
2633 if (!s)
2634 return NULL;
2635 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2636 PyBytes_GET_SIZE(s));
2637 Py_DECREF(s);
2638 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639}
2640
2641/* --- Raw Unicode Escape Codec ------------------------------------------- */
2642
2643PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 const char *errors)
2646{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002647 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002648 Py_ssize_t startinpos;
2649 Py_ssize_t endinpos;
2650 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002652 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 const char *end;
2654 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002655 PyObject *errorHandler = NULL;
2656 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002657
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 /* Escaped strings will always be longer than the resulting
2659 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 length after conversion to the true value. (But decoding error
2661 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 v = _PyUnicode_New(size);
2663 if (v == NULL)
2664 goto onError;
2665 if (size == 0)
2666 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 end = s + size;
2669 while (s < end) {
2670 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002671 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002673 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674
2675 /* Non-escape characters are interpreted as Unicode ordinals */
2676 if (*s != '\\') {
2677 *p++ = (unsigned char)*s++;
2678 continue;
2679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002680 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681
2682 /* \u-escapes are only interpreted iff the number of leading
2683 backslashes if odd */
2684 bs = s;
2685 for (;s < end;) {
2686 if (*s != '\\')
2687 break;
2688 *p++ = (unsigned char)*s++;
2689 }
2690 if (((s - bs) & 1) == 0 ||
2691 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002692 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 continue;
2694 }
2695 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002696 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 s++;
2698
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002699 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002701 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002702 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 endinpos = s-starts;
2705 if (unicode_decode_call_errorhandler(
2706 errors, &errorHandler,
2707 "rawunicodeescape", "truncated \\uXXXX",
2708 starts, size, &startinpos, &endinpos, &exc, &s,
2709 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 }
2713 x = (x<<4) & ~0xF;
2714 if (c >= '0' && c <= '9')
2715 x += c - '0';
2716 else if (c >= 'a' && c <= 'f')
2717 x += 10 + c - 'a';
2718 else
2719 x += 10 + c - 'A';
2720 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002721#ifndef Py_UNICODE_WIDE
2722 if (x > 0x10000) {
2723 if (unicode_decode_call_errorhandler(
2724 errors, &errorHandler,
2725 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2726 starts, size, &startinpos, &endinpos, &exc, &s,
2727 (PyObject **)&v, &outpos, &p))
2728 goto onError;
2729 }
2730#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 *p++ = x;
2732 nextByte:
2733 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002735 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002736 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 Py_XDECREF(errorHandler);
2738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002740
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 onError:
2742 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 Py_XDECREF(errorHandler);
2744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 return NULL;
2746}
2747
2748PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002749 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750{
2751 PyObject *repr;
2752 char *p;
2753 char *q;
2754
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002755#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002756 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002757#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002758 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002759#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760 if (repr == NULL)
2761 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002762 if (size == 0)
2763 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764
Walter Dörwald711005d2007-05-12 12:03:26 +00002765 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 while (size-- > 0) {
2767 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002768#ifdef Py_UNICODE_WIDE
2769 /* Map 32-bit characters to '\Uxxxxxxxx' */
2770 if (ch >= 0x10000) {
2771 *p++ = '\\';
2772 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002773 *p++ = hexdigits[(ch >> 28) & 0xf];
2774 *p++ = hexdigits[(ch >> 24) & 0xf];
2775 *p++ = hexdigits[(ch >> 20) & 0xf];
2776 *p++ = hexdigits[(ch >> 16) & 0xf];
2777 *p++ = hexdigits[(ch >> 12) & 0xf];
2778 *p++ = hexdigits[(ch >> 8) & 0xf];
2779 *p++ = hexdigits[(ch >> 4) & 0xf];
2780 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002781 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002782 else
2783#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 /* Map 16-bit characters to '\uxxxx' */
2785 if (ch >= 256) {
2786 *p++ = '\\';
2787 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002788 *p++ = hexdigits[(ch >> 12) & 0xf];
2789 *p++ = hexdigits[(ch >> 8) & 0xf];
2790 *p++ = hexdigits[(ch >> 4) & 0xf];
2791 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 }
2793 /* Copy everything else as-is */
2794 else
2795 *p++ = (char) ch;
2796 }
2797 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002798 if (PyBytes_Resize(repr, p - q)) {
2799 Py_DECREF(repr);
2800 return NULL;
2801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 return repr;
2803}
2804
2805PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2806{
Walter Dörwald711005d2007-05-12 12:03:26 +00002807 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002809 PyErr_BadArgument();
2810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002812 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2813 PyUnicode_GET_SIZE(unicode));
2814
2815 if (!s)
2816 return NULL;
2817 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2818 PyBytes_GET_SIZE(s));
2819 Py_DECREF(s);
2820 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821}
2822
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002823/* --- Unicode Internal Codec ------------------------------------------- */
2824
2825PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002826 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002827 const char *errors)
2828{
2829 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002830 Py_ssize_t startinpos;
2831 Py_ssize_t endinpos;
2832 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002833 PyUnicodeObject *v;
2834 Py_UNICODE *p;
2835 const char *end;
2836 const char *reason;
2837 PyObject *errorHandler = NULL;
2838 PyObject *exc = NULL;
2839
Neal Norwitzd43069c2006-01-08 01:12:10 +00002840#ifdef Py_UNICODE_WIDE
2841 Py_UNICODE unimax = PyUnicode_GetMax();
2842#endif
2843
Thomas Wouters89f507f2006-12-13 04:49:30 +00002844 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002845 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2846 if (v == NULL)
2847 goto onError;
2848 if (PyUnicode_GetSize((PyObject *)v) == 0)
2849 return (PyObject *)v;
2850 p = PyUnicode_AS_UNICODE(v);
2851 end = s + size;
2852
2853 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002854 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002855 /* We have to sanity check the raw data, otherwise doom looms for
2856 some malformed UCS-4 data. */
2857 if (
2858 #ifdef Py_UNICODE_WIDE
2859 *p > unimax || *p < 0 ||
2860 #endif
2861 end-s < Py_UNICODE_SIZE
2862 )
2863 {
2864 startinpos = s - starts;
2865 if (end-s < Py_UNICODE_SIZE) {
2866 endinpos = end-starts;
2867 reason = "truncated input";
2868 }
2869 else {
2870 endinpos = s - starts + Py_UNICODE_SIZE;
2871 reason = "illegal code point (> 0x10FFFF)";
2872 }
2873 outpos = p - PyUnicode_AS_UNICODE(v);
2874 if (unicode_decode_call_errorhandler(
2875 errors, &errorHandler,
2876 "unicode_internal", reason,
2877 starts, size, &startinpos, &endinpos, &exc, &s,
2878 (PyObject **)&v, &outpos, &p)) {
2879 goto onError;
2880 }
2881 }
2882 else {
2883 p++;
2884 s += Py_UNICODE_SIZE;
2885 }
2886 }
2887
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002888 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002889 goto onError;
2890 Py_XDECREF(errorHandler);
2891 Py_XDECREF(exc);
2892 return (PyObject *)v;
2893
2894 onError:
2895 Py_XDECREF(v);
2896 Py_XDECREF(errorHandler);
2897 Py_XDECREF(exc);
2898 return NULL;
2899}
2900
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901/* --- Latin-1 Codec ------------------------------------------------------ */
2902
2903PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002904 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 const char *errors)
2906{
2907 PyUnicodeObject *v;
2908 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002909
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002911 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002912 Py_UNICODE r = *(unsigned char*)s;
2913 return PyUnicode_FromUnicode(&r, 1);
2914 }
2915
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 v = _PyUnicode_New(size);
2917 if (v == NULL)
2918 goto onError;
2919 if (size == 0)
2920 return (PyObject *)v;
2921 p = PyUnicode_AS_UNICODE(v);
2922 while (size-- > 0)
2923 *p++ = (unsigned char)*s++;
2924 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002925
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 onError:
2927 Py_XDECREF(v);
2928 return NULL;
2929}
2930
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931/* create or adjust a UnicodeEncodeError */
2932static void make_encode_exception(PyObject **exceptionObject,
2933 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002934 const Py_UNICODE *unicode, Py_ssize_t size,
2935 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002938 if (*exceptionObject == NULL) {
2939 *exceptionObject = PyUnicodeEncodeError_Create(
2940 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 }
2942 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2944 goto onError;
2945 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2946 goto onError;
2947 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2948 goto onError;
2949 return;
2950 onError:
2951 Py_DECREF(*exceptionObject);
2952 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 }
2954}
2955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956/* raises a UnicodeEncodeError */
2957static void raise_encode_exception(PyObject **exceptionObject,
2958 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002959 const Py_UNICODE *unicode, Py_ssize_t size,
2960 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 const char *reason)
2962{
2963 make_encode_exception(exceptionObject,
2964 encoding, unicode, size, startpos, endpos, reason);
2965 if (*exceptionObject != NULL)
2966 PyCodec_StrictErrors(*exceptionObject);
2967}
2968
2969/* error handling callback helper:
2970 build arguments, call the callback and check the arguments,
2971 put the result into newpos and return the replacement string, which
2972 has to be freed by the caller */
2973static PyObject *unicode_encode_call_errorhandler(const char *errors,
2974 PyObject **errorHandler,
2975 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002976 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2977 Py_ssize_t startpos, Py_ssize_t endpos,
2978 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002980 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002981
2982 PyObject *restuple;
2983 PyObject *resunicode;
2984
2985 if (*errorHandler == NULL) {
2986 *errorHandler = PyCodec_LookupError(errors);
2987 if (*errorHandler == NULL)
2988 return NULL;
2989 }
2990
2991 make_encode_exception(exceptionObject,
2992 encoding, unicode, size, startpos, endpos, reason);
2993 if (*exceptionObject == NULL)
2994 return NULL;
2995
2996 restuple = PyObject_CallFunctionObjArgs(
2997 *errorHandler, *exceptionObject, NULL);
2998 if (restuple == NULL)
2999 return NULL;
3000 if (!PyTuple_Check(restuple)) {
3001 PyErr_Format(PyExc_TypeError, &argparse[4]);
3002 Py_DECREF(restuple);
3003 return NULL;
3004 }
3005 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3006 &resunicode, newpos)) {
3007 Py_DECREF(restuple);
3008 return NULL;
3009 }
3010 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003011 *newpos = size+*newpos;
3012 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003013 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003014 Py_DECREF(restuple);
3015 return NULL;
3016 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017 Py_INCREF(resunicode);
3018 Py_DECREF(restuple);
3019 return resunicode;
3020}
3021
3022static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003023 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024 const char *errors,
3025 int limit)
3026{
3027 /* output object */
3028 PyObject *res;
3029 /* pointers to the beginning and end+1 of input */
3030 const Py_UNICODE *startp = p;
3031 const Py_UNICODE *endp = p + size;
3032 /* pointer to the beginning of the unencodable characters */
3033 /* const Py_UNICODE *badp = NULL; */
3034 /* pointer into the output */
3035 char *str;
3036 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003037 Py_ssize_t respos = 0;
3038 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003039 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3040 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 PyObject *errorHandler = NULL;
3042 PyObject *exc = NULL;
3043 /* the following variable is used for caching string comparisons
3044 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3045 int known_errorHandler = -1;
3046
3047 /* allocate enough for a simple encoding without
3048 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003049 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 if (res == NULL)
3051 goto onError;
3052 if (size == 0)
3053 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003054 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 ressize = size;
3056
3057 while (p<endp) {
3058 Py_UNICODE c = *p;
3059
3060 /* can we encode this? */
3061 if (c<limit) {
3062 /* no overflow check, because we know that the space is enough */
3063 *str++ = (char)c;
3064 ++p;
3065 }
3066 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003067 Py_ssize_t unicodepos = p-startp;
3068 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003069 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003070 Py_ssize_t repsize;
3071 Py_ssize_t newpos;
3072 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003073 Py_UNICODE *uni2;
3074 /* startpos for collecting unencodable chars */
3075 const Py_UNICODE *collstart = p;
3076 const Py_UNICODE *collend = p;
3077 /* find all unecodable characters */
3078 while ((collend < endp) && ((*collend)>=limit))
3079 ++collend;
3080 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3081 if (known_errorHandler==-1) {
3082 if ((errors==NULL) || (!strcmp(errors, "strict")))
3083 known_errorHandler = 1;
3084 else if (!strcmp(errors, "replace"))
3085 known_errorHandler = 2;
3086 else if (!strcmp(errors, "ignore"))
3087 known_errorHandler = 3;
3088 else if (!strcmp(errors, "xmlcharrefreplace"))
3089 known_errorHandler = 4;
3090 else
3091 known_errorHandler = 0;
3092 }
3093 switch (known_errorHandler) {
3094 case 1: /* strict */
3095 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3096 goto onError;
3097 case 2: /* replace */
3098 while (collstart++<collend)
3099 *str++ = '?'; /* fall through */
3100 case 3: /* ignore */
3101 p = collend;
3102 break;
3103 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003104 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105 /* determine replacement size (temporarily (mis)uses p) */
3106 for (p = collstart, repsize = 0; p < collend; ++p) {
3107 if (*p<10)
3108 repsize += 2+1+1;
3109 else if (*p<100)
3110 repsize += 2+2+1;
3111 else if (*p<1000)
3112 repsize += 2+3+1;
3113 else if (*p<10000)
3114 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003115#ifndef Py_UNICODE_WIDE
3116 else
3117 repsize += 2+5+1;
3118#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 else if (*p<100000)
3120 repsize += 2+5+1;
3121 else if (*p<1000000)
3122 repsize += 2+6+1;
3123 else
3124 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003125#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 }
3127 requiredsize = respos+repsize+(endp-collend);
3128 if (requiredsize > ressize) {
3129 if (requiredsize<2*ressize)
3130 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003131 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003133 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003134 ressize = requiredsize;
3135 }
3136 /* generate replacement (temporarily (mis)uses p) */
3137 for (p = collstart; p < collend; ++p) {
3138 str += sprintf(str, "&#%d;", (int)*p);
3139 }
3140 p = collend;
3141 break;
3142 default:
3143 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3144 encoding, reason, startp, size, &exc,
3145 collstart-startp, collend-startp, &newpos);
3146 if (repunicode == NULL)
3147 goto onError;
3148 /* need more space? (at least enough for what we
3149 have+the replacement+the rest of the string, so
3150 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003151 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003152 repsize = PyUnicode_GET_SIZE(repunicode);
3153 requiredsize = respos+repsize+(endp-collend);
3154 if (requiredsize > ressize) {
3155 if (requiredsize<2*ressize)
3156 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003157 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 Py_DECREF(repunicode);
3159 goto onError;
3160 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003161 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003162 ressize = requiredsize;
3163 }
3164 /* check if there is anything unencodable in the replacement
3165 and copy it to the output */
3166 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3167 c = *uni2;
3168 if (c >= limit) {
3169 raise_encode_exception(&exc, encoding, startp, size,
3170 unicodepos, unicodepos+1, reason);
3171 Py_DECREF(repunicode);
3172 goto onError;
3173 }
3174 *str = (char)c;
3175 }
3176 p = startp + newpos;
3177 Py_DECREF(repunicode);
3178 }
3179 }
3180 }
3181 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003182 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003183 if (respos<ressize)
3184 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003185 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 Py_XDECREF(errorHandler);
3187 Py_XDECREF(exc);
3188 return res;
3189
3190 onError:
3191 Py_XDECREF(res);
3192 Py_XDECREF(errorHandler);
3193 Py_XDECREF(exc);
3194 return NULL;
3195}
3196
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003198 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 const char *errors)
3200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202}
3203
3204PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3205{
3206 if (!PyUnicode_Check(unicode)) {
3207 PyErr_BadArgument();
3208 return NULL;
3209 }
3210 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3211 PyUnicode_GET_SIZE(unicode),
3212 NULL);
3213}
3214
3215/* --- 7-bit ASCII Codec -------------------------------------------------- */
3216
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003218 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 const char *errors)
3220{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 PyUnicodeObject *v;
3223 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003224 Py_ssize_t startinpos;
3225 Py_ssize_t endinpos;
3226 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227 const char *e;
3228 PyObject *errorHandler = NULL;
3229 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003230
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003232 if (size == 1 && *(unsigned char*)s < 128) {
3233 Py_UNICODE r = *(unsigned char*)s;
3234 return PyUnicode_FromUnicode(&r, 1);
3235 }
Tim Petersced69f82003-09-16 20:30:58 +00003236
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 v = _PyUnicode_New(size);
3238 if (v == NULL)
3239 goto onError;
3240 if (size == 0)
3241 return (PyObject *)v;
3242 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 e = s + size;
3244 while (s < e) {
3245 register unsigned char c = (unsigned char)*s;
3246 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 ++s;
3249 }
3250 else {
3251 startinpos = s-starts;
3252 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003253 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003254 if (unicode_decode_call_errorhandler(
3255 errors, &errorHandler,
3256 "ascii", "ordinal not in range(128)",
3257 starts, size, &startinpos, &endinpos, &exc, &s,
3258 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003262 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003263 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003264 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 Py_XDECREF(errorHandler);
3266 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003268
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 onError:
3270 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_XDECREF(errorHandler);
3272 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 return NULL;
3274}
3275
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003277 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 const char *errors)
3279{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281}
3282
3283PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3284{
3285 if (!PyUnicode_Check(unicode)) {
3286 PyErr_BadArgument();
3287 return NULL;
3288 }
3289 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3290 PyUnicode_GET_SIZE(unicode),
3291 NULL);
3292}
3293
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003294#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003295
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003296/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003297
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003298#if SIZEOF_INT < SIZEOF_SSIZE_T
3299#define NEED_RETRY
3300#endif
3301
3302/* XXX This code is limited to "true" double-byte encodings, as
3303 a) it assumes an incomplete character consists of a single byte, and
3304 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3305 encodings, see IsDBCSLeadByteEx documentation. */
3306
/* Return non-zero if the byte at s[offset] is a DBCS lead byte.
   A byte that IsDBCSLeadByte() accepts is only counted when the
   character before it (as located by CharPrev) shows that it is not
   itself the trail byte of a preceding character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3317
3318/*
3319 * Decode MBCS string into unicode object. If 'final' is set, converts
3320 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3321 */
3322static int decode_mbcs(PyUnicodeObject **v,
3323 const char *s, /* MBCS string */
3324 int size, /* sizeof MBCS string */
3325 int final)
3326{
3327 Py_UNICODE *p;
3328 Py_ssize_t n = 0;
3329 int usize = 0;
3330
3331 assert(size >= 0);
3332
3333 /* Skip trailing lead-byte unless 'final' is set */
3334 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3335 --size;
3336
3337 /* First get the size of the result */
3338 if (size > 0) {
3339 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3340 if (usize == 0) {
3341 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3342 return -1;
3343 }
3344 }
3345
3346 if (*v == NULL) {
3347 /* Create unicode object */
3348 *v = _PyUnicode_New(usize);
3349 if (*v == NULL)
3350 return -1;
3351 }
3352 else {
3353 /* Extend unicode object */
3354 n = PyUnicode_GET_SIZE(*v);
3355 if (_PyUnicode_Resize(v, n + usize) < 0)
3356 return -1;
3357 }
3358
3359 /* Do the conversion */
3360 if (size > 0) {
3361 p = PyUnicode_AS_UNICODE(*v) + n;
3362 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3363 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3364 return -1;
3365 }
3366 }
3367
3368 return size;
3369}
3370
3371PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3372 Py_ssize_t size,
3373 const char *errors,
3374 Py_ssize_t *consumed)
3375{
3376 PyUnicodeObject *v = NULL;
3377 int done;
3378
3379 if (consumed)
3380 *consumed = 0;
3381
3382#ifdef NEED_RETRY
3383 retry:
3384 if (size > INT_MAX)
3385 done = decode_mbcs(&v, s, INT_MAX, 0);
3386 else
3387#endif
3388 done = decode_mbcs(&v, s, (int)size, !consumed);
3389
3390 if (done < 0) {
3391 Py_XDECREF(v);
3392 return NULL;
3393 }
3394
3395 if (consumed)
3396 *consumed += done;
3397
3398#ifdef NEED_RETRY
3399 if (size > INT_MAX) {
3400 s += done;
3401 size -= done;
3402 goto retry;
3403 }
3404#endif
3405
3406 return (PyObject *)v;
3407}
3408
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003409PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003410 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003411 const char *errors)
3412{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003413 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3414}
3415
3416/*
3417 * Convert unicode into string object (MBCS).
3418 * Returns 0 if succeed, -1 otherwise.
3419 */
3420static int encode_mbcs(PyObject **repr,
3421 const Py_UNICODE *p, /* unicode */
3422 int size) /* size of unicode */
3423{
3424 int mbcssize = 0;
3425 Py_ssize_t n = 0;
3426
3427 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003428
3429 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003430 if (size > 0) {
3431 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3432 if (mbcssize == 0) {
3433 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3434 return -1;
3435 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003436 }
3437
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003438 if (*repr == NULL) {
3439 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003440 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003441 if (*repr == NULL)
3442 return -1;
3443 }
3444 else {
3445 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003446 n = PyBytes_Size(*repr);
3447 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003448 return -1;
3449 }
3450
3451 /* Do the conversion */
3452 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003453 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003454 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3455 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3456 return -1;
3457 }
3458 }
3459
3460 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003461}
3462
3463PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003464 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003465 const char *errors)
3466{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003467 PyObject *repr = NULL;
3468 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003469
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003470#ifdef NEED_RETRY
3471 retry:
3472 if (size > INT_MAX)
3473 ret = encode_mbcs(&repr, p, INT_MAX);
3474 else
3475#endif
3476 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003477
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003478 if (ret < 0) {
3479 Py_XDECREF(repr);
3480 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003481 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003482
3483#ifdef NEED_RETRY
3484 if (size > INT_MAX) {
3485 p += INT_MAX;
3486 size -= INT_MAX;
3487 goto retry;
3488 }
3489#endif
3490
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003491 return repr;
3492}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003493
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003494PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3495{
3496 if (!PyUnicode_Check(unicode)) {
3497 PyErr_BadArgument();
3498 return NULL;
3499 }
3500 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3501 PyUnicode_GET_SIZE(unicode),
3502 NULL);
3503}
3504
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003505#undef NEED_RETRY
3506
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003507#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003508
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509/* --- Character Mapping Codec -------------------------------------------- */
3510
/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL          -> plain Latin-1 decoding.
   mapping is exactly a     -> the byte value indexes the string; a byte
   unicode string              beyond its length or an entry of 0xfffe
                               counts as undefined.
   any other object         -> mapping[byte] is looked up; the result may
                               be an integer in range(65536), a unicode
                               string (empty, one or several characters)
                               or None; None and a LookupError both
                               count as undefined.

   Undefined bytes are passed to the decoding error handler selected by
   `errors`.  Returns a new unicode object or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* output slots added by 1-n mappings that have not been used yet */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a decoding table string */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* s, v, outpos and p are passed by address; the handler
                   is expected to advance them, which is why s is not
                   incremented on this path */
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* generic path: look every byte up in the mapping object */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first; grow by more than strictly
                           needed so that later 1-n mappings can use the
                           surplus recorded in extrachars */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the buffer may have moved: recompute p */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* shrink the result if fewer characters were produced than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

 onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
3671
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003672/* Charmap encoding: the lookup table */
3673
/* Three-level lookup table (trie) used for charmap encoding of
   characters up to 0xFFFF.  The object is allocated with extra room
   behind level23: 16*count2 bytes of level-2 data followed by
   128*count3 bytes of level-3 data. */
struct encoding_map{
    PyObject_HEAD
    /* indexed by (c >> 11); holds a level-2 block number, 0xFF = unused */
    unsigned char level1[32];
    /* number of 16-entry level-2 blocks and 128-entry level-3 blocks */
    int count2, count3;
    /* start of the variable-sized level-2/level-3 area; declared with
       one element, which the size computations subtract again */
    unsigned char level23[1];
};
3680
3681static PyObject*
3682encoding_map_size(PyObject *obj, PyObject* args)
3683{
3684 struct encoding_map *map = (struct encoding_map*)obj;
3685 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3686 128*map->count3);
3687}
3688
/* Method table of the EncodingMap type: a single argument-less size()
   method, terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
3694
/* tp_dealloc of the EncodingMap type: the object holds no references to
   other objects, so releasing its memory block is all that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
3700
/* Type object for struct encoding_map.  Besides name, basic size and
   the default flags, only tp_dealloc and tp_methods are filled in;
   every other slot is left at 0. */
static PyTypeObject EncodingMapType = {
    PyObject_HEAD_INIT(NULL)
    0,                      /*ob_size*/
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
3745
3746PyObject*
3747PyUnicode_BuildEncodingMap(PyObject* string)
3748{
3749 Py_UNICODE *decode;
3750 PyObject *result;
3751 struct encoding_map *mresult;
3752 int i;
3753 int need_dict = 0;
3754 unsigned char level1[32];
3755 unsigned char level2[512];
3756 unsigned char *mlevel1, *mlevel2, *mlevel3;
3757 int count2 = 0, count3 = 0;
3758
3759 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3760 PyErr_BadArgument();
3761 return NULL;
3762 }
3763 decode = PyUnicode_AS_UNICODE(string);
3764 memset(level1, 0xFF, sizeof level1);
3765 memset(level2, 0xFF, sizeof level2);
3766
3767 /* If there isn't a one-to-one mapping of NULL to \0,
3768 or if there are non-BMP characters, we need to use
3769 a mapping dictionary. */
3770 if (decode[0] != 0)
3771 need_dict = 1;
3772 for (i = 1; i < 256; i++) {
3773 int l1, l2;
3774 if (decode[i] == 0
3775 #ifdef Py_UNICODE_WIDE
3776 || decode[i] > 0xFFFF
3777 #endif
3778 ) {
3779 need_dict = 1;
3780 break;
3781 }
3782 if (decode[i] == 0xFFFE)
3783 /* unmapped character */
3784 continue;
3785 l1 = decode[i] >> 11;
3786 l2 = decode[i] >> 7;
3787 if (level1[l1] == 0xFF)
3788 level1[l1] = count2++;
3789 if (level2[l2] == 0xFF)
3790 level2[l2] = count3++;
3791 }
3792
3793 if (count2 >= 0xFF || count3 >= 0xFF)
3794 need_dict = 1;
3795
3796 if (need_dict) {
3797 PyObject *result = PyDict_New();
3798 PyObject *key, *value;
3799 if (!result)
3800 return NULL;
3801 for (i = 0; i < 256; i++) {
3802 key = value = NULL;
3803 key = PyInt_FromLong(decode[i]);
3804 value = PyInt_FromLong(i);
3805 if (!key || !value)
3806 goto failed1;
3807 if (PyDict_SetItem(result, key, value) == -1)
3808 goto failed1;
3809 Py_DECREF(key);
3810 Py_DECREF(value);
3811 }
3812 return result;
3813 failed1:
3814 Py_XDECREF(key);
3815 Py_XDECREF(value);
3816 Py_DECREF(result);
3817 return NULL;
3818 }
3819
3820 /* Create a three-level trie */
3821 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3822 16*count2 + 128*count3 - 1);
3823 if (!result)
3824 return PyErr_NoMemory();
3825 PyObject_Init(result, &EncodingMapType);
3826 mresult = (struct encoding_map*)result;
3827 mresult->count2 = count2;
3828 mresult->count3 = count3;
3829 mlevel1 = mresult->level1;
3830 mlevel2 = mresult->level23;
3831 mlevel3 = mresult->level23 + 16*count2;
3832 memcpy(mlevel1, level1, 32);
3833 memset(mlevel2, 0xFF, 16*count2);
3834 memset(mlevel3, 0, 128*count3);
3835 count3 = 0;
3836 for (i = 1; i < 256; i++) {
3837 int o1, o2, o3, i2, i3;
3838 if (decode[i] == 0xFFFE)
3839 /* unmapped character */
3840 continue;
3841 o1 = decode[i]>>11;
3842 o2 = (decode[i]>>7) & 0xF;
3843 i2 = 16*mlevel1[o1] + o2;
3844 if (mlevel2[i2] == 0xFF)
3845 mlevel2[i2] = count3++;
3846 o3 = decode[i] & 0x7F;
3847 i3 = 128*mlevel2[i2] + o3;
3848 mlevel3[i3] = i;
3849 }
3850 return result;
3851}
3852
3853static int
3854encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3855{
3856 struct encoding_map *map = (struct encoding_map*)mapping;
3857 int l1 = c>>11;
3858 int l2 = (c>>7) & 0xF;
3859 int l3 = c & 0x7F;
3860 int i;
3861
3862#ifdef Py_UNICODE_WIDE
3863 if (c > 0xFFFF) {
3864 return -1;
3865 }
3866#endif
3867 if (c == 0)
3868 return 0;
3869 /* level 1*/
3870 i = map->level1[l1];
3871 if (i == 0xFF) {
3872 return -1;
3873 }
3874 /* level 2*/
3875 i = map->level23[16*i+l2];
3876 if (i == 0xFF) {
3877 return -1;
3878 }
3879 /* level 3 */
3880 i = map->level23[16*map->count2 + 128*i + l3];
3881 if (i == 0) {
3882 return -1;
3883 }
3884 return i;
3885}
3886
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887/* Lookup the character ch in the mapping. If the character
3888 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003889 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 PyObject *w = PyInt_FromLong((long)c);
3893 PyObject *x;
3894
3895 if (w == NULL)
3896 return NULL;
3897 x = PyObject_GetItem(mapping, w);
3898 Py_DECREF(w);
3899 if (x == NULL) {
3900 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3901 /* No mapping found means: mapping is undefined. */
3902 PyErr_Clear();
3903 x = Py_None;
3904 Py_INCREF(x);
3905 return x;
3906 } else
3907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003909 else if (x == Py_None)
3910 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 else if (PyInt_Check(x)) {
3912 long value = PyInt_AS_LONG(x);
3913 if (value < 0 || value > 255) {
3914 PyErr_SetString(PyExc_TypeError,
3915 "character mapping must be in range(256)");
3916 Py_DECREF(x);
3917 return NULL;
3918 }
3919 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 else if (PyString_Check(x))
3922 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003925 PyErr_Format(PyExc_TypeError,
3926 "character mapping must return integer, None or str8, not %.400s",
3927 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 Py_DECREF(x);
3929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 }
3931}
3932
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003933static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003934charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003935{
Walter Dörwald827b0552007-05-12 13:23:53 +00003936 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003937 /* exponentially overallocate to minimize reallocations */
3938 if (requiredsize < 2*outsize)
3939 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003940 if (PyBytes_Resize(outobj, requiredsize)) {
3941 Py_DECREF(outobj);
3942 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003943 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003944 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003945}
3946
/* Outcome of trying to encode a single character via charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or an output resize reported an error */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003951 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 space is available. Return a new reference to the object that
3953 was put in the output buffer, or Py_None, if the mapping was undefined
3954 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003955 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003957charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003958 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003959{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003960 PyObject *rep;
3961 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003962 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003964 if (mapping->ob_type == &EncodingMapType) {
3965 int res = encoding_map_lookup(c, mapping);
3966 Py_ssize_t requiredsize = *outpos+1;
3967 if (res == -1)
3968 return enc_FAILED;
3969 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003970 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003971 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003972 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003973 outstart[(*outpos)++] = (char)res;
3974 return enc_SUCCESS;
3975 }
3976
3977 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003979 return enc_EXCEPTION;
3980 else if (rep==Py_None) {
3981 Py_DECREF(rep);
3982 return enc_FAILED;
3983 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003985 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003986 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003987 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003989 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003991 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3993 }
3994 else {
3995 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003996 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3997 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003998 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003999 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004001 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004003 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 memcpy(outstart + *outpos, repchars, repsize);
4005 *outpos += repsize;
4006 }
4007 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004008 Py_DECREF(rep);
4009 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010}
4011
4012/* handle an error in PyUnicode_EncodeCharmap
4013 Return 0 on success, -1 on error */
4014static
4015int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004016 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004018 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004019 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004020{
4021 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004022 Py_ssize_t repsize;
4023 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 Py_UNICODE *uni2;
4025 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004026 Py_ssize_t collstartpos = *inpos;
4027 Py_ssize_t collendpos = *inpos+1;
4028 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 char *encoding = "charmap";
4030 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004031 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033 /* find all unencodable characters */
4034 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004035 PyObject *rep;
4036 if (mapping->ob_type == &EncodingMapType) {
4037 int res = encoding_map_lookup(p[collendpos], mapping);
4038 if (res != -1)
4039 break;
4040 ++collendpos;
4041 continue;
4042 }
4043
4044 rep = charmapencode_lookup(p[collendpos], mapping);
4045 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004047 else if (rep!=Py_None) {
4048 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049 break;
4050 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004051 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 ++collendpos;
4053 }
4054 /* cache callback name lookup
4055 * (if not done yet, i.e. it's the first error) */
4056 if (*known_errorHandler==-1) {
4057 if ((errors==NULL) || (!strcmp(errors, "strict")))
4058 *known_errorHandler = 1;
4059 else if (!strcmp(errors, "replace"))
4060 *known_errorHandler = 2;
4061 else if (!strcmp(errors, "ignore"))
4062 *known_errorHandler = 3;
4063 else if (!strcmp(errors, "xmlcharrefreplace"))
4064 *known_errorHandler = 4;
4065 else
4066 *known_errorHandler = 0;
4067 }
4068 switch (*known_errorHandler) {
4069 case 1: /* strict */
4070 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4071 return -1;
4072 case 2: /* replace */
4073 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4074 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004075 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076 return -1;
4077 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004078 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4080 return -1;
4081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 }
4083 /* fall through */
4084 case 3: /* ignore */
4085 *inpos = collendpos;
4086 break;
4087 case 4: /* xmlcharrefreplace */
4088 /* generate replacement (temporarily (mis)uses p) */
4089 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4090 char buffer[2+29+1+1];
4091 char *cp;
4092 sprintf(buffer, "&#%d;", (int)p[collpos]);
4093 for (cp = buffer; *cp; ++cp) {
4094 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004095 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004096 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004097 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4099 return -1;
4100 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 }
4102 }
4103 *inpos = collendpos;
4104 break;
4105 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004106 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 encoding, reason, p, size, exceptionObject,
4108 collstartpos, collendpos, &newpos);
4109 if (repunicode == NULL)
4110 return -1;
4111 /* generate replacement */
4112 repsize = PyUnicode_GET_SIZE(repunicode);
4113 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4114 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004115 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 return -1;
4117 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004118 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4121 return -1;
4122 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 }
4124 *inpos = newpos;
4125 Py_DECREF(repunicode);
4126 }
4127 return 0;
4128}
4129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004131 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 PyObject *mapping,
4133 const char *errors)
4134{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 /* output object */
4136 PyObject *res = NULL;
4137 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004138 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004140 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 PyObject *errorHandler = NULL;
4142 PyObject *exc = NULL;
4143 /* the following variable is used for caching string comparisons
4144 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4145 * 3=ignore, 4=xmlcharrefreplace */
4146 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147
4148 /* Default to Latin-1 */
4149 if (mapping == NULL)
4150 return PyUnicode_EncodeLatin1(p, size, errors);
4151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 /* allocate enough for a simple encoding without
4153 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004154 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 if (res == NULL)
4156 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004157 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 while (inpos<size) {
4161 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004162 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004163 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004165 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 if (charmap_encoding_error(p, size, &inpos, mapping,
4167 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004168 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004169 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004170 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 else
4174 /* done with this character => adjust input position */
4175 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004179 if (respos<PyBytes_GET_SIZE(res)) {
4180 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 goto onError;
4182 }
4183 Py_XDECREF(exc);
4184 Py_XDECREF(errorHandler);
4185 return res;
4186
4187 onError:
4188 Py_XDECREF(res);
4189 Py_XDECREF(exc);
4190 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 return NULL;
4192}
4193
4194PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4195 PyObject *mapping)
4196{
4197 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4198 PyErr_BadArgument();
4199 return NULL;
4200 }
4201 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4202 PyUnicode_GET_SIZE(unicode),
4203 mapping,
4204 NULL);
4205}
4206
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207/* create or adjust a UnicodeTranslateError */
4208static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004209 const Py_UNICODE *unicode, Py_ssize_t size,
4210 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 if (*exceptionObject == NULL) {
4214 *exceptionObject = PyUnicodeTranslateError_Create(
4215 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 }
4217 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4219 goto onError;
4220 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4221 goto onError;
4222 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4223 goto onError;
4224 return;
4225 onError:
4226 Py_DECREF(*exceptionObject);
4227 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 }
4229}
4230
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231/* raises a UnicodeTranslateError */
4232static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004233 const Py_UNICODE *unicode, Py_ssize_t size,
4234 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 const char *reason)
4236{
4237 make_translate_exception(exceptionObject,
4238 unicode, size, startpos, endpos, reason);
4239 if (*exceptionObject != NULL)
4240 PyCodec_StrictErrors(*exceptionObject);
4241}
4242
4243/* error handling callback helper:
4244 build arguments, call the callback and check the arguments,
4245 put the result into newpos and return the replacement string, which
4246 has to be freed by the caller */
4247static PyObject *unicode_translate_call_errorhandler(const char *errors,
4248 PyObject **errorHandler,
4249 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004250 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4251 Py_ssize_t startpos, Py_ssize_t endpos,
4252 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004254 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004256 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 PyObject *restuple;
4258 PyObject *resunicode;
4259
4260 if (*errorHandler == NULL) {
4261 *errorHandler = PyCodec_LookupError(errors);
4262 if (*errorHandler == NULL)
4263 return NULL;
4264 }
4265
4266 make_translate_exception(exceptionObject,
4267 unicode, size, startpos, endpos, reason);
4268 if (*exceptionObject == NULL)
4269 return NULL;
4270
4271 restuple = PyObject_CallFunctionObjArgs(
4272 *errorHandler, *exceptionObject, NULL);
4273 if (restuple == NULL)
4274 return NULL;
4275 if (!PyTuple_Check(restuple)) {
4276 PyErr_Format(PyExc_TypeError, &argparse[4]);
4277 Py_DECREF(restuple);
4278 return NULL;
4279 }
4280 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004281 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 Py_DECREF(restuple);
4283 return NULL;
4284 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004285 if (i_newpos<0)
4286 *newpos = size+i_newpos;
4287 else
4288 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004289 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004290 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004291 Py_DECREF(restuple);
4292 return NULL;
4293 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 Py_INCREF(resunicode);
4295 Py_DECREF(restuple);
4296 return resunicode;
4297}
4298
4299/* Lookup the character ch in the mapping and put the result in result,
4300 which must be decrefed by the caller.
4301 Return 0 on success, -1 on error */
4302static
4303int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4304{
4305 PyObject *w = PyInt_FromLong((long)c);
4306 PyObject *x;
4307
4308 if (w == NULL)
4309 return -1;
4310 x = PyObject_GetItem(mapping, w);
4311 Py_DECREF(w);
4312 if (x == NULL) {
4313 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4314 /* No mapping found means: use 1:1 mapping. */
4315 PyErr_Clear();
4316 *result = NULL;
4317 return 0;
4318 } else
4319 return -1;
4320 }
4321 else if (x == Py_None) {
4322 *result = x;
4323 return 0;
4324 }
4325 else if (PyInt_Check(x)) {
4326 long value = PyInt_AS_LONG(x);
4327 long max = PyUnicode_GetMax();
4328 if (value < 0 || value > max) {
4329 PyErr_Format(PyExc_TypeError,
4330 "character mapping must be in range(0x%lx)", max+1);
4331 Py_DECREF(x);
4332 return -1;
4333 }
4334 *result = x;
4335 return 0;
4336 }
4337 else if (PyUnicode_Check(x)) {
4338 *result = x;
4339 return 0;
4340 }
4341 else {
4342 /* wrong return value */
4343 PyErr_SetString(PyExc_TypeError,
4344 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004345 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 return -1;
4347 }
4348}
4349/* ensure that *outobj is at least requiredsize characters long,
4350if not reallocate and adjust various state variables.
4351Return 0 on success, -1 on error */
4352static
Walter Dörwald4894c302003-10-24 14:25:28 +00004353int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004354 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004356 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004357 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004359 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004361 if (requiredsize < 2 * oldsize)
4362 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004363 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 return -1;
4365 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 }
4367 return 0;
4368}
4369/* lookup the character, put the result in the output string and adjust
4370 various state variables. Return a new reference to the object that
4371 was put in the output buffer in *result, or Py_None, if the mapping was
4372 undefined (in which case no character was written).
4373 The called must decref result.
4374 Return 0 on success, -1 on error. */
4375static
Walter Dörwald4894c302003-10-24 14:25:28 +00004376int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004377 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004378 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379{
Walter Dörwald4894c302003-10-24 14:25:28 +00004380 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 return -1;
4382 if (*res==NULL) {
4383 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004384 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 }
4386 else if (*res==Py_None)
4387 ;
4388 else if (PyInt_Check(*res)) {
4389 /* no overflow check, because we know that the space is enough */
4390 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4391 }
4392 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 if (repsize==1) {
4395 /* no overflow check, because we know that the space is enough */
4396 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4397 }
4398 else if (repsize!=0) {
4399 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004400 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004401 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004402 repsize - 1;
4403 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 return -1;
4405 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4406 *outp += repsize;
4407 }
4408 }
4409 else
4410 return -1;
4411 return 0;
4412}
4413
4414PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 PyObject *mapping,
4417 const char *errors)
4418{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 /* output object */
4420 PyObject *res = NULL;
4421 /* pointers to the beginning and end+1 of input */
4422 const Py_UNICODE *startp = p;
4423 const Py_UNICODE *endp = p + size;
4424 /* pointer into the output */
4425 Py_UNICODE *str;
4426 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004427 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 char *reason = "character maps to <undefined>";
4429 PyObject *errorHandler = NULL;
4430 PyObject *exc = NULL;
4431 /* the following variable is used for caching string comparisons
4432 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4433 * 3=ignore, 4=xmlcharrefreplace */
4434 int known_errorHandler = -1;
4435
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 if (mapping == NULL) {
4437 PyErr_BadArgument();
4438 return NULL;
4439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440
4441 /* allocate enough for a simple 1:1 translation without
4442 replacements, if we need more, we'll resize */
4443 res = PyUnicode_FromUnicode(NULL, size);
4444 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004445 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 return res;
4448 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 while (p<endp) {
4451 /* try to encode it */
4452 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004453 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 goto onError;
4456 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004457 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 if (x!=Py_None) /* it worked => adjust input pointer */
4459 ++p;
4460 else { /* untranslatable character */
4461 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004462 Py_ssize_t repsize;
4463 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 Py_UNICODE *uni2;
4465 /* startpos for collecting untranslatable chars */
4466 const Py_UNICODE *collstart = p;
4467 const Py_UNICODE *collend = p+1;
4468 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 /* find all untranslatable characters */
4471 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004472 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 goto onError;
4474 Py_XDECREF(x);
4475 if (x!=Py_None)
4476 break;
4477 ++collend;
4478 }
4479 /* cache callback name lookup
4480 * (if not done yet, i.e. it's the first error) */
4481 if (known_errorHandler==-1) {
4482 if ((errors==NULL) || (!strcmp(errors, "strict")))
4483 known_errorHandler = 1;
4484 else if (!strcmp(errors, "replace"))
4485 known_errorHandler = 2;
4486 else if (!strcmp(errors, "ignore"))
4487 known_errorHandler = 3;
4488 else if (!strcmp(errors, "xmlcharrefreplace"))
4489 known_errorHandler = 4;
4490 else
4491 known_errorHandler = 0;
4492 }
4493 switch (known_errorHandler) {
4494 case 1: /* strict */
4495 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4496 goto onError;
4497 case 2: /* replace */
4498 /* No need to check for space, this is a 1:1 replacement */
4499 for (coll = collstart; coll<collend; ++coll)
4500 *str++ = '?';
4501 /* fall through */
4502 case 3: /* ignore */
4503 p = collend;
4504 break;
4505 case 4: /* xmlcharrefreplace */
4506 /* generate replacement (temporarily (mis)uses p) */
4507 for (p = collstart; p < collend; ++p) {
4508 char buffer[2+29+1+1];
4509 char *cp;
4510 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004511 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4513 goto onError;
4514 for (cp = buffer; *cp; ++cp)
4515 *str++ = *cp;
4516 }
4517 p = collend;
4518 break;
4519 default:
4520 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4521 reason, startp, size, &exc,
4522 collstart-startp, collend-startp, &newpos);
4523 if (repunicode == NULL)
4524 goto onError;
4525 /* generate replacement */
4526 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004527 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4529 Py_DECREF(repunicode);
4530 goto onError;
4531 }
4532 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4533 *str++ = *uni2;
4534 p = startp + newpos;
4535 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 }
4537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 /* Resize if we allocated to much */
4540 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004541 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004542 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004543 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 }
4545 Py_XDECREF(exc);
4546 Py_XDECREF(errorHandler);
4547 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 onError:
4550 Py_XDECREF(res);
4551 Py_XDECREF(exc);
4552 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 return NULL;
4554}
4555
4556PyObject *PyUnicode_Translate(PyObject *str,
4557 PyObject *mapping,
4558 const char *errors)
4559{
4560 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004561
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 str = PyUnicode_FromObject(str);
4563 if (str == NULL)
4564 goto onError;
4565 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4566 PyUnicode_GET_SIZE(str),
4567 mapping,
4568 errors);
4569 Py_DECREF(str);
4570 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004571
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 onError:
4573 Py_XDECREF(str);
4574 return NULL;
4575}
Tim Petersced69f82003-09-16 20:30:58 +00004576
Guido van Rossum9e896b32000-04-05 20:11:21 +00004577/* --- Decimal Encoder ---------------------------------------------------- */
4578
4579int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004580 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004581 char *output,
4582 const char *errors)
4583{
4584 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 PyObject *errorHandler = NULL;
4586 PyObject *exc = NULL;
4587 const char *encoding = "decimal";
4588 const char *reason = "invalid decimal Unicode string";
4589 /* the following variable is used for caching string comparisons
4590 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4591 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004592
4593 if (output == NULL) {
4594 PyErr_BadArgument();
4595 return -1;
4596 }
4597
4598 p = s;
4599 end = s + length;
4600 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004602 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004604 Py_ssize_t repsize;
4605 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 Py_UNICODE *uni2;
4607 Py_UNICODE *collstart;
4608 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004609
Guido van Rossum9e896b32000-04-05 20:11:21 +00004610 if (Py_UNICODE_ISSPACE(ch)) {
4611 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004613 continue;
4614 }
4615 decimal = Py_UNICODE_TODECIMAL(ch);
4616 if (decimal >= 0) {
4617 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004619 continue;
4620 }
Guido van Rossumba477042000-04-06 18:18:10 +00004621 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004622 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004624 continue;
4625 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 /* All other characters are considered unencodable */
4627 collstart = p;
4628 collend = p+1;
4629 while (collend < end) {
4630 if ((0 < *collend && *collend < 256) ||
4631 !Py_UNICODE_ISSPACE(*collend) ||
4632 Py_UNICODE_TODECIMAL(*collend))
4633 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004634 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004635 /* cache callback name lookup
4636 * (if not done yet, i.e. it's the first error) */
4637 if (known_errorHandler==-1) {
4638 if ((errors==NULL) || (!strcmp(errors, "strict")))
4639 known_errorHandler = 1;
4640 else if (!strcmp(errors, "replace"))
4641 known_errorHandler = 2;
4642 else if (!strcmp(errors, "ignore"))
4643 known_errorHandler = 3;
4644 else if (!strcmp(errors, "xmlcharrefreplace"))
4645 known_errorHandler = 4;
4646 else
4647 known_errorHandler = 0;
4648 }
4649 switch (known_errorHandler) {
4650 case 1: /* strict */
4651 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4652 goto onError;
4653 case 2: /* replace */
4654 for (p = collstart; p < collend; ++p)
4655 *output++ = '?';
4656 /* fall through */
4657 case 3: /* ignore */
4658 p = collend;
4659 break;
4660 case 4: /* xmlcharrefreplace */
4661 /* generate replacement (temporarily (mis)uses p) */
4662 for (p = collstart; p < collend; ++p)
4663 output += sprintf(output, "&#%d;", (int)*p);
4664 p = collend;
4665 break;
4666 default:
4667 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4668 encoding, reason, s, length, &exc,
4669 collstart-s, collend-s, &newpos);
4670 if (repunicode == NULL)
4671 goto onError;
4672 /* generate replacement */
4673 repsize = PyUnicode_GET_SIZE(repunicode);
4674 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4675 Py_UNICODE ch = *uni2;
4676 if (Py_UNICODE_ISSPACE(ch))
4677 *output++ = ' ';
4678 else {
4679 decimal = Py_UNICODE_TODECIMAL(ch);
4680 if (decimal >= 0)
4681 *output++ = '0' + decimal;
4682 else if (0 < ch && ch < 256)
4683 *output++ = (char)ch;
4684 else {
4685 Py_DECREF(repunicode);
4686 raise_encode_exception(&exc, encoding,
4687 s, length, collstart-s, collend-s, reason);
4688 goto onError;
4689 }
4690 }
4691 }
4692 p = s + newpos;
4693 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004694 }
4695 }
4696 /* 0-terminate the output string */
4697 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 Py_XDECREF(exc);
4699 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004700 return 0;
4701
4702 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 Py_XDECREF(exc);
4704 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004705 return -1;
4706}
4707
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708/* --- Helpers ------------------------------------------------------------ */
4709
Thomas Wouters477c8d52006-05-27 19:21:47 +00004710#define STRINGLIB_CHAR Py_UNICODE
4711
4712#define STRINGLIB_LEN PyUnicode_GET_SIZE
4713#define STRINGLIB_NEW PyUnicode_FromUnicode
4714#define STRINGLIB_STR PyUnicode_AS_UNICODE
4715
4716Py_LOCAL_INLINE(int)
4717STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004719 if (str[0] != other[0])
4720 return 1;
4721 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722}
4723
Thomas Wouters477c8d52006-05-27 19:21:47 +00004724#define STRINGLIB_EMPTY unicode_empty
4725
4726#include "stringlib/fastsearch.h"
4727
4728#include "stringlib/count.h"
4729#include "stringlib/find.h"
4730#include "stringlib/partition.h"
4731
/* Helper macro to fix up start/end slice values in place.

   Expects Py_ssize_t-like lvalues named `start` and `end` in the calling
   scope and an `obj` pointer exposing a `length` member:
     - a negative `start` is offset by the length, then clamped to 0;
     - an `end` beyond the length is clamped to the length;
     - a negative `end` is offset by the length, then clamped to 0.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used safely as the body of an unbraced
   if/else.  Invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4744
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004746 PyObject *substr,
4747 Py_ssize_t start,
4748 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004750 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004751 PyUnicodeObject* str_obj;
4752 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004753
Thomas Wouters477c8d52006-05-27 19:21:47 +00004754 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4755 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004757 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4758 if (!sub_obj) {
4759 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 return -1;
4761 }
Tim Petersced69f82003-09-16 20:30:58 +00004762
Thomas Wouters477c8d52006-05-27 19:21:47 +00004763 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004764
Thomas Wouters477c8d52006-05-27 19:21:47 +00004765 result = stringlib_count(
4766 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4767 );
4768
4769 Py_DECREF(sub_obj);
4770 Py_DECREF(str_obj);
4771
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 return result;
4773}
4774
Martin v. Löwis18e16552006-02-15 17:27:45 +00004775Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004776 PyObject *sub,
4777 Py_ssize_t start,
4778 Py_ssize_t end,
4779 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004782
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004784 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004785 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004786 sub = PyUnicode_FromObject(sub);
4787 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004788 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004789 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 }
Tim Petersced69f82003-09-16 20:30:58 +00004791
Thomas Wouters477c8d52006-05-27 19:21:47 +00004792 if (direction > 0)
4793 result = stringlib_find_slice(
4794 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4795 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4796 start, end
4797 );
4798 else
4799 result = stringlib_rfind_slice(
4800 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4801 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4802 start, end
4803 );
4804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004806 Py_DECREF(sub);
4807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 return result;
4809}
4810
Tim Petersced69f82003-09-16 20:30:58 +00004811static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812int tailmatch(PyUnicodeObject *self,
4813 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004814 Py_ssize_t start,
4815 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 int direction)
4817{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 if (substring->length == 0)
4819 return 1;
4820
Thomas Wouters477c8d52006-05-27 19:21:47 +00004821 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822
4823 end -= substring->length;
4824 if (end < start)
4825 return 0;
4826
4827 if (direction > 0) {
4828 if (Py_UNICODE_MATCH(self, end, substring))
4829 return 1;
4830 } else {
4831 if (Py_UNICODE_MATCH(self, start, substring))
4832 return 1;
4833 }
4834
4835 return 0;
4836}
4837
Martin v. Löwis18e16552006-02-15 17:27:45 +00004838Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004840 Py_ssize_t start,
4841 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 int direction)
4843{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004844 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 str = PyUnicode_FromObject(str);
4847 if (str == NULL)
4848 return -1;
4849 substr = PyUnicode_FromObject(substr);
4850 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004851 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 return -1;
4853 }
Tim Petersced69f82003-09-16 20:30:58 +00004854
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 result = tailmatch((PyUnicodeObject *)str,
4856 (PyUnicodeObject *)substr,
4857 start, end, direction);
4858 Py_DECREF(str);
4859 Py_DECREF(substr);
4860 return result;
4861}
4862
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863/* Apply fixfct filter to the Unicode object self and return a
4864 reference to the modified object */
4865
Tim Petersced69f82003-09-16 20:30:58 +00004866static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867PyObject *fixup(PyUnicodeObject *self,
4868 int (*fixfct)(PyUnicodeObject *s))
4869{
4870
4871 PyUnicodeObject *u;
4872
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004873 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 if (u == NULL)
4875 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004876
4877 Py_UNICODE_COPY(u->str, self->str, self->length);
4878
Tim Peters7a29bd52001-09-12 03:03:31 +00004879 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 /* fixfct should return TRUE if it modified the buffer. If
4881 FALSE, return a reference to the original buffer instead
4882 (to save space, not time) */
4883 Py_INCREF(self);
4884 Py_DECREF(u);
4885 return (PyObject*) self;
4886 }
4887 return (PyObject*) u;
4888}
4889
Tim Petersced69f82003-09-16 20:30:58 +00004890static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891int fixupper(PyUnicodeObject *self)
4892{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004893 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 Py_UNICODE *s = self->str;
4895 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004896
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 while (len-- > 0) {
4898 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004899
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 ch = Py_UNICODE_TOUPPER(*s);
4901 if (ch != *s) {
4902 status = 1;
4903 *s = ch;
4904 }
4905 s++;
4906 }
4907
4908 return status;
4909}
4910
Tim Petersced69f82003-09-16 20:30:58 +00004911static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912int fixlower(PyUnicodeObject *self)
4913{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004914 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 Py_UNICODE *s = self->str;
4916 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004917
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 while (len-- > 0) {
4919 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004920
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 ch = Py_UNICODE_TOLOWER(*s);
4922 if (ch != *s) {
4923 status = 1;
4924 *s = ch;
4925 }
4926 s++;
4927 }
4928
4929 return status;
4930}
4931
Tim Petersced69f82003-09-16 20:30:58 +00004932static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933int fixswapcase(PyUnicodeObject *self)
4934{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004935 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 Py_UNICODE *s = self->str;
4937 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004938
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 while (len-- > 0) {
4940 if (Py_UNICODE_ISUPPER(*s)) {
4941 *s = Py_UNICODE_TOLOWER(*s);
4942 status = 1;
4943 } else if (Py_UNICODE_ISLOWER(*s)) {
4944 *s = Py_UNICODE_TOUPPER(*s);
4945 status = 1;
4946 }
4947 s++;
4948 }
4949
4950 return status;
4951}
4952
Tim Petersced69f82003-09-16 20:30:58 +00004953static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954int fixcapitalize(PyUnicodeObject *self)
4955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004956 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004957 Py_UNICODE *s = self->str;
4958 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004959
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004960 if (len == 0)
4961 return 0;
4962 if (Py_UNICODE_ISLOWER(*s)) {
4963 *s = Py_UNICODE_TOUPPER(*s);
4964 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004966 s++;
4967 while (--len > 0) {
4968 if (Py_UNICODE_ISUPPER(*s)) {
4969 *s = Py_UNICODE_TOLOWER(*s);
4970 status = 1;
4971 }
4972 s++;
4973 }
4974 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975}
4976
4977static
4978int fixtitle(PyUnicodeObject *self)
4979{
4980 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4981 register Py_UNICODE *e;
4982 int previous_is_cased;
4983
4984 /* Shortcut for single character strings */
4985 if (PyUnicode_GET_SIZE(self) == 1) {
4986 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4987 if (*p != ch) {
4988 *p = ch;
4989 return 1;
4990 }
4991 else
4992 return 0;
4993 }
Tim Petersced69f82003-09-16 20:30:58 +00004994
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995 e = p + PyUnicode_GET_SIZE(self);
4996 previous_is_cased = 0;
4997 for (; p < e; p++) {
4998 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004999
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 if (previous_is_cased)
5001 *p = Py_UNICODE_TOLOWER(ch);
5002 else
5003 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005004
5005 if (Py_UNICODE_ISLOWER(ch) ||
5006 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007 Py_UNICODE_ISTITLE(ch))
5008 previous_is_cased = 1;
5009 else
5010 previous_is_cased = 0;
5011 }
5012 return 1;
5013}
5014
Tim Peters8ce9f162004-08-27 01:49:32 +00005015PyObject *
5016PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017{
Tim Peters8ce9f162004-08-27 01:49:32 +00005018 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005019 const Py_UNICODE blank = ' ';
5020 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005021 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005022 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005023 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5024 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005025 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5026 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005027 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005028 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005029 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030
Tim Peters05eba1f2004-08-27 21:32:02 +00005031 fseq = PySequence_Fast(seq, "");
5032 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005033 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005034 }
5035
Tim Peters91879ab2004-08-27 22:35:44 +00005036 /* Grrrr. A codec may be invoked to convert str objects to
5037 * Unicode, and so it's possible to call back into Python code
5038 * during PyUnicode_FromObject(), and so it's possible for a sick
5039 * codec to change the size of fseq (if seq is a list). Therefore
5040 * we have to keep refetching the size -- can't assume seqlen
5041 * is invariant.
5042 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005043 seqlen = PySequence_Fast_GET_SIZE(fseq);
5044 /* If empty sequence, return u"". */
5045 if (seqlen == 0) {
5046 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5047 goto Done;
5048 }
5049 /* If singleton sequence with an exact Unicode, return that. */
5050 if (seqlen == 1) {
5051 item = PySequence_Fast_GET_ITEM(fseq, 0);
5052 if (PyUnicode_CheckExact(item)) {
5053 Py_INCREF(item);
5054 res = (PyUnicodeObject *)item;
5055 goto Done;
5056 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005057 }
5058
Tim Peters05eba1f2004-08-27 21:32:02 +00005059 /* At least two items to join, or one that isn't exact Unicode. */
5060 if (seqlen > 1) {
5061 /* Set up sep and seplen -- they're needed. */
5062 if (separator == NULL) {
5063 sep = &blank;
5064 seplen = 1;
5065 }
5066 else {
5067 internal_separator = PyUnicode_FromObject(separator);
5068 if (internal_separator == NULL)
5069 goto onError;
5070 sep = PyUnicode_AS_UNICODE(internal_separator);
5071 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005072 /* In case PyUnicode_FromObject() mutated seq. */
5073 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005074 }
5075 }
5076
5077 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005078 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005079 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005080 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005081 res_p = PyUnicode_AS_UNICODE(res);
5082 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005083
Tim Peters05eba1f2004-08-27 21:32:02 +00005084 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005085 Py_ssize_t itemlen;
5086 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005087
5088 item = PySequence_Fast_GET_ITEM(fseq, i);
5089 /* Convert item to Unicode. */
5090 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5091 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005092 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005093 " %.80s found",
5094 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005095 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005096 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005097 item = PyUnicode_FromObject(item);
5098 if (item == NULL)
5099 goto onError;
5100 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005101
Tim Peters91879ab2004-08-27 22:35:44 +00005102 /* In case PyUnicode_FromObject() mutated seq. */
5103 seqlen = PySequence_Fast_GET_SIZE(fseq);
5104
Tim Peters8ce9f162004-08-27 01:49:32 +00005105 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005107 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005108 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005109 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005110 if (i < seqlen - 1) {
5111 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005112 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005113 goto Overflow;
5114 }
5115 if (new_res_used > res_alloc) {
5116 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005117 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005118 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005119 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005120 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005121 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005122 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005123 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005125 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005126 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005128
5129 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005130 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005131 res_p += itemlen;
5132 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005133 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005134 res_p += seplen;
5135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005137 res_used = new_res_used;
5138 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005139
Tim Peters05eba1f2004-08-27 21:32:02 +00005140 /* Shrink res to match the used area; this probably can't fail,
5141 * but it's cheap to check.
5142 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005143 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005144 goto onError;
5145
5146 Done:
5147 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005148 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 return (PyObject *)res;
5150
Tim Peters8ce9f162004-08-27 01:49:32 +00005151 Overflow:
5152 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005153 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005154 Py_DECREF(item);
5155 /* fall through */
5156
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005158 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005159 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005160 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 return NULL;
5162}
5163
Tim Petersced69f82003-09-16 20:30:58 +00005164static
5165PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005166 Py_ssize_t left,
5167 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 Py_UNICODE fill)
5169{
5170 PyUnicodeObject *u;
5171
5172 if (left < 0)
5173 left = 0;
5174 if (right < 0)
5175 right = 0;
5176
Tim Peters7a29bd52001-09-12 03:03:31 +00005177 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 Py_INCREF(self);
5179 return self;
5180 }
5181
5182 u = _PyUnicode_New(left + self->length + right);
5183 if (u) {
5184 if (left)
5185 Py_UNICODE_FILL(u->str, fill, left);
5186 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5187 if (right)
5188 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5189 }
5190
5191 return u;
5192}
5193
/* Append data[left:right] to `list` as a new unicode object.

   Relies on the expanding function providing a `PyObject *str` scratch
   variable, a `PyObject *list` and an `onError` label; control jumps to
   onError if the slice cannot be created or appended.  The temporary
   reference held in `str` is always released.

   Wrapped in do { } while (0) so the expansion is a single statement with
   no dangling else.  Invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_status_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_status_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_status_)                                       \
            goto onError;                                               \
    } while (0)
5204
5205static
5206PyObject *split_whitespace(PyUnicodeObject *self,
5207 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005208 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005210 register Py_ssize_t i;
5211 register Py_ssize_t j;
5212 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 PyObject *str;
5214
5215 for (i = j = 0; i < len; ) {
5216 /* find a token */
5217 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5218 i++;
5219 j = i;
5220 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5221 i++;
5222 if (j < i) {
5223 if (maxcount-- <= 0)
5224 break;
5225 SPLIT_APPEND(self->str, j, i);
5226 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5227 i++;
5228 j = i;
5229 }
5230 }
5231 if (j < len) {
5232 SPLIT_APPEND(self->str, j, len);
5233 }
5234 return list;
5235
5236 onError:
5237 Py_DECREF(list);
5238 return NULL;
5239}
5240
5241PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005242 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005244 register Py_ssize_t i;
5245 register Py_ssize_t j;
5246 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 PyObject *list;
5248 PyObject *str;
5249 Py_UNICODE *data;
5250
5251 string = PyUnicode_FromObject(string);
5252 if (string == NULL)
5253 return NULL;
5254 data = PyUnicode_AS_UNICODE(string);
5255 len = PyUnicode_GET_SIZE(string);
5256
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 list = PyList_New(0);
5258 if (!list)
5259 goto onError;
5260
5261 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005262 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005263
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005265 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267
5268 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005269 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 if (i < len) {
5271 if (data[i] == '\r' && i + 1 < len &&
5272 data[i+1] == '\n')
5273 i += 2;
5274 else
5275 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005276 if (keepends)
5277 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 }
Guido van Rossum86662912000-04-11 15:38:46 +00005279 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 j = i;
5281 }
5282 if (j < len) {
5283 SPLIT_APPEND(data, j, len);
5284 }
5285
5286 Py_DECREF(string);
5287 return list;
5288
5289 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005290 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 Py_DECREF(string);
5292 return NULL;
5293}
5294
Tim Petersced69f82003-09-16 20:30:58 +00005295static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296PyObject *split_char(PyUnicodeObject *self,
5297 PyObject *list,
5298 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005299 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005301 register Py_ssize_t i;
5302 register Py_ssize_t j;
5303 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 PyObject *str;
5305
5306 for (i = j = 0; i < len; ) {
5307 if (self->str[i] == ch) {
5308 if (maxcount-- <= 0)
5309 break;
5310 SPLIT_APPEND(self->str, j, i);
5311 i = j = i + 1;
5312 } else
5313 i++;
5314 }
5315 if (j <= len) {
5316 SPLIT_APPEND(self->str, j, len);
5317 }
5318 return list;
5319
5320 onError:
5321 Py_DECREF(list);
5322 return NULL;
5323}
5324
Tim Petersced69f82003-09-16 20:30:58 +00005325static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326PyObject *split_substring(PyUnicodeObject *self,
5327 PyObject *list,
5328 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005329 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 register Py_ssize_t i;
5332 register Py_ssize_t j;
5333 Py_ssize_t len = self->length;
5334 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 PyObject *str;
5336
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005337 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 if (Py_UNICODE_MATCH(self, i, substring)) {
5339 if (maxcount-- <= 0)
5340 break;
5341 SPLIT_APPEND(self->str, j, i);
5342 i = j = i + sublen;
5343 } else
5344 i++;
5345 }
5346 if (j <= len) {
5347 SPLIT_APPEND(self->str, j, len);
5348 }
5349 return list;
5350
5351 onError:
5352 Py_DECREF(list);
5353 return NULL;
5354}
5355
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005356static
5357PyObject *rsplit_whitespace(PyUnicodeObject *self,
5358 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005359 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005360{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005361 register Py_ssize_t i;
5362 register Py_ssize_t j;
5363 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005364 PyObject *str;
5365
5366 for (i = j = len - 1; i >= 0; ) {
5367 /* find a token */
5368 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5369 i--;
5370 j = i;
5371 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5372 i--;
5373 if (j > i) {
5374 if (maxcount-- <= 0)
5375 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005376 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005377 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5378 i--;
5379 j = i;
5380 }
5381 }
5382 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005383 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005384 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005385 if (PyList_Reverse(list) < 0)
5386 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005387 return list;
5388
5389 onError:
5390 Py_DECREF(list);
5391 return NULL;
5392}
5393
5394static
5395PyObject *rsplit_char(PyUnicodeObject *self,
5396 PyObject *list,
5397 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005398 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005399{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005400 register Py_ssize_t i;
5401 register Py_ssize_t j;
5402 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005403 PyObject *str;
5404
5405 for (i = j = len - 1; i >= 0; ) {
5406 if (self->str[i] == ch) {
5407 if (maxcount-- <= 0)
5408 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005409 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005410 j = i = i - 1;
5411 } else
5412 i--;
5413 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005414 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005415 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005416 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005417 if (PyList_Reverse(list) < 0)
5418 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005419 return list;
5420
5421 onError:
5422 Py_DECREF(list);
5423 return NULL;
5424}
5425
5426static
5427PyObject *rsplit_substring(PyUnicodeObject *self,
5428 PyObject *list,
5429 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005430 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005431{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005432 register Py_ssize_t i;
5433 register Py_ssize_t j;
5434 Py_ssize_t len = self->length;
5435 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005436 PyObject *str;
5437
5438 for (i = len - sublen, j = len; i >= 0; ) {
5439 if (Py_UNICODE_MATCH(self, i, substring)) {
5440 if (maxcount-- <= 0)
5441 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005442 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005443 j = i;
5444 i -= sublen;
5445 } else
5446 i--;
5447 }
5448 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005449 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005450 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005451 if (PyList_Reverse(list) < 0)
5452 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005453 return list;
5454
5455 onError:
5456 Py_DECREF(list);
5457 return NULL;
5458}
5459
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460#undef SPLIT_APPEND
5461
5462static
5463PyObject *split(PyUnicodeObject *self,
5464 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005465 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466{
5467 PyObject *list;
5468
5469 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005470 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471
5472 list = PyList_New(0);
5473 if (!list)
5474 return NULL;
5475
5476 if (substring == NULL)
5477 return split_whitespace(self,list,maxcount);
5478
5479 else if (substring->length == 1)
5480 return split_char(self,list,substring->str[0],maxcount);
5481
5482 else if (substring->length == 0) {
5483 Py_DECREF(list);
5484 PyErr_SetString(PyExc_ValueError, "empty separator");
5485 return NULL;
5486 }
5487 else
5488 return split_substring(self,list,substring,maxcount);
5489}
5490
Tim Petersced69f82003-09-16 20:30:58 +00005491static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005492PyObject *rsplit(PyUnicodeObject *self,
5493 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005494 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005495{
5496 PyObject *list;
5497
5498 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005499 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005500
5501 list = PyList_New(0);
5502 if (!list)
5503 return NULL;
5504
5505 if (substring == NULL)
5506 return rsplit_whitespace(self,list,maxcount);
5507
5508 else if (substring->length == 1)
5509 return rsplit_char(self,list,substring->str[0],maxcount);
5510
5511 else if (substring->length == 0) {
5512 Py_DECREF(list);
5513 PyErr_SetString(PyExc_ValueError, "empty separator");
5514 return NULL;
5515 }
5516 else
5517 return rsplit_substring(self,list,substring,maxcount);
5518}
5519
5520static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521PyObject *replace(PyUnicodeObject *self,
5522 PyUnicodeObject *str1,
5523 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005524 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525{
5526 PyUnicodeObject *u;
5527
5528 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005529 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
Thomas Wouters477c8d52006-05-27 19:21:47 +00005531 if (str1->length == str2->length) {
5532 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005533 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005534 if (str1->length == 1) {
5535 /* replace characters */
5536 Py_UNICODE u1, u2;
5537 if (!findchar(self->str, self->length, str1->str[0]))
5538 goto nothing;
5539 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5540 if (!u)
5541 return NULL;
5542 Py_UNICODE_COPY(u->str, self->str, self->length);
5543 u1 = str1->str[0];
5544 u2 = str2->str[0];
5545 for (i = 0; i < u->length; i++)
5546 if (u->str[i] == u1) {
5547 if (--maxcount < 0)
5548 break;
5549 u->str[i] = u2;
5550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005552 i = fastsearch(
5553 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005555 if (i < 0)
5556 goto nothing;
5557 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5558 if (!u)
5559 return NULL;
5560 Py_UNICODE_COPY(u->str, self->str, self->length);
5561 while (i <= self->length - str1->length)
5562 if (Py_UNICODE_MATCH(self, i, str1)) {
5563 if (--maxcount < 0)
5564 break;
5565 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5566 i += str1->length;
5567 } else
5568 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005571
5572 Py_ssize_t n, i, j, e;
5573 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 Py_UNICODE *p;
5575
5576 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005577 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 if (n > maxcount)
5579 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005580 if (n == 0)
5581 goto nothing;
5582 /* new_size = self->length + n * (str2->length - str1->length)); */
5583 delta = (str2->length - str1->length);
5584 if (delta == 0) {
5585 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005587 product = n * (str2->length - str1->length);
5588 if ((product / (str2->length - str1->length)) != n) {
5589 PyErr_SetString(PyExc_OverflowError,
5590 "replace string is too long");
5591 return NULL;
5592 }
5593 new_size = self->length + product;
5594 if (new_size < 0) {
5595 PyErr_SetString(PyExc_OverflowError,
5596 "replace string is too long");
5597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 }
5599 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005600 u = _PyUnicode_New(new_size);
5601 if (!u)
5602 return NULL;
5603 i = 0;
5604 p = u->str;
5605 e = self->length - str1->length;
5606 if (str1->length > 0) {
5607 while (n-- > 0) {
5608 /* look for next match */
5609 j = i;
5610 while (j <= e) {
5611 if (Py_UNICODE_MATCH(self, j, str1))
5612 break;
5613 j++;
5614 }
5615 if (j > i) {
5616 if (j > e)
5617 break;
5618 /* copy unchanged part [i:j] */
5619 Py_UNICODE_COPY(p, self->str+i, j-i);
5620 p += j - i;
5621 }
5622 /* copy substitution string */
5623 if (str2->length > 0) {
5624 Py_UNICODE_COPY(p, str2->str, str2->length);
5625 p += str2->length;
5626 }
5627 i = j + str1->length;
5628 }
5629 if (i < self->length)
5630 /* copy tail [i:] */
5631 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5632 } else {
5633 /* interleave */
5634 while (n > 0) {
5635 Py_UNICODE_COPY(p, str2->str, str2->length);
5636 p += str2->length;
5637 if (--n <= 0)
5638 break;
5639 *p++ = self->str[i++];
5640 }
5641 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005645
5646nothing:
5647 /* nothing to replace; return original string (when possible) */
5648 if (PyUnicode_CheckExact(self)) {
5649 Py_INCREF(self);
5650 return (PyObject *) self;
5651 }
5652 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653}
5654
5655/* --- Unicode Object Methods --------------------------------------------- */
5656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005657PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658"S.title() -> unicode\n\
5659\n\
5660Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005661characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662
5663static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005664unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 return fixup(self, fixtitle);
5667}
5668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005669PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670"S.capitalize() -> unicode\n\
5671\n\
5672Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005673have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
5675static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005676unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 return fixup(self, fixcapitalize);
5679}
5680
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits self on
   whitespace, capitalizes each word in place in the list and joins the
   words back together.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *plain = PyList_GET_ITEM(words, pos);

        result = fixup((PyUnicodeObject *)plain, fixcapitalize);
        if (result == NULL)
            goto onError;
        /* drop the list's reference to the old word before overwriting
           the slot with the capitalized one */
        Py_DECREF(plain);
        PyList_SET_ITEM(words, pos, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5718
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005719/* Argument converter. Coerces to a single unicode character */
5720
5721static int
5722convert_uc(PyObject *obj, void *addr)
5723{
5724 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5725 PyObject *uniobj;
5726 Py_UNICODE *unistr;
5727
5728 uniobj = PyUnicode_FromObject(obj);
5729 if (uniobj == NULL) {
5730 PyErr_SetString(PyExc_TypeError,
5731 "The fill character cannot be converted to Unicode");
5732 return 0;
5733 }
5734 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5735 PyErr_SetString(PyExc_TypeError,
5736 "The fill character must be exactly one character long");
5737 Py_DECREF(uniobj);
5738 return 0;
5739 }
5740 unistr = PyUnicode_AS_UNICODE(uniobj);
5741 *fillcharloc = unistr[0];
5742 Py_DECREF(uniobj);
5743 return 1;
5744}
5745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005746PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005747"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005749Return S centered in a Unicode string of length width. Padding is\n\
5750done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
5752static PyObject *
5753unicode_center(PyUnicodeObject *self, PyObject *args)
5754{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005755 Py_ssize_t marg, left;
5756 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005757 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Thomas Woutersde017742006-02-16 19:34:37 +00005759 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 return NULL;
5761
Tim Peters7a29bd52001-09-12 03:03:31 +00005762 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 Py_INCREF(self);
5764 return (PyObject*) self;
5765 }
5766
5767 marg = width - self->length;
5768 left = marg / 2 + (marg & width & 1);
5769
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005770 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771}
5772
Marc-André Lemburge5034372000-08-08 08:04:29 +00005773#if 0
5774
5775/* This code should go into some future Unicode collation support
5776 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005777 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005778
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005779/* speedy UTF-16 code point order comparison */
5780/* gleaned from: */
5781/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5782
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005783static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005784{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005785 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005786 0, 0, 0, 0, 0, 0, 0, 0,
5787 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005788 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005789};
5790
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791static int
5792unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5793{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005794 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005795
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 Py_UNICODE *s1 = str1->str;
5797 Py_UNICODE *s2 = str2->str;
5798
5799 len1 = str1->length;
5800 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005801
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005803 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005804
5805 c1 = *s1++;
5806 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005807
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005808 if (c1 > (1<<11) * 26)
5809 c1 += utf16Fixup[c1>>11];
5810 if (c2 > (1<<11) * 26)
5811 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005812 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005813
5814 if (c1 != c2)
5815 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005816
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005817 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 }
5819
5820 return (len1 < len2) ? -1 : (len1 != len2);
5821}
5822
Marc-André Lemburge5034372000-08-08 08:04:29 +00005823#else
5824
5825static int
5826unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5827{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005828 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005829
5830 Py_UNICODE *s1 = str1->str;
5831 Py_UNICODE *s2 = str2->str;
5832
5833 len1 = str1->length;
5834 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005835
Marc-André Lemburge5034372000-08-08 08:04:29 +00005836 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005837 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005838
Fredrik Lundh45714e92001-06-26 16:39:36 +00005839 c1 = *s1++;
5840 c2 = *s2++;
5841
5842 if (c1 != c2)
5843 return (c1 < c2) ? -1 : 1;
5844
Marc-André Lemburge5034372000-08-08 08:04:29 +00005845 len1--; len2--;
5846 }
5847
5848 return (len1 < len2) ? -1 : (len1 != len2);
5849}
5850
5851#endif
5852
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853int PyUnicode_Compare(PyObject *left,
5854 PyObject *right)
5855{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005856 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5857 return unicode_compare((PyUnicodeObject *)left,
5858 (PyUnicodeObject *)right);
5859 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5860 (PyUnicode_Check(left) && PyString_Check(right))) {
5861 if (PyUnicode_Check(left))
5862 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5863 if (PyUnicode_Check(right))
5864 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5865 assert(PyString_Check(left));
5866 assert(PyString_Check(right));
5867 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005869 PyErr_Format(PyExc_TypeError,
5870 "Can't compare %.100s and %.100s",
5871 left->ob_type->tp_name,
5872 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 return -1;
5874}
5875
Martin v. Löwis5b222132007-06-10 09:51:05 +00005876int
5877PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5878{
5879 int i;
5880 Py_UNICODE *id;
5881 assert(PyUnicode_Check(uni));
5882 id = PyUnicode_AS_UNICODE(uni);
5883 /* Compare Unicode string and source character set string */
5884 for (i = 0; id[i] && str[i]; i++)
5885 if (id[i] != str[i])
5886 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5887 if (id[i])
5888 return 1; /* uni is longer */
5889 if (str[i])
5890 return -1; /* str is longer */
5891 return 0;
5892}
5893
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005894PyObject *PyUnicode_RichCompare(PyObject *left,
5895 PyObject *right,
5896 int op)
5897{
5898 int result;
5899
5900 result = PyUnicode_Compare(left, right);
5901 if (result == -1 && PyErr_Occurred())
5902 goto onError;
5903
5904 /* Convert the return value to a Boolean */
5905 switch (op) {
5906 case Py_EQ:
5907 result = (result == 0);
5908 break;
5909 case Py_NE:
5910 result = (result != 0);
5911 break;
5912 case Py_LE:
5913 result = (result <= 0);
5914 break;
5915 case Py_GE:
5916 result = (result >= 0);
5917 break;
5918 case Py_LT:
5919 result = (result == -1);
5920 break;
5921 case Py_GT:
5922 result = (result == 1);
5923 break;
5924 }
5925 return PyBool_FromLong(result);
5926
5927 onError:
5928
5929 /* Standard case
5930
5931 Type errors mean that PyUnicode_FromObject() could not convert
5932 one of the arguments (usually the right hand side) to Unicode,
5933 ie. we can't handle the comparison request. However, it is
5934 possible that the other object knows a comparison method, which
5935 is why we return Py_NotImplemented to give the other object a
5936 chance.
5937
5938 */
5939 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5940 PyErr_Clear();
5941 Py_INCREF(Py_NotImplemented);
5942 return Py_NotImplemented;
5943 }
5944 if (op != Py_EQ && op != Py_NE)
5945 return NULL;
5946
5947 /* Equality comparison.
5948
5949 This is a special case: we silence any PyExc_UnicodeDecodeError
5950 and instead turn it into a PyErr_UnicodeWarning.
5951
5952 */
5953 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5954 return NULL;
5955 PyErr_Clear();
5956 if (PyErr_Warn(PyExc_UnicodeWarning,
5957 (op == Py_EQ) ?
5958 "Unicode equal comparison "
5959 "failed to convert both arguments to Unicode - "
5960 "interpreting them as being unequal" :
5961 "Unicode unequal comparison "
5962 "failed to convert both arguments to Unicode - "
5963 "interpreting them as being unequal"
5964 ) < 0)
5965 return NULL;
5966 result = (op == Py_NE);
5967 return PyBool_FromLong(result);
5968}
5969
Guido van Rossum403d68b2000-03-13 15:55:09 +00005970int PyUnicode_Contains(PyObject *container,
5971 PyObject *element)
5972{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005973 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005974 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005975
5976 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005977 sub = PyUnicode_FromObject(element);
5978 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005979 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005980 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005981 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005982 }
5983
Thomas Wouters477c8d52006-05-27 19:21:47 +00005984 str = PyUnicode_FromObject(container);
5985 if (!str) {
5986 Py_DECREF(sub);
5987 return -1;
5988 }
5989
5990 result = stringlib_contains_obj(str, sub);
5991
5992 Py_DECREF(str);
5993 Py_DECREF(sub);
5994
Guido van Rossum403d68b2000-03-13 15:55:09 +00005995 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005996}
5997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998/* Concat to string or Unicode object giving a new Unicode object. */
5999
6000PyObject *PyUnicode_Concat(PyObject *left,
6001 PyObject *right)
6002{
6003 PyUnicodeObject *u = NULL, *v = NULL, *w;
6004
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006005 if (PyBytes_Check(left) || PyBytes_Check(right))
6006 return PyBytes_Concat(left, right);
6007
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 /* Coerce the two arguments */
6009 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6010 if (u == NULL)
6011 goto onError;
6012 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6013 if (v == NULL)
6014 goto onError;
6015
6016 /* Shortcuts */
6017 if (v == unicode_empty) {
6018 Py_DECREF(v);
6019 return (PyObject *)u;
6020 }
6021 if (u == unicode_empty) {
6022 Py_DECREF(u);
6023 return (PyObject *)v;
6024 }
6025
6026 /* Concat the two Unicode strings */
6027 w = _PyUnicode_New(u->length + v->length);
6028 if (w == NULL)
6029 goto onError;
6030 Py_UNICODE_COPY(w->str, u->str, u->length);
6031 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6032
6033 Py_DECREF(u);
6034 Py_DECREF(v);
6035 return (PyObject *)w;
6036
6037onError:
6038 Py_XDECREF(u);
6039 Py_XDECREF(v);
6040 return NULL;
6041}
6042
Walter Dörwald1ab83302007-05-18 17:15:44 +00006043void
6044PyUnicode_Append(PyObject **pleft, PyObject *right)
6045{
6046 PyObject *new;
6047 if (*pleft == NULL)
6048 return;
6049 if (right == NULL || !PyUnicode_Check(*pleft)) {
6050 Py_DECREF(*pleft);
6051 *pleft = NULL;
6052 return;
6053 }
6054 new = PyUnicode_Concat(*pleft, right);
6055 Py_DECREF(*pleft);
6056 *pleft = new;
6057}
6058
6059void
6060PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6061{
6062 PyUnicode_Append(pleft, right);
6063 Py_XDECREF(right);
6064}
6065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006066PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067"S.count(sub[, start[, end]]) -> int\n\
6068\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006069Return the number of non-overlapping occurrences of substring sub in\n\
6070Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
6073static PyObject *
6074unicode_count(PyUnicodeObject *self, PyObject *args)
6075{
6076 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006077 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006078 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 PyObject *result;
6080
Guido van Rossumb8872e62000-05-09 14:14:27 +00006081 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6082 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 return NULL;
6084
6085 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006086 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 if (substring == NULL)
6088 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006089
Thomas Wouters477c8d52006-05-27 19:21:47 +00006090 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091
Thomas Wouters477c8d52006-05-27 19:21:47 +00006092 result = PyInt_FromSsize_t(
6093 stringlib_count(self->str + start, end - start,
6094 substring->str, substring->length)
6095 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096
6097 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006098
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 return result;
6100}
6101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006102PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006103"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006105Encodes S using the codec registered for encoding. encoding defaults\n\
6106to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006107handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6109'xmlcharrefreplace' as well as any other name registered with\n\
6110codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111
6112static PyObject *
6113unicode_encode(PyUnicodeObject *self, PyObject *args)
6114{
6115 char *encoding = NULL;
6116 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006117 PyObject *v;
6118
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6120 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006121 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006122 if (v == NULL)
6123 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006124 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006125 if (PyString_Check(v)) {
6126 /* Old codec, turn it into bytes */
6127 PyObject *b = PyBytes_FromObject(v);
6128 Py_DECREF(v);
6129 return b;
6130 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006131 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006132 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006133 "(type=%.400s)",
6134 v->ob_type->tp_name);
6135 Py_DECREF(v);
6136 return NULL;
6137 }
6138 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006139
6140 onError:
6141 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006142}
6143
6144PyDoc_STRVAR(decode__doc__,
6145"S.decode([encoding[,errors]]) -> string or unicode\n\
6146\n\
6147Decodes S using the codec registered for encoding. encoding defaults\n\
6148to the default encoding. errors may be given to set a different error\n\
6149handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6150a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6151as well as any other name registerd with codecs.register_error that is\n\
6152able to handle UnicodeDecodeErrors.");
6153
6154static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006155unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006156{
6157 char *encoding = NULL;
6158 char *errors = NULL;
6159 PyObject *v;
6160
6161 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6162 return NULL;
6163 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006164 if (v == NULL)
6165 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006166 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6167 PyErr_Format(PyExc_TypeError,
6168 "decoder did not return a string/unicode object "
6169 "(type=%.400s)",
6170 v->ob_type->tp_name);
6171 Py_DECREF(v);
6172 return NULL;
6173 }
6174 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006175
6176 onError:
6177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178}
6179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006180PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181"S.expandtabs([tabsize]) -> unicode\n\
6182\n\
6183Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006184If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186static PyObject*
6187unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6188{
6189 Py_UNICODE *e;
6190 Py_UNICODE *p;
6191 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006192 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 PyUnicodeObject *u;
6194 int tabsize = 8;
6195
6196 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6197 return NULL;
6198
Thomas Wouters7e474022000-07-16 12:04:32 +00006199 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 i = j = 0;
6201 e = self->str + self->length;
6202 for (p = self->str; p < e; p++)
6203 if (*p == '\t') {
6204 if (tabsize > 0)
6205 j += tabsize - (j % tabsize);
6206 }
6207 else {
6208 j++;
6209 if (*p == '\n' || *p == '\r') {
6210 i += j;
6211 j = 0;
6212 }
6213 }
6214
6215 /* Second pass: create output string and fill it */
6216 u = _PyUnicode_New(i + j);
6217 if (!u)
6218 return NULL;
6219
6220 j = 0;
6221 q = u->str;
6222
6223 for (p = self->str; p < e; p++)
6224 if (*p == '\t') {
6225 if (tabsize > 0) {
6226 i = tabsize - (j % tabsize);
6227 j += i;
6228 while (i--)
6229 *q++ = ' ';
6230 }
6231 }
6232 else {
6233 j++;
6234 *q++ = *p;
6235 if (*p == '\n' || *p == '\r')
6236 j = 0;
6237 }
6238
6239 return (PyObject*) u;
6240}
6241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006242PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243"S.find(sub [,start [,end]]) -> int\n\
6244\n\
6245Return the lowest index in S where substring sub is found,\n\
6246such that sub is contained within s[start,end]. Optional\n\
6247arguments start and end are interpreted as in slice notation.\n\
6248\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006249Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250
6251static PyObject *
6252unicode_find(PyUnicodeObject *self, PyObject *args)
6253{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006254 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006255 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006256 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006257 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258
Guido van Rossumb8872e62000-05-09 14:14:27 +00006259 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6260 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006262 substring = PyUnicode_FromObject(substring);
6263 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 return NULL;
6265
Thomas Wouters477c8d52006-05-27 19:21:47 +00006266 result = stringlib_find_slice(
6267 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6268 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6269 start, end
6270 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
6272 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006273
6274 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275}
6276
6277static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006278unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279{
6280 if (index < 0 || index >= self->length) {
6281 PyErr_SetString(PyExc_IndexError, "string index out of range");
6282 return NULL;
6283 }
6284
6285 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6286}
6287
6288static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006289unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006291 /* Since Unicode objects compare equal to their UTF-8 string
6292 counterparts, we hash the UTF-8 string. */
6293 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6294 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295}
6296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006297PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298"S.index(sub [,start [,end]]) -> int\n\
6299\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006300Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
6302static PyObject *
6303unicode_index(PyUnicodeObject *self, PyObject *args)
6304{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006305 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006306 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006307 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006308 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309
Guido van Rossumb8872e62000-05-09 14:14:27 +00006310 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6311 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006313 substring = PyUnicode_FromObject(substring);
6314 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 return NULL;
6316
Thomas Wouters477c8d52006-05-27 19:21:47 +00006317 result = stringlib_find_slice(
6318 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6319 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6320 start, end
6321 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322
6323 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006324
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 if (result < 0) {
6326 PyErr_SetString(PyExc_ValueError, "substring not found");
6327 return NULL;
6328 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006329
Martin v. Löwis18e16552006-02-15 17:27:45 +00006330 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331}
6332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006333PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006334"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006336Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006337at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338
6339static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006340unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341{
6342 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6343 register const Py_UNICODE *e;
6344 int cased;
6345
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 /* Shortcut for single character strings */
6347 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006348 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006350 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006351 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006352 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006353
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 e = p + PyUnicode_GET_SIZE(self);
6355 cased = 0;
6356 for (; p < e; p++) {
6357 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006358
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006360 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 else if (!cased && Py_UNICODE_ISLOWER(ch))
6362 cased = 1;
6363 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006364 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365}
6366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006367PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006368"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006370Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006371at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
6373static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006374unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
6376 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6377 register const Py_UNICODE *e;
6378 int cased;
6379
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 /* Shortcut for single character strings */
6381 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006382 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006384 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006385 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006386 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 e = p + PyUnicode_GET_SIZE(self);
6389 cased = 0;
6390 for (; p < e; p++) {
6391 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006392
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006394 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 else if (!cased && Py_UNICODE_ISUPPER(ch))
6396 cased = 1;
6397 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006398 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399}
6400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006401PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006402"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006404Return True if S is a titlecased string and there is at least one\n\
6405character in S, i.e. upper- and titlecase characters may only\n\
6406follow uncased characters and lowercase characters only cased ones.\n\
6407Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
6409static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006410unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
6412 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6413 register const Py_UNICODE *e;
6414 int cased, previous_is_cased;
6415
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 /* Shortcut for single character strings */
6417 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006418 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6419 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006421 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006422 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006423 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 e = p + PyUnicode_GET_SIZE(self);
6426 cased = 0;
6427 previous_is_cased = 0;
6428 for (; p < e; p++) {
6429 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006430
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6432 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006433 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 previous_is_cased = 1;
6435 cased = 1;
6436 }
6437 else if (Py_UNICODE_ISLOWER(ch)) {
6438 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006439 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 previous_is_cased = 1;
6441 cased = 1;
6442 }
6443 else
6444 previous_is_cased = 0;
6445 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006446 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447}
6448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006449PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006450"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006452Return True if all characters in S are whitespace\n\
6453and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454
6455static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006456unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457{
6458 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6459 register const Py_UNICODE *e;
6460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 /* Shortcut for single character strings */
6462 if (PyUnicode_GET_SIZE(self) == 1 &&
6463 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006464 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006466 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006467 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006468 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006469
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 e = p + PyUnicode_GET_SIZE(self);
6471 for (; p < e; p++) {
6472 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006473 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006475 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476}
6477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006478PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006479"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006480\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006481Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006482and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006483
6484static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006485unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006486{
6487 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6488 register const Py_UNICODE *e;
6489
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006490 /* Shortcut for single character strings */
6491 if (PyUnicode_GET_SIZE(self) == 1 &&
6492 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006493 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006494
6495 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006496 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006497 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006498
6499 e = p + PyUnicode_GET_SIZE(self);
6500 for (; p < e; p++) {
6501 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006502 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006503 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006504 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006505}
6506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006507PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006508"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006509\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006510Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006511and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006512
6513static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006514unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006515{
6516 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6517 register const Py_UNICODE *e;
6518
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006519 /* Shortcut for single character strings */
6520 if (PyUnicode_GET_SIZE(self) == 1 &&
6521 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006522 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006523
6524 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006525 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006526 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006527
6528 e = p + PyUnicode_GET_SIZE(self);
6529 for (; p < e; p++) {
6530 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006531 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006532 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006533 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006534}
6535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006536PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006537"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006539Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006540False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541
6542static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006543unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544{
6545 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6546 register const Py_UNICODE *e;
6547
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 /* Shortcut for single character strings */
6549 if (PyUnicode_GET_SIZE(self) == 1 &&
6550 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006551 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006553 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006554 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006555 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006556
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 e = p + PyUnicode_GET_SIZE(self);
6558 for (; p < e; p++) {
6559 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006560 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006562 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563}
6564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006565PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006566"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006568Return True if all characters in S are digits\n\
6569and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570
6571static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006572unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573{
6574 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6575 register const Py_UNICODE *e;
6576
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 /* Shortcut for single character strings */
6578 if (PyUnicode_GET_SIZE(self) == 1 &&
6579 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006580 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006582 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006583 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006584 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006585
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 e = p + PyUnicode_GET_SIZE(self);
6587 for (; p < e; p++) {
6588 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006589 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006591 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592}
6593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006594PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006595"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006597Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006598False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
6600static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006601unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602{
6603 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6604 register const Py_UNICODE *e;
6605
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 /* Shortcut for single character strings */
6607 if (PyUnicode_GET_SIZE(self) == 1 &&
6608 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006609 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006611 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006612 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006613 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006614
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 e = p + PyUnicode_GET_SIZE(self);
6616 for (; p < e; p++) {
6617 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006618 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006620 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621}
6622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006623PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624"S.join(sequence) -> unicode\n\
6625\n\
6626Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006627sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628
6629static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006630unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006632 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633}
6634
Martin v. Löwis18e16552006-02-15 17:27:45 +00006635static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636unicode_length(PyUnicodeObject *self)
6637{
6638 return self->length;
6639}
6640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006641PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006642"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643\n\
6644Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006645done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
6647static PyObject *
6648unicode_ljust(PyUnicodeObject *self, PyObject *args)
6649{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006650 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006651 Py_UNICODE fillchar = ' ';
6652
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006653 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 return NULL;
6655
Tim Peters7a29bd52001-09-12 03:03:31 +00006656 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 Py_INCREF(self);
6658 return (PyObject*) self;
6659 }
6660
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006661 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006664PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665"S.lower() -> unicode\n\
6666\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006667Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
6669static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006670unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 return fixup(self, fixlower);
6673}
6674
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6683
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006684/* externally visible for str.strip(unicode) */
6685PyObject *
6686_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6687{
6688 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006689 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006690 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006691 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6692 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006693
Thomas Wouters477c8d52006-05-27 19:21:47 +00006694 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6695
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006696 i = 0;
6697 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006698 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6699 i++;
6700 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006701 }
6702
6703 j = len;
6704 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006705 do {
6706 j--;
6707 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6708 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006709 }
6710
6711 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006712 Py_INCREF(self);
6713 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006714 }
6715 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006716 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006717}
6718
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719
6720static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006721do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006723 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006724 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006725
6726 i = 0;
6727 if (striptype != RIGHTSTRIP) {
6728 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6729 i++;
6730 }
6731 }
6732
6733 j = len;
6734 if (striptype != LEFTSTRIP) {
6735 do {
6736 j--;
6737 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6738 j++;
6739 }
6740
6741 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6742 Py_INCREF(self);
6743 return (PyObject*)self;
6744 }
6745 else
6746 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747}
6748
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006749
6750static PyObject *
6751do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6752{
6753 PyObject *sep = NULL;
6754
6755 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6756 return NULL;
6757
6758 if (sep != NULL && sep != Py_None) {
6759 if (PyUnicode_Check(sep))
6760 return _PyUnicode_XStrip(self, striptype, sep);
6761 else if (PyString_Check(sep)) {
6762 PyObject *res;
6763 sep = PyUnicode_FromObject(sep);
6764 if (sep==NULL)
6765 return NULL;
6766 res = _PyUnicode_XStrip(self, striptype, sep);
6767 Py_DECREF(sep);
6768 return res;
6769 }
6770 else {
6771 PyErr_Format(PyExc_TypeError,
6772 "%s arg must be None, unicode or str",
6773 STRIPNAME(striptype));
6774 return NULL;
6775 }
6776 }
6777
6778 return do_strip(self, striptype);
6779}
6780
6781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006783"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006784\n\
6785Return a copy of the string S with leading and trailing\n\
6786whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006787If chars is given and not None, remove characters in chars instead.\n\
6788If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006789
6790static PyObject *
6791unicode_strip(PyUnicodeObject *self, PyObject *args)
6792{
6793 if (PyTuple_GET_SIZE(args) == 0)
6794 return do_strip(self, BOTHSTRIP); /* Common case */
6795 else
6796 return do_argstrip(self, BOTHSTRIP, args);
6797}
6798
6799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006800PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006801"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006802\n\
6803Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006804If chars is given and not None, remove characters in chars instead.\n\
6805If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006806
6807static PyObject *
6808unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6809{
6810 if (PyTuple_GET_SIZE(args) == 0)
6811 return do_strip(self, LEFTSTRIP); /* Common case */
6812 else
6813 return do_argstrip(self, LEFTSTRIP, args);
6814}
6815
6816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006817PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006818"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006819\n\
6820Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006821If chars is given and not None, remove characters in chars instead.\n\
6822If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006823
6824static PyObject *
6825unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6826{
6827 if (PyTuple_GET_SIZE(args) == 0)
6828 return do_strip(self, RIGHTSTRIP); /* Common case */
6829 else
6830 return do_argstrip(self, RIGHTSTRIP, args);
6831}
6832
6833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006835unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
6837 PyUnicodeObject *u;
6838 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006839 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006840 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
6842 if (len < 0)
6843 len = 0;
6844
Tim Peters7a29bd52001-09-12 03:03:31 +00006845 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 /* no repeat, return original string */
6847 Py_INCREF(str);
6848 return (PyObject*) str;
6849 }
Tim Peters8f422462000-09-09 06:13:41 +00006850
6851 /* ensure # of chars needed doesn't overflow int and # of bytes
6852 * needed doesn't overflow size_t
6853 */
6854 nchars = len * str->length;
6855 if (len && nchars / len != str->length) {
6856 PyErr_SetString(PyExc_OverflowError,
6857 "repeated string is too long");
6858 return NULL;
6859 }
6860 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6861 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6862 PyErr_SetString(PyExc_OverflowError,
6863 "repeated string is too long");
6864 return NULL;
6865 }
6866 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 if (!u)
6868 return NULL;
6869
6870 p = u->str;
6871
Thomas Wouters477c8d52006-05-27 19:21:47 +00006872 if (str->length == 1 && len > 0) {
6873 Py_UNICODE_FILL(p, str->str[0], len);
6874 } else {
6875 Py_ssize_t done = 0; /* number of characters copied this far */
6876 if (done < nchars) {
6877 Py_UNICODE_COPY(p, str->str, str->length);
6878 done = str->length;
6879 }
6880 while (done < nchars) {
6881 int n = (done <= nchars-done) ? done : nchars-done;
6882 Py_UNICODE_COPY(p+done, p, n);
6883 done += n;
6884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 }
6886
6887 return (PyObject*) u;
6888}
6889
6890PyObject *PyUnicode_Replace(PyObject *obj,
6891 PyObject *subobj,
6892 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006893 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894{
6895 PyObject *self;
6896 PyObject *str1;
6897 PyObject *str2;
6898 PyObject *result;
6899
6900 self = PyUnicode_FromObject(obj);
6901 if (self == NULL)
6902 return NULL;
6903 str1 = PyUnicode_FromObject(subobj);
6904 if (str1 == NULL) {
6905 Py_DECREF(self);
6906 return NULL;
6907 }
6908 str2 = PyUnicode_FromObject(replobj);
6909 if (str2 == NULL) {
6910 Py_DECREF(self);
6911 Py_DECREF(str1);
6912 return NULL;
6913 }
Tim Petersced69f82003-09-16 20:30:58 +00006914 result = replace((PyUnicodeObject *)self,
6915 (PyUnicodeObject *)str1,
6916 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 maxcount);
6918 Py_DECREF(self);
6919 Py_DECREF(str1);
6920 Py_DECREF(str2);
6921 return result;
6922}
6923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006924PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925"S.replace (old, new[, maxsplit]) -> unicode\n\
6926\n\
6927Return a copy of S with all occurrences of substring\n\
6928old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
6931static PyObject*
6932unicode_replace(PyUnicodeObject *self, PyObject *args)
6933{
6934 PyUnicodeObject *str1;
6935 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006936 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 PyObject *result;
6938
Martin v. Löwis18e16552006-02-15 17:27:45 +00006939 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 return NULL;
6941 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6942 if (str1 == NULL)
6943 return NULL;
6944 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006945 if (str2 == NULL) {
6946 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950 result = replace(self, str1, str2, maxcount);
6951
6952 Py_DECREF(str1);
6953 Py_DECREF(str2);
6954 return result;
6955}
6956
6957static
6958PyObject *unicode_repr(PyObject *unicode)
6959{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006960 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006961 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006962 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6963 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6964
6965 /* XXX(nnorwitz): rather than over-allocating, it would be
6966 better to choose a different scheme. Perhaps scan the
6967 first N-chars of the string and allocate based on that size.
6968 */
6969 /* Initial allocation is based on the longest-possible unichr
6970 escape.
6971
6972 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6973 unichr, so in this case it's the longest unichr escape. In
6974 narrow (UTF-16) builds this is five chars per source unichr
6975 since there are two unichrs in the surrogate pair, so in narrow
6976 (UTF-16) builds it's not the longest unichr escape.
6977
6978 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6979 so in the narrow (UTF-16) build case it's the longest unichr
6980 escape.
6981 */
6982
Walter Dörwald1ab83302007-05-18 17:15:44 +00006983 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00006984 2 /* quotes */
6985#ifdef Py_UNICODE_WIDE
6986 + 10*size
6987#else
6988 + 6*size
6989#endif
6990 + 1);
6991 if (repr == NULL)
6992 return NULL;
6993
Walter Dörwald1ab83302007-05-18 17:15:44 +00006994 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00006995
6996 /* Add quote */
6997 *p++ = (findchar(s, size, '\'') &&
6998 !findchar(s, size, '"')) ? '"' : '\'';
6999 while (size-- > 0) {
7000 Py_UNICODE ch = *s++;
7001
7002 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007003 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007004 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007005 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007006 continue;
7007 }
7008
7009#ifdef Py_UNICODE_WIDE
7010 /* Map 21-bit characters to '\U00xxxxxx' */
7011 else if (ch >= 0x10000) {
7012 *p++ = '\\';
7013 *p++ = 'U';
7014 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7015 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7016 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7017 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7018 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7019 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7020 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7021 *p++ = hexdigits[ch & 0x0000000F];
7022 continue;
7023 }
7024#else
7025 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7026 else if (ch >= 0xD800 && ch < 0xDC00) {
7027 Py_UNICODE ch2;
7028 Py_UCS4 ucs;
7029
7030 ch2 = *s++;
7031 size--;
7032 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7033 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7034 *p++ = '\\';
7035 *p++ = 'U';
7036 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7037 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7038 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7039 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7040 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7041 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7042 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7043 *p++ = hexdigits[ucs & 0x0000000F];
7044 continue;
7045 }
7046 /* Fall through: isolated surrogates are copied as-is */
7047 s--;
7048 size++;
7049 }
7050#endif
7051
7052 /* Map 16-bit characters to '\uxxxx' */
7053 if (ch >= 256) {
7054 *p++ = '\\';
7055 *p++ = 'u';
7056 *p++ = hexdigits[(ch >> 12) & 0x000F];
7057 *p++ = hexdigits[(ch >> 8) & 0x000F];
7058 *p++ = hexdigits[(ch >> 4) & 0x000F];
7059 *p++ = hexdigits[ch & 0x000F];
7060 }
7061
7062 /* Map special whitespace to '\t', \n', '\r' */
7063 else if (ch == '\t') {
7064 *p++ = '\\';
7065 *p++ = 't';
7066 }
7067 else if (ch == '\n') {
7068 *p++ = '\\';
7069 *p++ = 'n';
7070 }
7071 else if (ch == '\r') {
7072 *p++ = '\\';
7073 *p++ = 'r';
7074 }
7075
7076 /* Map non-printable US ASCII to '\xhh' */
7077 else if (ch < ' ' || ch >= 0x7F) {
7078 *p++ = '\\';
7079 *p++ = 'x';
7080 *p++ = hexdigits[(ch >> 4) & 0x000F];
7081 *p++ = hexdigits[ch & 0x000F];
7082 }
7083
7084 /* Copy everything else as-is */
7085 else
7086 *p++ = (char) ch;
7087 }
7088 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007089 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007090
7091 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007092 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007093 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094}
7095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007096PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097"S.rfind(sub [,start [,end]]) -> int\n\
7098\n\
7099Return the highest index in S where substring sub is found,\n\
7100such that sub is contained within s[start,end]. Optional\n\
7101arguments start and end are interpreted as in slice notation.\n\
7102\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007103Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104
7105static PyObject *
7106unicode_rfind(PyUnicodeObject *self, PyObject *args)
7107{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007108 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007109 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007110 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007111 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112
Guido van Rossumb8872e62000-05-09 14:14:27 +00007113 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7114 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007116 substring = PyUnicode_FromObject(substring);
7117 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 return NULL;
7119
Thomas Wouters477c8d52006-05-27 19:21:47 +00007120 result = stringlib_rfind_slice(
7121 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7122 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7123 start, end
7124 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125
7126 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007127
7128 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129}
7130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007131PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132"S.rindex(sub [,start [,end]]) -> int\n\
7133\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007134Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135
7136static PyObject *
7137unicode_rindex(PyUnicodeObject *self, PyObject *args)
7138{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007139 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007140 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007141 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007142 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
Guido van Rossumb8872e62000-05-09 14:14:27 +00007144 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7145 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007147 substring = PyUnicode_FromObject(substring);
7148 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 return NULL;
7150
Thomas Wouters477c8d52006-05-27 19:21:47 +00007151 result = stringlib_rfind_slice(
7152 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7153 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7154 start, end
7155 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
7157 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007158
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 if (result < 0) {
7160 PyErr_SetString(PyExc_ValueError, "substring not found");
7161 return NULL;
7162 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007163 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164}
7165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007166PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007167"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168\n\
7169Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007170done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171
7172static PyObject *
7173unicode_rjust(PyUnicodeObject *self, PyObject *args)
7174{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007175 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007176 Py_UNICODE fillchar = ' ';
7177
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007178 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 return NULL;
7180
Tim Peters7a29bd52001-09-12 03:03:31 +00007181 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 Py_INCREF(self);
7183 return (PyObject*) self;
7184 }
7185
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007186 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187}
7188
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007190unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191{
7192 /* standard clamping */
7193 if (start < 0)
7194 start = 0;
7195 if (end < 0)
7196 end = 0;
7197 if (end > self->length)
7198 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007199 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 /* full slice, return original string */
7201 Py_INCREF(self);
7202 return (PyObject*) self;
7203 }
7204 if (start > end)
7205 start = end;
7206 /* copy slice */
7207 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7208 end - start);
7209}
7210
7211PyObject *PyUnicode_Split(PyObject *s,
7212 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007213 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214{
7215 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007216
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 s = PyUnicode_FromObject(s);
7218 if (s == NULL)
7219 return NULL;
7220 if (sep != NULL) {
7221 sep = PyUnicode_FromObject(sep);
7222 if (sep == NULL) {
7223 Py_DECREF(s);
7224 return NULL;
7225 }
7226 }
7227
7228 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7229
7230 Py_DECREF(s);
7231 Py_XDECREF(sep);
7232 return result;
7233}
7234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007235PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236"S.split([sep [,maxsplit]]) -> list of strings\n\
7237\n\
7238Return a list of the words in S, using sep as the\n\
7239delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007240splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007241any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242
7243static PyObject*
7244unicode_split(PyUnicodeObject *self, PyObject *args)
7245{
7246 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007247 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
Martin v. Löwis18e16552006-02-15 17:27:45 +00007249 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 return NULL;
7251
7252 if (substring == Py_None)
7253 return split(self, NULL, maxcount);
7254 else if (PyUnicode_Check(substring))
7255 return split(self, (PyUnicodeObject *)substring, maxcount);
7256 else
7257 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7258}
7259
Thomas Wouters477c8d52006-05-27 19:21:47 +00007260PyObject *
7261PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7262{
7263 PyObject* str_obj;
7264 PyObject* sep_obj;
7265 PyObject* out;
7266
7267 str_obj = PyUnicode_FromObject(str_in);
7268 if (!str_obj)
7269 return NULL;
7270 sep_obj = PyUnicode_FromObject(sep_in);
7271 if (!sep_obj) {
7272 Py_DECREF(str_obj);
7273 return NULL;
7274 }
7275
7276 out = stringlib_partition(
7277 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7278 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7279 );
7280
7281 Py_DECREF(sep_obj);
7282 Py_DECREF(str_obj);
7283
7284 return out;
7285}
7286
7287
7288PyObject *
7289PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7290{
7291 PyObject* str_obj;
7292 PyObject* sep_obj;
7293 PyObject* out;
7294
7295 str_obj = PyUnicode_FromObject(str_in);
7296 if (!str_obj)
7297 return NULL;
7298 sep_obj = PyUnicode_FromObject(sep_in);
7299 if (!sep_obj) {
7300 Py_DECREF(str_obj);
7301 return NULL;
7302 }
7303
7304 out = stringlib_rpartition(
7305 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7306 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7307 );
7308
7309 Py_DECREF(sep_obj);
7310 Py_DECREF(str_obj);
7311
7312 return out;
7313}
7314
7315PyDoc_STRVAR(partition__doc__,
7316"S.partition(sep) -> (head, sep, tail)\n\
7317\n\
7318Searches for the separator sep in S, and returns the part before it,\n\
7319the separator itself, and the part after it. If the separator is not\n\
7320found, returns S and two empty strings.");
7321
7322static PyObject*
7323unicode_partition(PyUnicodeObject *self, PyObject *separator)
7324{
7325 return PyUnicode_Partition((PyObject *)self, separator);
7326}
7327
7328PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007329"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007330\n\
7331Searches for the separator sep in S, starting at the end of S, and returns\n\
7332the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007333separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007334
7335static PyObject*
7336unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7337{
7338 return PyUnicode_RPartition((PyObject *)self, separator);
7339}
7340
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007341PyObject *PyUnicode_RSplit(PyObject *s,
7342 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007343 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007344{
7345 PyObject *result;
7346
7347 s = PyUnicode_FromObject(s);
7348 if (s == NULL)
7349 return NULL;
7350 if (sep != NULL) {
7351 sep = PyUnicode_FromObject(sep);
7352 if (sep == NULL) {
7353 Py_DECREF(s);
7354 return NULL;
7355 }
7356 }
7357
7358 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7359
7360 Py_DECREF(s);
7361 Py_XDECREF(sep);
7362 return result;
7363}
7364
7365PyDoc_STRVAR(rsplit__doc__,
7366"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7367\n\
7368Return a list of the words in S, using sep as the\n\
7369delimiter string, starting at the end of the string and\n\
7370working to the front. If maxsplit is given, at most maxsplit\n\
7371splits are done. If sep is not specified, any whitespace string\n\
7372is a separator.");
7373
7374static PyObject*
7375unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7376{
7377 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007378 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007379
Martin v. Löwis18e16552006-02-15 17:27:45 +00007380 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007381 return NULL;
7382
7383 if (substring == Py_None)
7384 return rsplit(self, NULL, maxcount);
7385 else if (PyUnicode_Check(substring))
7386 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7387 else
7388 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7389}
7390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007391PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007392"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393\n\
7394Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007395Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397
7398static PyObject*
7399unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7400{
Guido van Rossum86662912000-04-11 15:38:46 +00007401 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
Guido van Rossum86662912000-04-11 15:38:46 +00007403 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 return NULL;
7405
Guido van Rossum86662912000-04-11 15:38:46 +00007406 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407}
7408
7409static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007410PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411{
Walter Dörwald346737f2007-05-31 10:44:43 +00007412 if (PyUnicode_CheckExact(self)) {
7413 Py_INCREF(self);
7414 return self;
7415 } else
7416 /* Subtype -- return genuine unicode string with the same value. */
7417 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7418 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419}
7420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007421PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422"S.swapcase() -> unicode\n\
7423\n\
7424Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007425and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
7427static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007428unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 return fixup(self, fixswapcase);
7431}
7432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007433PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434"S.translate(table) -> unicode\n\
7435\n\
7436Return a copy of the string S, where all characters have been mapped\n\
7437through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007438Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7439Unmapped characters are left untouched. Characters mapped to None\n\
7440are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441
7442static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007443unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444{
Tim Petersced69f82003-09-16 20:30:58 +00007445 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007447 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 "ignore");
7449}
7450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007451PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452"S.upper() -> unicode\n\
7453\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007454Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455
7456static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007457unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 return fixup(self, fixupper);
7460}
7461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463"S.zfill(width) -> unicode\n\
7464\n\
7465Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468static PyObject *
7469unicode_zfill(PyUnicodeObject *self, PyObject *args)
7470{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007471 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 PyUnicodeObject *u;
7473
Martin v. Löwis18e16552006-02-15 17:27:45 +00007474 Py_ssize_t width;
7475 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 return NULL;
7477
7478 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007479 if (PyUnicode_CheckExact(self)) {
7480 Py_INCREF(self);
7481 return (PyObject*) self;
7482 }
7483 else
7484 return PyUnicode_FromUnicode(
7485 PyUnicode_AS_UNICODE(self),
7486 PyUnicode_GET_SIZE(self)
7487 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 }
7489
7490 fill = width - self->length;
7491
7492 u = pad(self, fill, 0, '0');
7493
Walter Dörwald068325e2002-04-15 13:36:47 +00007494 if (u == NULL)
7495 return NULL;
7496
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 if (u->str[fill] == '+' || u->str[fill] == '-') {
7498 /* move sign to beginning of string */
7499 u->str[0] = u->str[fill];
7500 u->str[fill] = '0';
7501 }
7502
7503 return (PyObject*) u;
7504}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as an int
   object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(unicode_freelist_size);

    return count;
}
#endif
7513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007514PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007515"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007517Return True if S starts with the specified prefix, False otherwise.\n\
7518With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519With optional end, stop comparing S at that position.\n\
7520prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521
7522static PyObject *
7523unicode_startswith(PyUnicodeObject *self,
7524 PyObject *args)
7525{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007528 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007529 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007530 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007533 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535 if (PyTuple_Check(subobj)) {
7536 Py_ssize_t i;
7537 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7538 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7539 PyTuple_GET_ITEM(subobj, i));
7540 if (substring == NULL)
7541 return NULL;
7542 result = tailmatch(self, substring, start, end, -1);
7543 Py_DECREF(substring);
7544 if (result) {
7545 Py_RETURN_TRUE;
7546 }
7547 }
7548 /* nothing matched */
7549 Py_RETURN_FALSE;
7550 }
7551 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007553 return NULL;
7554 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007556 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557}
7558
7559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007560PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007561"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007563Return True if S ends with the specified suffix, False otherwise.\n\
7564With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007565With optional end, stop comparing S at that position.\n\
7566suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
7568static PyObject *
7569unicode_endswith(PyUnicodeObject *self,
7570 PyObject *args)
7571{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007572 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007574 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007575 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007576 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007578 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7579 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007581 if (PyTuple_Check(subobj)) {
7582 Py_ssize_t i;
7583 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7584 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7585 PyTuple_GET_ITEM(subobj, i));
7586 if (substring == NULL)
7587 return NULL;
7588 result = tailmatch(self, substring, start, end, +1);
7589 Py_DECREF(substring);
7590 if (result) {
7591 Py_RETURN_TRUE;
7592 }
7593 }
7594 Py_RETURN_FALSE;
7595 }
7596 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007600 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007602 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603}
7604
7605
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007606
7607static PyObject *
7608unicode_getnewargs(PyUnicodeObject *v)
7609{
7610 return Py_BuildValue("(u#)", v->str, v->length);
7611}
7612
7613
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614static PyMethodDef unicode_methods[] = {
7615
7616 /* Order is according to common usage: often used methods should
7617 appear first, since lookup is done sequentially. */
7618
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007619 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7620 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7621 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007622 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007623 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7624 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7625 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7626 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7627 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7628 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7629 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007630 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007631 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7632 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7633 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007634 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007635 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007636/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7637 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7638 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7639 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007640 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007641 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007642 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007643 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007644 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7645 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7646 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7647 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7648 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7649 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7650 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7651 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7652 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7653 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7654 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7655 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7656 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7657 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007658 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007659#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007660 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661#endif
7662
7663#if 0
7664 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007665 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666#endif
7667
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007668 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669 {NULL, NULL}
7670};
7671
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007672static PyObject *
7673unicode_mod(PyObject *v, PyObject *w)
7674{
7675 if (!PyUnicode_Check(v)) {
7676 Py_INCREF(Py_NotImplemented);
7677 return Py_NotImplemented;
7678 }
7679 return PyUnicode_Format(v, w);
7680}
7681
7682static PyNumberMethods unicode_as_number = {
7683 0, /*nb_add*/
7684 0, /*nb_subtract*/
7685 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007686 unicode_mod, /*nb_remainder*/
7687};
7688
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007690 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007691 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007692 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7693 (ssizeargfunc) unicode_getitem, /* sq_item */
7694 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 0, /* sq_ass_item */
7696 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007697 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698};
7699
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007700static PyObject*
7701unicode_subscript(PyUnicodeObject* self, PyObject* item)
7702{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007703 if (PyIndex_Check(item)) {
7704 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007705 if (i == -1 && PyErr_Occurred())
7706 return NULL;
7707 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007708 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007709 return unicode_getitem(self, i);
7710 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007711 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007712 Py_UNICODE* source_buf;
7713 Py_UNICODE* result_buf;
7714 PyObject* result;
7715
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007716 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007717 &start, &stop, &step, &slicelength) < 0) {
7718 return NULL;
7719 }
7720
7721 if (slicelength <= 0) {
7722 return PyUnicode_FromUnicode(NULL, 0);
7723 } else {
7724 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007725 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7726 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007727
7728 if (result_buf == NULL)
7729 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007730
7731 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7732 result_buf[i] = source_buf[cur];
7733 }
Tim Petersced69f82003-09-16 20:30:58 +00007734
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007735 result = PyUnicode_FromUnicode(result_buf, slicelength);
7736 PyMem_FREE(result_buf);
7737 return result;
7738 }
7739 } else {
7740 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7741 return NULL;
7742 }
7743}
7744
7745static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007746 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007747 (binaryfunc)unicode_subscript, /* mp_subscript */
7748 (objobjargproc)0, /* mp_ass_subscript */
7749};
7750
Martin v. Löwis18e16552006-02-15 17:27:45 +00007751static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007753 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 const void **ptr)
7755{
7756 if (index != 0) {
7757 PyErr_SetString(PyExc_SystemError,
7758 "accessing non-existent unicode segment");
7759 return -1;
7760 }
7761 *ptr = (void *) self->str;
7762 return PyUnicode_GET_DATA_SIZE(self);
7763}
7764
Martin v. Löwis18e16552006-02-15 17:27:45 +00007765static Py_ssize_t
7766unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767 const void **ptr)
7768{
7769 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007770 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 return -1;
7772}
7773
7774static int
7775unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007776 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777{
7778 if (lenp)
7779 *lenp = PyUnicode_GET_DATA_SIZE(self);
7780 return 1;
7781}
7782
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007783static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007785 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 const void **ptr)
7787{
7788 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007789
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 if (index != 0) {
7791 PyErr_SetString(PyExc_SystemError,
7792 "accessing non-existent unicode segment");
7793 return -1;
7794 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007795 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 if (str == NULL)
7797 return -1;
7798 *ptr = (void *) PyString_AS_STRING(str);
7799 return PyString_GET_SIZE(str);
7800}
7801
7802/* Helpers for PyUnicode_Format() */
7803
7804static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007805getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 if (argidx < arglen) {
7809 (*p_argidx)++;
7810 if (arglen < 0)
7811 return args;
7812 else
7813 return PyTuple_GetItem(args, argidx);
7814 }
7815 PyErr_SetString(PyExc_TypeError,
7816 "not enough arguments for format string");
7817 return NULL;
7818}
7819
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST (1 << 0)        /* '-' flag */
#define F_SIGN  (1 << 1)        /* '+' flag */
#define F_BLANK (1 << 2)        /* ' ' flag */
#define F_ALT   (1 << 3)        /* '#' flag */
#define F_ZERO  (1 << 4)        /* '0' flag */
7825
Martin v. Löwis18e16552006-02-15 17:27:45 +00007826static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007827strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007829 register Py_ssize_t i;
7830 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 for (i = len - 1; i >= 0; i--)
7832 buffer[i] = (Py_UNICODE) charbuffer[i];
7833
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 return len;
7835}
7836
Neal Norwitzfc76d632006-01-10 06:03:13 +00007837static int
7838doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7839{
Tim Peters15231542006-02-16 01:08:01 +00007840 Py_ssize_t result;
7841
Neal Norwitzfc76d632006-01-10 06:03:13 +00007842 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007843 result = strtounicode(buffer, (char *)buffer);
7844 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007845}
7846
7847static int
7848longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7849{
Tim Peters15231542006-02-16 01:08:01 +00007850 Py_ssize_t result;
7851
Neal Norwitzfc76d632006-01-10 06:03:13 +00007852 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007853 result = strtounicode(buffer, (char *)buffer);
7854 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007855}
7856
Guido van Rossum078151d2002-08-11 04:24:12 +00007857/* XXX To save some code duplication, formatfloat/long/int could have been
7858 shared with stringobject.c, converting from 8-bit to Unicode after the
7859 formatting is done. */
7860
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861static int
7862formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007863 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 int flags,
7865 int prec,
7866 int type,
7867 PyObject *v)
7868{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007869 /* fmt = '%#.' + `prec` + `type`
7870 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 char fmt[20];
7872 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007873
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 x = PyFloat_AsDouble(v);
7875 if (x == -1.0 && PyErr_Occurred())
7876 return -1;
7877 if (prec < 0)
7878 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7880 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007881 /* Worst case length calc to ensure no buffer overrun:
7882
7883 'g' formats:
7884 fmt = %#.<prec>g
7885 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7886 for any double rep.)
7887 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7888
7889 'f' formats:
7890 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7891 len = 1 + 50 + 1 + prec = 52 + prec
7892
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007893 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007894 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007895
7896 */
7897 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7898 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007899 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007900 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007901 return -1;
7902 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007903 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7904 (flags&F_ALT) ? "#" : "",
7905 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007906 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907}
7908
Tim Peters38fd5b62000-09-21 05:43:11 +00007909static PyObject*
7910formatlong(PyObject *val, int flags, int prec, int type)
7911{
7912 char *buf;
7913 int i, len;
7914 PyObject *str; /* temporary string object. */
7915 PyUnicodeObject *result;
7916
7917 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7918 if (!str)
7919 return NULL;
7920 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007921 if (!result) {
7922 Py_DECREF(str);
7923 return NULL;
7924 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007925 for (i = 0; i < len; i++)
7926 result->str[i] = buf[i];
7927 result->str[len] = 0;
7928 Py_DECREF(str);
7929 return (PyObject*)result;
7930}
7931
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932static int
7933formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007934 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935 int flags,
7936 int prec,
7937 int type,
7938 PyObject *v)
7939{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007940 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007941 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7942 * + 1 + 1
7943 * = 24
7944 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007945 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007946 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947 long x;
7948
7949 x = PyInt_AsLong(v);
7950 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007951 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007952 if (x < 0 && type == 'u') {
7953 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007954 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007955 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7956 sign = "-";
7957 else
7958 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007960 prec = 1;
7961
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007962 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7963 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007964 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007965 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007966 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007967 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007968 return -1;
7969 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007970
7971 if ((flags & F_ALT) &&
7972 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007973 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007974 * of issues that cause pain:
7975 * - when 0 is being converted, the C standard leaves off
7976 * the '0x' or '0X', which is inconsistent with other
7977 * %#x/%#X conversions and inconsistent with Python's
7978 * hex() function
7979 * - there are platforms that violate the standard and
7980 * convert 0 with the '0x' or '0X'
7981 * (Metrowerks, Compaq Tru64)
7982 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007983 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007984 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007985 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007986 * We can achieve the desired consistency by inserting our
7987 * own '0x' or '0X' prefix, and substituting %x/%X in place
7988 * of %#x/%#X.
7989 *
7990 * Note that this is the same approach as used in
7991 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007992 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007993 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7994 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007995 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007996 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007997 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7998 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007999 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008000 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008001 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008002 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008003 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008004 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005}
8006
8007static int
8008formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008009 size_t buflen,
8010 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008012 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008013 if (PyUnicode_Check(v)) {
8014 if (PyUnicode_GET_SIZE(v) != 1)
8015 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008019 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008020 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008021 goto onError;
8022 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
8025 else {
8026 /* Integer input truncated to a character */
8027 long x;
8028 x = PyInt_AsLong(v);
8029 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008030 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008031#ifdef Py_UNICODE_WIDE
8032 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008033 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008034 "%c arg not in range(0x110000) "
8035 "(wide Python build)");
8036 return -1;
8037 }
8038#else
8039 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008040 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008041 "%c arg not in range(0x10000) "
8042 "(narrow Python build)");
8043 return -1;
8044 }
8045#endif
8046 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 }
8048 buf[1] = '\0';
8049 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008050
8051 onError:
8052 PyErr_SetString(PyExc_TypeError,
8053 "%c requires int or char");
8054 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055}
8056
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in characters) of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  The formatting routines check their worst-case output length
   against the buffer size, but a better solution may be to malloc a
   buffer of appropriate size for each format.  For now, the current
   solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
8066
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067PyObject *PyUnicode_Format(PyObject *format,
8068 PyObject *args)
8069{
8070 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008071 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 int args_owned = 0;
8073 PyUnicodeObject *result = NULL;
8074 PyObject *dict = NULL;
8075 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008076
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 if (format == NULL || args == NULL) {
8078 PyErr_BadInternalCall();
8079 return NULL;
8080 }
8081 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008082 if (uformat == NULL)
8083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 fmt = PyUnicode_AS_UNICODE(uformat);
8085 fmtcnt = PyUnicode_GET_SIZE(uformat);
8086
8087 reslen = rescnt = fmtcnt + 100;
8088 result = _PyUnicode_New(reslen);
8089 if (result == NULL)
8090 goto onError;
8091 res = PyUnicode_AS_UNICODE(result);
8092
8093 if (PyTuple_Check(args)) {
8094 arglen = PyTuple_Size(args);
8095 argidx = 0;
8096 }
8097 else {
8098 arglen = -1;
8099 argidx = -2;
8100 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008101 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8102 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 dict = args;
8104
8105 while (--fmtcnt >= 0) {
8106 if (*fmt != '%') {
8107 if (--rescnt < 0) {
8108 rescnt = fmtcnt + 100;
8109 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008110 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008111 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8113 --rescnt;
8114 }
8115 *res++ = *fmt++;
8116 }
8117 else {
8118 /* Got a format specifier */
8119 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008120 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 Py_UNICODE c = '\0';
8123 Py_UNICODE fill;
8124 PyObject *v = NULL;
8125 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008126 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008129 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130
8131 fmt++;
8132 if (*fmt == '(') {
8133 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008134 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 PyObject *key;
8136 int pcount = 1;
8137
8138 if (dict == NULL) {
8139 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008140 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141 goto onError;
8142 }
8143 ++fmt;
8144 --fmtcnt;
8145 keystart = fmt;
8146 /* Skip over balanced parentheses */
8147 while (pcount > 0 && --fmtcnt >= 0) {
8148 if (*fmt == ')')
8149 --pcount;
8150 else if (*fmt == '(')
8151 ++pcount;
8152 fmt++;
8153 }
8154 keylen = fmt - keystart - 1;
8155 if (fmtcnt < 0 || pcount > 0) {
8156 PyErr_SetString(PyExc_ValueError,
8157 "incomplete format key");
8158 goto onError;
8159 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008160#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008161 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 then looked up since Python uses strings to hold
8163 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008164 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 key = PyUnicode_EncodeUTF8(keystart,
8166 keylen,
8167 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008168#else
8169 key = PyUnicode_FromUnicode(keystart, keylen);
8170#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 if (key == NULL)
8172 goto onError;
8173 if (args_owned) {
8174 Py_DECREF(args);
8175 args_owned = 0;
8176 }
8177 args = PyObject_GetItem(dict, key);
8178 Py_DECREF(key);
8179 if (args == NULL) {
8180 goto onError;
8181 }
8182 args_owned = 1;
8183 arglen = -1;
8184 argidx = -2;
8185 }
8186 while (--fmtcnt >= 0) {
8187 switch (c = *fmt++) {
8188 case '-': flags |= F_LJUST; continue;
8189 case '+': flags |= F_SIGN; continue;
8190 case ' ': flags |= F_BLANK; continue;
8191 case '#': flags |= F_ALT; continue;
8192 case '0': flags |= F_ZERO; continue;
8193 }
8194 break;
8195 }
8196 if (c == '*') {
8197 v = getnextarg(args, arglen, &argidx);
8198 if (v == NULL)
8199 goto onError;
8200 if (!PyInt_Check(v)) {
8201 PyErr_SetString(PyExc_TypeError,
8202 "* wants int");
8203 goto onError;
8204 }
8205 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008206 if (width == -1 && PyErr_Occurred())
8207 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208 if (width < 0) {
8209 flags |= F_LJUST;
8210 width = -width;
8211 }
8212 if (--fmtcnt >= 0)
8213 c = *fmt++;
8214 }
8215 else if (c >= '0' && c <= '9') {
8216 width = c - '0';
8217 while (--fmtcnt >= 0) {
8218 c = *fmt++;
8219 if (c < '0' || c > '9')
8220 break;
8221 if ((width*10) / 10 != width) {
8222 PyErr_SetString(PyExc_ValueError,
8223 "width too big");
8224 goto onError;
8225 }
8226 width = width*10 + (c - '0');
8227 }
8228 }
8229 if (c == '.') {
8230 prec = 0;
8231 if (--fmtcnt >= 0)
8232 c = *fmt++;
8233 if (c == '*') {
8234 v = getnextarg(args, arglen, &argidx);
8235 if (v == NULL)
8236 goto onError;
8237 if (!PyInt_Check(v)) {
8238 PyErr_SetString(PyExc_TypeError,
8239 "* wants int");
8240 goto onError;
8241 }
8242 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008243 if (prec == -1 && PyErr_Occurred())
8244 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 if (prec < 0)
8246 prec = 0;
8247 if (--fmtcnt >= 0)
8248 c = *fmt++;
8249 }
8250 else if (c >= '0' && c <= '9') {
8251 prec = c - '0';
8252 while (--fmtcnt >= 0) {
8253 c = Py_CHARMASK(*fmt++);
8254 if (c < '0' || c > '9')
8255 break;
8256 if ((prec*10) / 10 != prec) {
8257 PyErr_SetString(PyExc_ValueError,
8258 "prec too big");
8259 goto onError;
8260 }
8261 prec = prec*10 + (c - '0');
8262 }
8263 }
8264 } /* prec */
8265 if (fmtcnt >= 0) {
8266 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 if (--fmtcnt >= 0)
8268 c = *fmt++;
8269 }
8270 }
8271 if (fmtcnt < 0) {
8272 PyErr_SetString(PyExc_ValueError,
8273 "incomplete format");
8274 goto onError;
8275 }
8276 if (c != '%') {
8277 v = getnextarg(args, arglen, &argidx);
8278 if (v == NULL)
8279 goto onError;
8280 }
8281 sign = 0;
8282 fill = ' ';
8283 switch (c) {
8284
8285 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008286 pbuf = formatbuf;
8287 /* presume that buffer length is at least 1 */
8288 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 len = 1;
8290 break;
8291
8292 case 's':
8293 case 'r':
8294 if (PyUnicode_Check(v) && c == 's') {
8295 temp = v;
8296 Py_INCREF(temp);
8297 }
8298 else {
8299 PyObject *unicode;
8300 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008301 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 else
8303 temp = PyObject_Repr(v);
8304 if (temp == NULL)
8305 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008306 if (PyUnicode_Check(temp))
8307 /* nothing to do */;
8308 else if (PyString_Check(temp)) {
8309 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008310 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008312 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008314 Py_DECREF(temp);
8315 temp = unicode;
8316 if (temp == NULL)
8317 goto onError;
8318 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008319 else {
8320 Py_DECREF(temp);
8321 PyErr_SetString(PyExc_TypeError,
8322 "%s argument has non-string str()");
8323 goto onError;
8324 }
8325 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008326 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 len = PyUnicode_GET_SIZE(temp);
8328 if (prec >= 0 && len > prec)
8329 len = prec;
8330 break;
8331
8332 case 'i':
8333 case 'd':
8334 case 'u':
8335 case 'o':
8336 case 'x':
8337 case 'X':
8338 if (c == 'i')
8339 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008340 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008341 temp = formatlong(v, flags, prec, c);
8342 if (!temp)
8343 goto onError;
8344 pbuf = PyUnicode_AS_UNICODE(temp);
8345 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008346 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008348 else {
8349 pbuf = formatbuf;
8350 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8351 flags, prec, c, v);
8352 if (len < 0)
8353 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008354 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008355 }
8356 if (flags & F_ZERO)
8357 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 break;
8359
8360 case 'e':
8361 case 'E':
8362 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008363 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 case 'g':
8365 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008366 if (c == 'F')
8367 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008368 pbuf = formatbuf;
8369 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8370 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371 if (len < 0)
8372 goto onError;
8373 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008374 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 fill = '0';
8376 break;
8377
8378 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008379 pbuf = formatbuf;
8380 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 if (len < 0)
8382 goto onError;
8383 break;
8384
8385 default:
8386 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008387 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008388 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008389 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008390 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008391 (Py_ssize_t)(fmt - 1 -
8392 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 goto onError;
8394 }
8395 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008396 if (*pbuf == '-' || *pbuf == '+') {
8397 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 len--;
8399 }
8400 else if (flags & F_SIGN)
8401 sign = '+';
8402 else if (flags & F_BLANK)
8403 sign = ' ';
8404 else
8405 sign = 0;
8406 }
8407 if (width < len)
8408 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008409 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 reslen -= rescnt;
8411 rescnt = width + fmtcnt + 100;
8412 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008413 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008414 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008415 PyErr_NoMemory();
8416 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008417 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008418 if (_PyUnicode_Resize(&result, reslen) < 0) {
8419 Py_XDECREF(temp);
8420 goto onError;
8421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 res = PyUnicode_AS_UNICODE(result)
8423 + reslen - rescnt;
8424 }
8425 if (sign) {
8426 if (fill != ' ')
8427 *res++ = sign;
8428 rescnt--;
8429 if (width > len)
8430 width--;
8431 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008432 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8433 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008434 assert(pbuf[1] == c);
8435 if (fill != ' ') {
8436 *res++ = *pbuf++;
8437 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008438 }
Tim Petersfff53252001-04-12 18:38:48 +00008439 rescnt -= 2;
8440 width -= 2;
8441 if (width < 0)
8442 width = 0;
8443 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 if (width > len && !(flags & F_LJUST)) {
8446 do {
8447 --rescnt;
8448 *res++ = fill;
8449 } while (--width > len);
8450 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008451 if (fill == ' ') {
8452 if (sign)
8453 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008454 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008455 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008456 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008457 *res++ = *pbuf++;
8458 *res++ = *pbuf++;
8459 }
8460 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008461 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 res += len;
8463 rescnt -= len;
8464 while (--width >= len) {
8465 --rescnt;
8466 *res++ = ' ';
8467 }
8468 if (dict && (argidx < arglen) && c != '%') {
8469 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008470 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008471 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472 goto onError;
8473 }
8474 Py_XDECREF(temp);
8475 } /* '%' */
8476 } /* until end */
8477 if (argidx < arglen && !dict) {
8478 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008479 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480 goto onError;
8481 }
8482
Thomas Woutersa96affe2006-03-12 00:29:36 +00008483 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8484 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 if (args_owned) {
8486 Py_DECREF(args);
8487 }
8488 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 return (PyObject *)result;
8490
8491 onError:
8492 Py_XDECREF(result);
8493 Py_DECREF(uformat);
8494 if (args_owned) {
8495 Py_DECREF(args);
8496 }
8497 return NULL;
8498}
8499
8500static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008501 (readbufferproc) unicode_buffer_getreadbuf,
8502 (writebufferproc) unicode_buffer_getwritebuf,
8503 (segcountproc) unicode_buffer_getsegcount,
8504 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505};
8506
Jeremy Hylton938ace62002-07-17 16:30:39 +00008507static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008508unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8509
Tim Peters6d6c1a32001-08-02 04:15:00 +00008510static PyObject *
8511unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8512{
8513 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008514 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008515 char *encoding = NULL;
8516 char *errors = NULL;
8517
Guido van Rossume023fe02001-08-30 03:12:59 +00008518 if (type != &PyUnicode_Type)
8519 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008520 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8521 kwlist, &x, &encoding, &errors))
8522 return NULL;
8523 if (x == NULL)
8524 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008525 if (encoding == NULL && errors == NULL)
8526 return PyObject_Unicode(x);
8527 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008528 return PyUnicode_FromEncodedObject(x, encoding, errors);
8529}
8530
Guido van Rossume023fe02001-08-30 03:12:59 +00008531static PyObject *
8532unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8533{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008534 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008535 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008536
8537 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8538 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8539 if (tmp == NULL)
8540 return NULL;
8541 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008542 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008543 if (pnew == NULL) {
8544 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008545 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008546 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008547 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8548 if (pnew->str == NULL) {
8549 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008550 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008551 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008552 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008553 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008554 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8555 pnew->length = n;
8556 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008557 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008558 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008559}
8560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008561PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008562"unicode(string [, encoding[, errors]]) -> object\n\
8563\n\
8564Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008565encoding defaults to the current default string encoding.\n\
8566errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008567
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008568static PyObject *unicode_iter(PyObject *seq);
8569
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570PyTypeObject PyUnicode_Type = {
8571 PyObject_HEAD_INIT(&PyType_Type)
8572 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008573 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 sizeof(PyUnicodeObject), /* tp_size */
8575 0, /* tp_itemsize */
8576 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008577 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008579 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008581 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008582 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008583 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008585 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 (hashfunc) unicode_hash, /* tp_hash*/
8587 0, /* tp_call*/
8588 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008589 PyObject_GenericGetAttr, /* tp_getattro */
8590 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008592 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8593 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008594 unicode_doc, /* tp_doc */
8595 0, /* tp_traverse */
8596 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008597 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008598 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008599 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008600 0, /* tp_iternext */
8601 unicode_methods, /* tp_methods */
8602 0, /* tp_members */
8603 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008604 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008605 0, /* tp_dict */
8606 0, /* tp_descr_get */
8607 0, /* tp_descr_set */
8608 0, /* tp_dictoffset */
8609 0, /* tp_init */
8610 0, /* tp_alloc */
8611 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008612 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613};
8614
8615/* Initialize the Unicode implementation */
8616
Thomas Wouters78890102000-07-22 19:25:51 +00008617void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008619 int i;
8620
Thomas Wouters477c8d52006-05-27 19:21:47 +00008621 /* XXX - move this array to unicodectype.c ? */
8622 Py_UNICODE linebreak[] = {
8623 0x000A, /* LINE FEED */
8624 0x000D, /* CARRIAGE RETURN */
8625 0x001C, /* FILE SEPARATOR */
8626 0x001D, /* GROUP SEPARATOR */
8627 0x001E, /* RECORD SEPARATOR */
8628 0x0085, /* NEXT LINE */
8629 0x2028, /* LINE SEPARATOR */
8630 0x2029, /* PARAGRAPH SEPARATOR */
8631 };
8632
Fred Drakee4315f52000-05-09 19:53:39 +00008633 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008634 unicode_freelist = NULL;
8635 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008637 if (!unicode_empty)
8638 return;
8639
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008640 for (i = 0; i < 256; i++)
8641 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008642 if (PyType_Ready(&PyUnicode_Type) < 0)
8643 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008644
8645 /* initialize the linebreak bloom filter */
8646 bloom_linebreak = make_bloom_mask(
8647 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8648 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008649
8650 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651}
8652
8653/* Finalize the Unicode implementation */
8654
8655void
Thomas Wouters78890102000-07-22 19:25:51 +00008656_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008658 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008659 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008661 Py_XDECREF(unicode_empty);
8662 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008664 for (i = 0; i < 256; i++) {
8665 if (unicode_latin1[i]) {
8666 Py_DECREF(unicode_latin1[i]);
8667 unicode_latin1[i] = NULL;
8668 }
8669 }
8670
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008671 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 PyUnicodeObject *v = u;
8673 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008674 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008675 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008676 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008677 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008679 unicode_freelist = NULL;
8680 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008682
Walter Dörwald16807132007-05-25 13:52:07 +00008683void
8684PyUnicode_InternInPlace(PyObject **p)
8685{
8686 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8687 PyObject *t;
8688 if (s == NULL || !PyUnicode_Check(s))
8689 Py_FatalError(
8690 "PyUnicode_InternInPlace: unicode strings only please!");
8691 /* If it's a subclass, we don't really know what putting
8692 it in the interned dict might do. */
8693 if (!PyUnicode_CheckExact(s))
8694 return;
8695 if (PyUnicode_CHECK_INTERNED(s))
8696 return;
8697 if (interned == NULL) {
8698 interned = PyDict_New();
8699 if (interned == NULL) {
8700 PyErr_Clear(); /* Don't leave an exception */
8701 return;
8702 }
8703 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008704 /* It might be that the GetItem call fails even
8705 though the key is present in the dictionary,
8706 namely when this happens during a stack overflow. */
8707 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008708 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008709 Py_END_ALLOW_RECURSION
8710
Walter Dörwald16807132007-05-25 13:52:07 +00008711 if (t) {
8712 Py_INCREF(t);
8713 Py_DECREF(*p);
8714 *p = t;
8715 return;
8716 }
8717
Martin v. Löwis5b222132007-06-10 09:51:05 +00008718 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008719 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8720 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008721 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008722 return;
8723 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008724 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008725 /* The two references in interned are not counted by refcnt.
8726 The deallocator will take care of this */
8727 s->ob_refcnt -= 2;
8728 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8729}
8730
8731void
8732PyUnicode_InternImmortal(PyObject **p)
8733{
8734 PyUnicode_InternInPlace(p);
8735 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8736 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8737 Py_INCREF(*p);
8738 }
8739}
8740
8741PyObject *
8742PyUnicode_InternFromString(const char *cp)
8743{
8744 PyObject *s = PyUnicode_FromString(cp);
8745 if (s == NULL)
8746 return NULL;
8747 PyUnicode_InternInPlace(&s);
8748 return s;
8749}
8750
8751void _Py_ReleaseInternedUnicodeStrings(void)
8752{
8753 PyObject *keys;
8754 PyUnicodeObject *s;
8755 Py_ssize_t i, n;
8756 Py_ssize_t immortal_size = 0, mortal_size = 0;
8757
8758 if (interned == NULL || !PyDict_Check(interned))
8759 return;
8760 keys = PyDict_Keys(interned);
8761 if (keys == NULL || !PyList_Check(keys)) {
8762 PyErr_Clear();
8763 return;
8764 }
8765
8766 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8767 detector, interned unicode strings are not forcibly deallocated;
8768 rather, we give them their stolen references back, and then clear
8769 and DECREF the interned dict. */
8770
8771 n = PyList_GET_SIZE(keys);
8772 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8773 n);
8774 for (i = 0; i < n; i++) {
8775 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8776 switch (s->state) {
8777 case SSTATE_NOT_INTERNED:
8778 /* XXX Shouldn't happen */
8779 break;
8780 case SSTATE_INTERNED_IMMORTAL:
8781 s->ob_refcnt += 1;
8782 immortal_size += s->length;
8783 break;
8784 case SSTATE_INTERNED_MORTAL:
8785 s->ob_refcnt += 2;
8786 mortal_size += s->length;
8787 break;
8788 default:
8789 Py_FatalError("Inconsistent interned string state.");
8790 }
8791 s->state = SSTATE_NOT_INTERNED;
8792 }
8793 fprintf(stderr, "total size of all interned strings: "
8794 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8795 "mortal/immortal\n", mortal_size, immortal_size);
8796 Py_DECREF(keys);
8797 PyDict_Clear(interned);
8798 Py_DECREF(interned);
8799 interned = NULL;
8800}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008801
8802
8803/********************* Unicode Iterator **************************/
8804
8805typedef struct {
8806 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008807 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008808 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8809} unicodeiterobject;
8810
8811static void
8812unicodeiter_dealloc(unicodeiterobject *it)
8813{
8814 _PyObject_GC_UNTRACK(it);
8815 Py_XDECREF(it->it_seq);
8816 PyObject_GC_Del(it);
8817}
8818
8819static int
8820unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8821{
8822 Py_VISIT(it->it_seq);
8823 return 0;
8824}
8825
8826static PyObject *
8827unicodeiter_next(unicodeiterobject *it)
8828{
8829 PyUnicodeObject *seq;
8830 PyObject *item;
8831
8832 assert(it != NULL);
8833 seq = it->it_seq;
8834 if (seq == NULL)
8835 return NULL;
8836 assert(PyUnicode_Check(seq));
8837
8838 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008839 item = PyUnicode_FromUnicode(
8840 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008841 if (item != NULL)
8842 ++it->it_index;
8843 return item;
8844 }
8845
8846 Py_DECREF(seq);
8847 it->it_seq = NULL;
8848 return NULL;
8849}
8850
8851static PyObject *
8852unicodeiter_len(unicodeiterobject *it)
8853{
8854 Py_ssize_t len = 0;
8855 if (it->it_seq)
8856 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8857 return PyInt_FromSsize_t(len);
8858}
8859
8860PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8861
8862static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008863 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8864 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008865 {NULL, NULL} /* sentinel */
8866};
8867
8868PyTypeObject PyUnicodeIter_Type = {
8869 PyObject_HEAD_INIT(&PyType_Type)
8870 0, /* ob_size */
8871 "unicodeiterator", /* tp_name */
8872 sizeof(unicodeiterobject), /* tp_basicsize */
8873 0, /* tp_itemsize */
8874 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008875 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008876 0, /* tp_print */
8877 0, /* tp_getattr */
8878 0, /* tp_setattr */
8879 0, /* tp_compare */
8880 0, /* tp_repr */
8881 0, /* tp_as_number */
8882 0, /* tp_as_sequence */
8883 0, /* tp_as_mapping */
8884 0, /* tp_hash */
8885 0, /* tp_call */
8886 0, /* tp_str */
8887 PyObject_GenericGetAttr, /* tp_getattro */
8888 0, /* tp_setattro */
8889 0, /* tp_as_buffer */
8890 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8891 0, /* tp_doc */
8892 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8893 0, /* tp_clear */
8894 0, /* tp_richcompare */
8895 0, /* tp_weaklistoffset */
8896 PyObject_SelfIter, /* tp_iter */
8897 (iternextfunc)unicodeiter_next, /* tp_iternext */
8898 unicodeiter_methods, /* tp_methods */
8899 0,
8900};
8901
8902static PyObject *
8903unicode_iter(PyObject *seq)
8904{
8905 unicodeiterobject *it;
8906
8907 if (!PyUnicode_Check(seq)) {
8908 PyErr_BadInternalCall();
8909 return NULL;
8910 }
8911 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8912 if (it == NULL)
8913 return NULL;
8914 it->it_index = 0;
8915 Py_INCREF(seq);
8916 it->it_seq = (PyUnicodeObject *)seq;
8917 _PyObject_GC_TRACK(it);
8918 return (PyObject *)it;
8919}
8920
Martin v. Löwis5b222132007-06-10 09:51:05 +00008921size_t
8922Py_UNICODE_strlen(const Py_UNICODE *u)
8923{
8924 int res = 0;
8925 while(*u++)
8926 res++;
8927 return res;
8928}
8929
8930Py_UNICODE*
8931Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8932{
8933 Py_UNICODE *u = s1;
8934 while ((*u++ = *s2++));
8935 return s1;
8936}
8937
8938Py_UNICODE*
8939Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8940{
8941 Py_UNICODE *u = s1;
8942 while ((*u++ = *s2++))
8943 if (n-- == 0)
8944 break;
8945 return s1;
8946}
8947
8948int
8949Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8950{
8951 while (*s1 && *s2 && *s1 == *s2)
8952 s1++, s2++;
8953 if (*s1 && *s2)
8954 return (*s1 < *s2) ? -1 : +1;
8955 if (*s1)
8956 return 1;
8957 if (*s2)
8958 return -1;
8959 return 0;
8960}
8961
8962Py_UNICODE*
8963Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
8964{
8965 const Py_UNICODE *p;
8966 for (p = s; *p; p++)
8967 if (*p == c)
8968 return (Py_UNICODE*)p;
8969 return NULL;
8970}
8971
8972
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008973#ifdef __cplusplus
8974}
8975#endif
8976
8977
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008978/*
8979Local variables:
8980c-basic-offset: 4
8981indent-tabs-mode: nil
8982End:
8983*/