blob: 47109a5b3c6a318ab6fbd1eb698fb6f4dd8c7ae9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
138
/* the linebreak mask is set up by _PyUnicode_Init() */
140
/* Type of a bloom bitmask.  The bit index is taken from the least
   significant 5 bits of a character, so only bits 0..31 are ever used. */
#define BLOOM_MASK unsigned long

/* Bloom mask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.  The shifted
   constant is an unsigned long: shifting a plain int 1 left by 31 would
   overflow a 32-bit int, which is undefined behaviour.  mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter with the bloom mask, then the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Non-zero if chr is a member of set: the bloom mask is consulted first
   as a cheap filter and the exact (linear) search only runs when the
   mask does not rule the character out.  The whole expansion is
   parenthesized so the macro behaves as a single operand, e.g. under
   '!' or next to other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000308 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000341 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000355 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
/* Internal API for use in unicodeobject.c only !
   Casts its first argument to PyObject ** and forwards both arguments
   to PyUnicode_Resize(). */
#define _PyUnicode_Resize(objaddr, newlen) \
    PyUnicode_Resize((PyObject **)(objaddr), newlen)
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000430 some optimizations which share commonly used objects.
431 Also, this means the input must be UTF-8, so fall back to the
432 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000433 if (u != NULL) {
434
435 /* Optimization for empty strings */
436 if (size == 0 && unicode_empty != NULL) {
437 Py_INCREF(unicode_empty);
438 return (PyObject *)unicode_empty;
439 }
440
Martin v. Löwis9c121062007-08-05 20:26:11 +0000441 /* Single characters are shared when using this constructor.
442 Restrict to ASCII, since the input must be UTF-8. */
443 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000444 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000445 if (!unicode) {
446 unicode = _PyUnicode_New(1);
447 if (!unicode)
448 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000449 unicode->str[0] = Py_CHARMASK(*u);
450 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000451 }
452 Py_INCREF(unicode);
453 return (PyObject *)unicode;
454 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000455
456 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000457 }
458
Walter Dörwald55507312007-05-18 13:12:10 +0000459 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 if (!unicode)
461 return NULL;
462
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 return (PyObject *)unicode;
464}
465
Walter Dörwaldd2034312007-05-18 16:29:38 +0000466PyObject *PyUnicode_FromString(const char *u)
467{
468 size_t size = strlen(u);
469 if (size > PY_SSIZE_T_MAX) {
470 PyErr_SetString(PyExc_OverflowError, "input too long");
471 return NULL;
472 }
473
474 return PyUnicode_FromStringAndSize(u, size);
475}
476
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477#ifdef HAVE_WCHAR_H
478
479PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000480 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481{
482 PyUnicodeObject *unicode;
483
484 if (w == NULL) {
485 PyErr_BadInternalCall();
486 return NULL;
487 }
488
489 unicode = _PyUnicode_New(size);
490 if (!unicode)
491 return NULL;
492
493 /* Copy the wchar_t data into the new object */
494#ifdef HAVE_USABLE_WCHAR_T
495 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000496#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 {
498 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000499 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000501 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 *u++ = *w++;
503 }
504#endif
505
506 return (PyObject *)unicode;
507}
508
Walter Dörwald346737f2007-05-31 10:44:43 +0000509static void
510makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
511{
512 *fmt++ = '%';
513 if (width) {
514 if (zeropad)
515 *fmt++ = '0';
516 fmt += sprintf(fmt, "%d", width);
517 }
518 if (precision)
519 fmt += sprintf(fmt, ".%d", precision);
520 if (longflag)
521 *fmt++ = 'l';
522 else if (size_tflag) {
523 char *f = PY_FORMAT_SIZE_T;
524 while (*f)
525 *fmt++ = *f++;
526 }
527 *fmt++ = c;
528 *fmt = '\0';
529}
530
/* Append the NUL-terminated char string `string` to the output, one
   element per char.  Relies on two variables of the calling scope:
   `copy` (scratch read pointer) and `s` (output write pointer, advanced
   past the appended characters). */
#define appendstring(string) \
    {copy = (string); while (*copy) {*s = *copy; s++; copy++;}}
532
533PyObject *
534PyUnicode_FromFormatV(const char *format, va_list vargs)
535{
536 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000537 Py_ssize_t callcount = 0;
538 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000539 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000540 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000541 int width = 0;
542 int precision = 0;
543 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 const char* f;
545 Py_UNICODE *s;
546 PyObject *string;
547 /* used by sprintf */
548 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 /* use abuffer instead of buffer, if we need more space
550 * (which can happen if there's a format specifier with width). */
551 char *abuffer = NULL;
552 char *realbuffer;
553 Py_ssize_t abuffersize = 0;
554 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000555 const char *copy;
556
557#ifdef VA_LIST_IS_ARRAY
558 Py_MEMCPY(count, vargs, sizeof(va_list));
559#else
560#ifdef __va_copy
561 __va_copy(count, vargs);
562#else
563 count = vargs;
564#endif
565#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000566 /* step 1: count the number of %S/%R format specifications
567 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
568 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000569 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000571 ++callcount;
572 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 /* step 2: allocate memory for the results of
574 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 if (callcount) {
576 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
577 if (!callresults) {
578 PyErr_NoMemory();
579 return NULL;
580 }
581 callresult = callresults;
582 }
583 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000584 for (f = format; *f; f++) {
585 if (*f == '%') {
586 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000587 width = 0;
588 while (isdigit(Py_CHARMASK(*f)))
589 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000590 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
591 ;
592
593 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
594 * they don't affect the amount of space we reserve.
595 */
596 if ((*f == 'l' || *f == 'z') &&
597 (f[1] == 'd' || f[1] == 'u'))
598 ++f;
599
600 switch (*f) {
601 case 'c':
602 (void)va_arg(count, int);
603 /* fall through... */
604 case '%':
605 n++;
606 break;
607 case 'd': case 'u': case 'i': case 'x':
608 (void) va_arg(count, int);
609 /* 20 bytes is enough to hold a 64-bit
610 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000611 This isn't enough for octal.
612 If a width is specified we need more
613 (which we allocate later). */
614 if (width < 20)
615 width = 20;
616 n += width;
617 if (abuffersize < width)
618 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000619 break;
620 case 's':
621 n += strlen(va_arg(count, char*));
622 break;
623 case 'U':
624 {
625 PyObject *obj = va_arg(count, PyObject *);
626 assert(obj && PyUnicode_Check(obj));
627 n += PyUnicode_GET_SIZE(obj);
628 break;
629 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000630 case 'V':
631 {
632 PyObject *obj = va_arg(count, PyObject *);
633 const char *str = va_arg(count, const char *);
634 assert(obj || str);
635 assert(!obj || PyUnicode_Check(obj));
636 if (obj)
637 n += PyUnicode_GET_SIZE(obj);
638 else
639 n += strlen(str);
640 break;
641 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000642 case 'S':
643 {
644 PyObject *obj = va_arg(count, PyObject *);
645 PyObject *str;
646 assert(obj);
647 str = PyObject_Unicode(obj);
648 if (!str)
649 goto fail;
650 n += PyUnicode_GET_SIZE(str);
651 /* Remember the str and switch to the next slot */
652 *callresult++ = str;
653 break;
654 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000655 case 'R':
656 {
657 PyObject *obj = va_arg(count, PyObject *);
658 PyObject *repr;
659 assert(obj);
660 repr = PyObject_Repr(obj);
661 if (!repr)
662 goto fail;
663 n += PyUnicode_GET_SIZE(repr);
664 /* Remember the repr and switch to the next slot */
665 *callresult++ = repr;
666 break;
667 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000668 case 'p':
669 (void) va_arg(count, int);
670 /* maximum 64-bit pointer representation:
671 * 0xffffffffffffffff
672 * so 19 characters is enough.
673 * XXX I count 18 -- what's the extra for?
674 */
675 n += 19;
676 break;
677 default:
678 /* if we stumble upon an unknown
679 formatting code, copy the rest of
680 the format string to the output
681 string. (we cannot just skip the
682 code, since there's no way to know
683 what's in the argument list) */
684 n += strlen(p);
685 goto expand;
686 }
687 } else
688 n++;
689 }
690 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000691 if (abuffersize > 20) {
692 abuffer = PyMem_Malloc(abuffersize);
693 if (!abuffer) {
694 PyErr_NoMemory();
695 goto fail;
696 }
697 realbuffer = abuffer;
698 }
699 else
700 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000701 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000702 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000703 we don't have to resize the string.
704 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 string = PyUnicode_FromUnicode(NULL, n);
706 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000707 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708
709 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000710 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 for (f = format; *f; f++) {
713 if (*f == '%') {
714 const char* p = f++;
715 int longflag = 0;
716 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000717 zeropad = (*f == '0');
718 /* parse the width.precision part */
719 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000720 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 width = (width*10) + *f++ - '0';
722 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 if (*f == '.') {
724 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000726 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 /* handle the long flag, but only for %ld and %lu.
729 others can be added when necessary. */
730 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
731 longflag = 1;
732 ++f;
733 }
734 /* handle the size_t flag. */
735 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
736 size_tflag = 1;
737 ++f;
738 }
739
740 switch (*f) {
741 case 'c':
742 *s++ = va_arg(vargs, int);
743 break;
744 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000745 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000747 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, int));
752 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 break;
754 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000757 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000758 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
762 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 break;
764 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
766 sprintf(realbuffer, fmt, va_arg(vargs, int));
767 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000768 break;
769 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000770 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
771 sprintf(realbuffer, fmt, va_arg(vargs, int));
772 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000773 break;
774 case 's':
775 p = va_arg(vargs, char*);
776 appendstring(p);
777 break;
778 case 'U':
779 {
780 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000781 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
782 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
783 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000784 break;
785 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000786 case 'V':
787 {
788 PyObject *obj = va_arg(vargs, PyObject *);
789 const char *str = va_arg(vargs, const char *);
790 if (obj) {
791 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
792 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
793 s += size;
794 } else {
795 appendstring(str);
796 }
797 break;
798 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000799 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000800 case 'R':
801 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000802 Py_UNICODE *ucopy;
803 Py_ssize_t usize;
804 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000805 /* unused, since we already have the result */
806 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000807 ucopy = PyUnicode_AS_UNICODE(*callresult);
808 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 for (upos = 0; upos<usize;)
810 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000811 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000813 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000814 ++callresult;
815 break;
816 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817 case 'p':
818 sprintf(buffer, "%p", va_arg(vargs, void*));
819 /* %p is ill-defined: ensure leading 0x. */
820 if (buffer[1] == 'X')
821 buffer[1] = 'x';
822 else if (buffer[1] != 'x') {
823 memmove(buffer+2, buffer, strlen(buffer)+1);
824 buffer[0] = '0';
825 buffer[1] = 'x';
826 }
827 appendstring(buffer);
828 break;
829 case '%':
830 *s++ = '%';
831 break;
832 default:
833 appendstring(p);
834 goto end;
835 }
836 } else
837 *s++ = *f;
838 }
839
840 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000841 if (callresults)
842 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 if (abuffer)
844 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000845 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
846 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000847 fail:
848 if (callresults) {
849 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000850 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 Py_DECREF(*callresult2);
852 ++callresult2;
853 }
854 PyMem_Free(callresults);
855 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000856 if (abuffer)
857 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000858 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859}
860
861#undef appendstring
862
863PyObject *
864PyUnicode_FromFormat(const char *format, ...)
865{
866 PyObject* ret;
867 va_list vargs;
868
869#ifdef HAVE_STDARG_PROTOTYPES
870 va_start(vargs, format);
871#else
872 va_start(vargs);
873#endif
874 ret = PyUnicode_FromFormatV(format, vargs);
875 va_end(vargs);
876 return ret;
877}
878
Martin v. Löwis18e16552006-02-15 17:27:45 +0000879Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
880 wchar_t *w,
881 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882{
883 if (unicode == NULL) {
884 PyErr_BadInternalCall();
885 return -1;
886 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000887
888 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890 size = PyUnicode_GET_SIZE(unicode) + 1;
891
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892#ifdef HAVE_USABLE_WCHAR_T
893 memcpy(w, unicode->str, size * sizeof(wchar_t));
894#else
895 {
896 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000897 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000899 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 *w++ = *u++;
901 }
902#endif
903
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000904 if (size > PyUnicode_GET_SIZE(unicode))
905 return PyUnicode_GET_SIZE(unicode);
906 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000907 return size;
908}
909
910#endif
911
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000912PyObject *PyUnicode_FromOrdinal(int ordinal)
913{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000914 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916 if (ordinal < 0 || ordinal > 0x10ffff) {
917 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000918 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 return NULL;
920 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921
922#ifndef Py_UNICODE_WIDE
923 if (ordinal > 0xffff) {
924 ordinal -= 0x10000;
925 s[0] = 0xD800 | (ordinal >> 10);
926 s[1] = 0xDC00 | (ordinal & 0x3FF);
927 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000928 }
929#endif
930
Hye-Shik Chang40574832004-04-06 07:24:51 +0000931 s[0] = (Py_UNICODE)ordinal;
932 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000933}
934
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935PyObject *PyUnicode_FromObject(register PyObject *obj)
936{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000937 /* XXX Perhaps we should make this API an alias of
938 PyObject_Unicode() instead ?! */
939 if (PyUnicode_CheckExact(obj)) {
940 Py_INCREF(obj);
941 return obj;
942 }
943 if (PyUnicode_Check(obj)) {
944 /* For a Unicode subtype that's not a Unicode object,
945 return a true Unicode object with the same data. */
946 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
947 PyUnicode_GET_SIZE(obj));
948 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000949 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
950}
951
952PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
953 const char *encoding,
954 const char *errors)
955{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000956 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000957 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000958 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000959
Guido van Rossumd57fd912000-03-10 22:53:23 +0000960 if (obj == NULL) {
961 PyErr_BadInternalCall();
962 return NULL;
963 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000964
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000965#if 0
966 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000967 that no encodings is given and then redirect to
968 PyObject_Unicode() which then applies the additional logic for
969 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000970
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000971 NOTE: This API should really only be used for object which
972 represent *encoded* Unicode !
973
974 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000975 if (PyUnicode_Check(obj)) {
976 if (encoding) {
977 PyErr_SetString(PyExc_TypeError,
978 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000980 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000981 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000982 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000983#else
984 if (PyUnicode_Check(obj)) {
985 PyErr_SetString(PyExc_TypeError,
986 "decoding Unicode is not supported");
987 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000988 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989#endif
990
991 /* Coerce object */
992 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000993 s = PyString_AS_STRING(obj);
994 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000995 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000996 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
997 /* Overwrite the error message with something more useful in
998 case of a TypeError. */
999 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001000 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001001 "coercing to Unicode: need string or buffer, "
1002 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001003 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001004 goto onError;
1005 }
Tim Petersced69f82003-09-16 20:30:58 +00001006
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001007 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 if (len == 0) {
1009 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 }
Tim Petersced69f82003-09-16 20:30:58 +00001012 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001014
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001015 return v;
1016
1017 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019}
1020
1021PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001022 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 const char *encoding,
1024 const char *errors)
1025{
1026 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001027
1028 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001029 encoding = PyUnicode_GetDefaultEncoding();
1030
1031 /* Shortcuts for common default encodings */
1032 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001034 else if (strcmp(encoding, "latin-1") == 0)
1035 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001036#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1037 else if (strcmp(encoding, "mbcs") == 0)
1038 return PyUnicode_DecodeMBCS(s, size, errors);
1039#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001040 else if (strcmp(encoding, "ascii") == 0)
1041 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 /* Decode via the codec registry */
1044 buffer = PyBuffer_FromMemory((void *)s, size);
1045 if (buffer == NULL)
1046 goto onError;
1047 unicode = PyCodec_Decode(buffer, encoding, errors);
1048 if (unicode == NULL)
1049 goto onError;
1050 if (!PyUnicode_Check(unicode)) {
1051 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001052 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001053 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 Py_DECREF(unicode);
1055 goto onError;
1056 }
1057 Py_DECREF(buffer);
1058 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001059
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 onError:
1061 Py_XDECREF(buffer);
1062 return NULL;
1063}
1064
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001065PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1066 const char *encoding,
1067 const char *errors)
1068{
1069 PyObject *v;
1070
1071 if (!PyUnicode_Check(unicode)) {
1072 PyErr_BadArgument();
1073 goto onError;
1074 }
1075
1076 if (encoding == NULL)
1077 encoding = PyUnicode_GetDefaultEncoding();
1078
1079 /* Decode via the codec registry */
1080 v = PyCodec_Decode(unicode, encoding, errors);
1081 if (v == NULL)
1082 goto onError;
1083 return v;
1084
1085 onError:
1086 return NULL;
1087}
1088
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 const char *encoding,
1092 const char *errors)
1093{
1094 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001095
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 unicode = PyUnicode_FromUnicode(s, size);
1097 if (unicode == NULL)
1098 return NULL;
1099 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1100 Py_DECREF(unicode);
1101 return v;
1102}
1103
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001104PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1105 const char *encoding,
1106 const char *errors)
1107{
1108 PyObject *v;
1109
1110 if (!PyUnicode_Check(unicode)) {
1111 PyErr_BadArgument();
1112 goto onError;
1113 }
1114
1115 if (encoding == NULL)
1116 encoding = PyUnicode_GetDefaultEncoding();
1117
1118 /* Encode via the codec registry */
1119 v = PyCodec_Encode(unicode, encoding, errors);
1120 if (v == NULL)
1121 goto onError;
1122 return v;
1123
1124 onError:
1125 return NULL;
1126}
1127
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1129 const char *encoding,
1130 const char *errors)
1131{
1132 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001133
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 if (!PyUnicode_Check(unicode)) {
1135 PyErr_BadArgument();
1136 goto onError;
1137 }
Fred Drakee4315f52000-05-09 19:53:39 +00001138
Tim Petersced69f82003-09-16 20:30:58 +00001139 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001140 encoding = PyUnicode_GetDefaultEncoding();
1141
1142 /* Shortcuts for common default encodings */
1143 if (errors == NULL) {
1144 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001145 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001146 else if (strcmp(encoding, "latin-1") == 0)
1147 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001148#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1149 else if (strcmp(encoding, "mbcs") == 0)
1150 return PyUnicode_AsMBCSString(unicode);
1151#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001152 else if (strcmp(encoding, "ascii") == 0)
1153 return PyUnicode_AsASCIIString(unicode);
1154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155
1156 /* Encode via the codec registry */
1157 v = PyCodec_Encode(unicode, encoding, errors);
1158 if (v == NULL)
1159 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001160 if (!PyBytes_Check(v)) {
1161 if (PyString_Check(v)) {
1162 /* Old codec, turn it into bytes */
1163 PyObject *b = PyBytes_FromObject(v);
1164 Py_DECREF(v);
1165 return b;
1166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001168 "encoder did not return a bytes object "
1169 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1170 v->ob_type->tp_name,
1171 encoding ? encoding : "NULL",
1172 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 Py_DECREF(v);
1174 goto onError;
1175 }
1176 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 onError:
1179 return NULL;
1180}
1181
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001182PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1183 const char *errors)
1184{
1185 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001186 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001187 if (v)
1188 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001189 if (errors != NULL)
1190 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Neal Norwitzab40b302007-08-12 17:21:38 +00001191 /* XXX(nnorwitz): errors will always be NULL due to the check above.
1192 Should this check and the else be removed since it's dead code?
1193 */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001194 if (errors == NULL) {
1195 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1196 PyUnicode_GET_SIZE(unicode),
1197 NULL);
1198 }
1199 else {
1200 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1201 }
1202 if (!b)
1203 return NULL;
1204 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1205 PyBytes_Size(b));
1206 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001207 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001208 return v;
1209}
1210
Martin v. Löwis5b222132007-06-10 09:51:05 +00001211char*
1212PyUnicode_AsString(PyObject *unicode)
1213{
1214 assert(PyUnicode_Check(unicode));
1215 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1216 if (!unicode)
1217 return NULL;
1218 return PyString_AsString(unicode);
1219}
1220
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1222{
1223 if (!PyUnicode_Check(unicode)) {
1224 PyErr_BadArgument();
1225 goto onError;
1226 }
1227 return PyUnicode_AS_UNICODE(unicode);
1228
1229 onError:
1230 return NULL;
1231}
1232
Martin v. Löwis18e16552006-02-15 17:27:45 +00001233Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234{
1235 if (!PyUnicode_Check(unicode)) {
1236 PyErr_BadArgument();
1237 goto onError;
1238 }
1239 return PyUnicode_GET_SIZE(unicode);
1240
1241 onError:
1242 return -1;
1243}
1244
Thomas Wouters78890102000-07-22 19:25:51 +00001245const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001246{
1247 return unicode_default_encoding;
1248}
1249
1250int PyUnicode_SetDefaultEncoding(const char *encoding)
1251{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001252 if (strcmp(encoding, unicode_default_encoding) != 0) {
1253 PyErr_Format(PyExc_ValueError,
1254 "Can only set default encoding to %s",
1255 unicode_default_encoding);
1256 return -1;
1257 }
Fred Drakee4315f52000-05-09 19:53:39 +00001258 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001259}
1260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261/* error handling callback helper:
1262 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001263 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001264 and adjust various state variables.
1265 return 0 on success, -1 on error
1266*/
1267
1268static
1269int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1270 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001271 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001274 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275
1276 PyObject *restuple = NULL;
1277 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001278 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001279 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t requiredsize;
1281 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001283 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001284 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001285 int res = -1;
1286
1287 if (*errorHandler == NULL) {
1288 *errorHandler = PyCodec_LookupError(errors);
1289 if (*errorHandler == NULL)
1290 goto onError;
1291 }
1292
1293 if (*exceptionObject == NULL) {
1294 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001295 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001296 if (*exceptionObject == NULL)
1297 goto onError;
1298 }
1299 else {
1300 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1301 goto onError;
1302 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1303 goto onError;
1304 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1305 goto onError;
1306 }
1307
1308 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1309 if (restuple == NULL)
1310 goto onError;
1311 if (!PyTuple_Check(restuple)) {
1312 PyErr_Format(PyExc_TypeError, &argparse[4]);
1313 goto onError;
1314 }
1315 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1316 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001317
1318 /* Copy back the bytes variables, which might have been modified by the
1319 callback */
1320 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1321 if (!inputobj)
1322 goto onError;
1323 if (!PyBytes_Check(inputobj)) {
1324 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1325 }
1326 *input = PyBytes_AS_STRING(inputobj);
1327 insize = PyBytes_GET_SIZE(inputobj);
1328 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001329 /* we can DECREF safely, as the exception has another reference,
1330 so the object won't go away. */
1331 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001334 newpos = insize+newpos;
1335 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001336 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001337 goto onError;
1338 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339
1340 /* need more space? (at least enough for what we
1341 have+the replacement+the rest of the string (starting
1342 at the new input position), so we won't have to check space
1343 when there are no errors in the rest of the string) */
1344 repptr = PyUnicode_AS_UNICODE(repunicode);
1345 repsize = PyUnicode_GET_SIZE(repunicode);
1346 requiredsize = *outpos + repsize + insize-newpos;
1347 if (requiredsize > outsize) {
1348 if (requiredsize<2*outsize)
1349 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001350 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351 goto onError;
1352 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1353 }
1354 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001355 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001356 Py_UNICODE_COPY(*outptr, repptr, repsize);
1357 *outptr += repsize;
1358 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001360 /* we made it! */
1361 res = 0;
1362
1363 onError:
1364 Py_XDECREF(restuple);
1365 return res;
1366}
1367
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001368/* --- UTF-7 Codec -------------------------------------------------------- */
1369
1370/* see RFC2152 for details */
1371
/* Classification table for the UTF-7 codec, indexed by character code
   (0..127); the meaning of each class value is listed inside the
   initializer below. */
Tim Petersced69f82003-09-16 20:30:58 +00001372static
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001373char utf7_special[128] = {
1374 /* indicate whether a UTF-7 character is special i.e. cannot be directly
1375 encoded:
1376 0 - not special
1377 1 - special
1378 2 - whitespace (optional)
1379 3 - RFC2152 Set O (optional) */
1380 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1381 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1382 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
1383 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
1384 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1385 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
1386 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1387 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
1388
1389};
1390
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001391/* Note: The comparison (c) <= 0 is a trick to work-around gcc
1392 warnings about the comparison always being false; since
1393 utf7_special[0] is 1, we can safely make that one comparison
1394 true */
1395
/* SPECIAL(c, encodeO, encodeWS): true when c is outside 1..127, is in
   class 1, is in class 2 while encodeWS is set, or is in class 3 while
   encodeO is set.  Evaluates c several times. */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001396#define SPECIAL(c, encodeO, encodeWS) \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001397 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001398 (encodeWS && (utf7_special[(c)] == 2)) || \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001399 (encodeO && (utf7_special[(c)] == 3)))
1400
/* B64(n): base64 alphabet character for the low 6 bits of n. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001401#define B64(n) \
1402 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is alphanumeric (per isalnum), '+' or '/'. */
1403#define B64CHAR(c) \
1404 (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): 6-bit value of base64 character c: '+' -> 62, '/' -> 63,
   c >= 'a' -> c - 71, c >= 'A' -> c - 65, anything else -> c + 4.
   NOTE(review): the offsets appear to assume ASCII character codes. */
1405#define UB64(c) \
1406 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
1407 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001408
/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write
   the base64 character for the topmost pending 6 bits to *out++ and
   reduce bits by 6.  Expands to a bare while loop, not an expression. */
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001409#define ENCODE(out, ch, bits) \
1410 while (bits >= 6) { \
1411 *out++ = B64(ch >> (bits-6)); \
1412 bits -= 6; \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001413 }
1414
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, extract the topmost pending 16-bit unit and reduce bits by 16.  If
   surrogate is set, the unit is dropped and the flag cleared; if the unit
   lies in 0xDC00..0xDFFF, surrogate is set, errmsg is assigned and control
   jumps to utf7Error; otherwise the unit is written to *out++.
   The expansion site must provide an `errmsg` variable and a `utf7Error`
   label. */
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001415#define DECODE(out, ch, bits, surrogate) \
1416 while (bits >= 16) { \
1417 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
1418 bits -= 16; \
1419 if (surrogate) { \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001420 /* We have already generated an error for the high surrogate \
1421 so let's not bother seeing if the low surrogate is correct or not */ \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001422 surrogate = 0; \
1423 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001424 /* This is a surrogate pair. Unfortunately we can't represent \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001425 it in a 16-bit character */ \
1426 surrogate = 1; \
1427 errmsg = "code pairs are not supported"; \
1428 goto utf7Error; \
1429 } else { \
1430 *out++ = outCh; \
1431 } \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001432 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001433
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001434PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001435 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001436 const char *errors)
1437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001438 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001439 Py_ssize_t startinpos;
1440 Py_ssize_t endinpos;
1441 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001442 const char *e;
1443 PyUnicodeObject *unicode;
1444 Py_UNICODE *p;
1445 const char *errmsg = "";
1446 int inShift = 0;
1447 unsigned int bitsleft = 0;
1448 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001449 int surrogate = 0;
1450 PyObject *errorHandler = NULL;
1451 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001452
1453 unicode = _PyUnicode_New(size);
1454 if (!unicode)
1455 return NULL;
1456 if (size == 0)
1457 return (PyObject *)unicode;
1458
1459 p = unicode->str;
1460 e = s + size;
1461
1462 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001463 Py_UNICODE ch;
1464 restart:
1465 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001466
1467 if (inShift) {
1468 if ((ch == '-') || !B64CHAR(ch)) {
1469 inShift = 0;
1470 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001471
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001472 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1473 if (bitsleft >= 6) {
1474 /* The shift sequence has a partial character in it. If
1475 bitsleft < 6 then we could just classify it as padding
1476 but that is not the case here */
1477
1478 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001479 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480 }
1481 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001482 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001483 here so indicate the potential of a misencoded character. */
1484
1485 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1486 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1487 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001488 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001489 }
1490
1491 if (ch == '-') {
1492 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001493 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001494 inShift = 1;
1495 }
1496 } else if (SPECIAL(ch,0,0)) {
1497 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001498 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001499 } else {
1500 *p++ = ch;
1501 }
1502 } else {
1503 charsleft = (charsleft << 6) | UB64(ch);
1504 bitsleft += 6;
1505 s++;
1506 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1507 }
1508 }
1509 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001511 s++;
1512 if (s < e && *s == '-') {
1513 s++;
1514 *p++ = '+';
1515 } else
1516 {
1517 inShift = 1;
1518 bitsleft = 0;
1519 }
1520 }
1521 else if (SPECIAL(ch,0,0)) {
1522 errmsg = "unexpected special character";
1523 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001524 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001525 }
1526 else {
1527 *p++ = ch;
1528 s++;
1529 }
1530 continue;
1531 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001532 outpos = p-PyUnicode_AS_UNICODE(unicode);
1533 endinpos = s-starts;
1534 if (unicode_decode_call_errorhandler(
1535 errors, &errorHandler,
1536 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001537 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 (PyObject **)&unicode, &outpos, &p))
1539 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001540 }
1541
1542 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001543 outpos = p-PyUnicode_AS_UNICODE(unicode);
1544 endinpos = size;
1545 if (unicode_decode_call_errorhandler(
1546 errors, &errorHandler,
1547 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001548 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 if (s < e)
1552 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 }
1554
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001555 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001556 goto onError;
1557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001558 Py_XDECREF(errorHandler);
1559 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001560 return (PyObject *)unicode;
1561
1562onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001563 Py_XDECREF(errorHandler);
1564 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 Py_DECREF(unicode);
1566 return NULL;
1567}
1568
1569
1570PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001571 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 int encodeSetO,
1573 int encodeWhiteSpace,
1574 const char *errors)
1575{
1576 PyObject *v;
1577 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 unsigned int bitsleft = 0;
1582 unsigned long charsleft = 0;
1583 char * out;
1584 char * start;
1585
1586 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001587 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588
Walter Dörwald51ab4142007-05-05 14:43:36 +00001589 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590 if (v == NULL)
1591 return NULL;
1592
Walter Dörwald51ab4142007-05-05 14:43:36 +00001593 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 for (;i < size; ++i) {
1595 Py_UNICODE ch = s[i];
1596
1597 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001598 if (ch == '+') {
1599 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001600 *out++ = '-';
1601 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1602 charsleft = ch;
1603 bitsleft = 16;
1604 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001605 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001607 } else {
1608 *out++ = (char) ch;
1609 }
1610 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1612 *out++ = B64(charsleft << (6-bitsleft));
1613 charsleft = 0;
1614 bitsleft = 0;
1615 /* Characters not in the BASE64 set implicitly unshift the sequence
1616 so no '-' is required, except if the character is itself a '-' */
1617 if (B64CHAR(ch) || ch == '-') {
1618 *out++ = '-';
1619 }
1620 inShift = 0;
1621 *out++ = (char) ch;
1622 } else {
1623 bitsleft += 16;
1624 charsleft = (charsleft << 16) | ch;
1625 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1626
1627 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001628 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001629 or '-' then the shift sequence will be terminated implicitly and we
1630 don't have to insert a '-'. */
1631
1632 if (bitsleft == 0) {
1633 if (i + 1 < size) {
1634 Py_UNICODE ch2 = s[i+1];
1635
1636 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001637
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 } else if (B64CHAR(ch2) || ch2 == '-') {
1639 *out++ = '-';
1640 inShift = 0;
1641 } else {
1642 inShift = 0;
1643 }
1644
1645 }
1646 else {
1647 *out++ = '-';
1648 inShift = 0;
1649 }
1650 }
Tim Petersced69f82003-09-16 20:30:58 +00001651 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001653 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 if (bitsleft) {
1655 *out++= B64(charsleft << (6-bitsleft) );
1656 *out++ = '-';
1657 }
1658
Walter Dörwald51ab4142007-05-05 14:43:36 +00001659 if (PyBytes_Resize(v, out - start)) {
1660 Py_DECREF(v);
1661 return NULL;
1662 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 return v;
1664}
1665
1666#undef SPECIAL
1667#undef B64
1668#undef B64CHAR
1669#undef UB64
1670#undef ENCODE
1671#undef DECODE
1672
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673/* --- UTF-8 Codec -------------------------------------------------------- */
1674
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length of that sequence in bytes.  An entry of zero marks a
   byte that cannot start a sequence.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
1696
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001698 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 const char *errors)
1700{
Walter Dörwald69652032004-09-07 20:24:22 +00001701 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1702}
1703
1704PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001705 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001706 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001707 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001708{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001709 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001711 Py_ssize_t startinpos;
1712 Py_ssize_t endinpos;
1713 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 const char *e;
1715 PyUnicodeObject *unicode;
1716 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001717 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001718 PyObject *errorHandler = NULL;
1719 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
1721 /* Note: size will always be longer than the resulting Unicode
1722 character count */
1723 unicode = _PyUnicode_New(size);
1724 if (!unicode)
1725 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001726 if (size == 0) {
1727 if (consumed)
1728 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
1732 /* Unpack UTF-8 encoded data */
1733 p = unicode->str;
1734 e = s + size;
1735
1736 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001737 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738
1739 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001740 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 s++;
1742 continue;
1743 }
1744
1745 n = utf8_code_length[ch];
1746
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001747 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001748 if (consumed)
1749 break;
1750 else {
1751 errmsg = "unexpected end of data";
1752 startinpos = s-starts;
1753 endinpos = size;
1754 goto utf8Error;
1755 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757
1758 switch (n) {
1759
1760 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001761 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001762 startinpos = s-starts;
1763 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001764 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765
1766 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 startinpos = s-starts;
1769 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001770 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771
1772 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001773 if ((s[1] & 0xc0) != 0x80) {
1774 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 startinpos = s-starts;
1776 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001777 goto utf8Error;
1778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 startinpos = s-starts;
1782 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001783 errmsg = "illegal encoding";
1784 goto utf8Error;
1785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001787 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 break;
1789
1790 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001791 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001792 (s[2] & 0xc0) != 0x80) {
1793 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 startinpos = s-starts;
1795 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001796 goto utf8Error;
1797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001799 if (ch < 0x0800) {
1800 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001801 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001802
1803 XXX For wide builds (UCS-4) we should probably try
1804 to recombine the surrogates into a single code
1805 unit.
1806 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 startinpos = s-starts;
1809 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001810 goto utf8Error;
1811 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001813 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001814 break;
1815
1816 case 4:
1817 if ((s[1] & 0xc0) != 0x80 ||
1818 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001819 (s[3] & 0xc0) != 0x80) {
1820 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001821 startinpos = s-starts;
1822 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001823 goto utf8Error;
1824 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001825 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1826 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1827 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001828 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001829 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001830 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001831 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001832 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001833 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001834 startinpos = s-starts;
1835 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001836 goto utf8Error;
1837 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001838#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001839 *p++ = (Py_UNICODE)ch;
1840#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001841 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001842
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001843 /* translate from 10000..10FFFF to 0..FFFF */
1844 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001845
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001846 /* high surrogate = top 10 bits added to D800 */
1847 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001848
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001849 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001850 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001851#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 break;
1853
1854 default:
1855 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 startinpos = s-starts;
1858 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001859 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 }
1861 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001862 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001863
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001864 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001865 outpos = p-PyUnicode_AS_UNICODE(unicode);
1866 if (unicode_decode_call_errorhandler(
1867 errors, &errorHandler,
1868 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001869 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 (PyObject **)&unicode, &outpos, &p))
1871 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 }
Walter Dörwald69652032004-09-07 20:24:22 +00001873 if (consumed)
1874 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875
1876 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001877 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 goto onError;
1879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001880 Py_XDECREF(errorHandler);
1881 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 return (PyObject *)unicode;
1883
1884onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 Py_XDECREF(errorHandler);
1886 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 Py_DECREF(unicode);
1888 return NULL;
1889}
1890
Tim Peters602f7402002-04-27 18:03:26 +00001891/* Allocation strategy: if the string is short, convert into a stack buffer
1892 and allocate exactly as much space needed at the end. Else allocate the
1893 maximum possible needed (4 result bytes per Unicode character), and return
1894 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001895*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001896PyObject *
1897PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001898 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001899 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900{
Tim Peters602f7402002-04-27 18:03:26 +00001901#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001902
Martin v. Löwis18e16552006-02-15 17:27:45 +00001903 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001904 PyObject *v; /* result string object */
1905 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001906 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001907 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001908 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001909
Tim Peters602f7402002-04-27 18:03:26 +00001910 assert(s != NULL);
1911 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912
Tim Peters602f7402002-04-27 18:03:26 +00001913 if (size <= MAX_SHORT_UNICHARS) {
1914 /* Write into the stack buffer; nallocated can't overflow.
1915 * At the end, we'll allocate exactly as much heap space as it
1916 * turns out we need.
1917 */
1918 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1919 v = NULL; /* will allocate after we're done */
1920 p = stackbuf;
1921 }
1922 else {
1923 /* Overallocate on the heap, and give the excess back at the end. */
1924 nallocated = size * 4;
1925 if (nallocated / 4 != size) /* overflow! */
1926 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001927 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001928 if (v == NULL)
1929 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001930 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001931 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001932
Tim Peters602f7402002-04-27 18:03:26 +00001933 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001934 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001935
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001936 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001937 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001939
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001941 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001942 *p++ = (char)(0xc0 | (ch >> 6));
1943 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001944 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001945 else {
Tim Peters602f7402002-04-27 18:03:26 +00001946 /* Encode UCS2 Unicode ordinals */
1947 if (ch < 0x10000) {
1948 /* Special case: check for high surrogate */
1949 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1950 Py_UCS4 ch2 = s[i];
1951 /* Check for low surrogate and combine the two to
1952 form a UCS4 value */
1953 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001954 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001955 i++;
1956 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001957 }
Tim Peters602f7402002-04-27 18:03:26 +00001958 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001959 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001960 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001961 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1962 *p++ = (char)(0x80 | (ch & 0x3f));
1963 continue;
1964 }
1965encodeUCS4:
1966 /* Encode UCS4 Unicode ordinals */
1967 *p++ = (char)(0xf0 | (ch >> 18));
1968 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1969 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1970 *p++ = (char)(0x80 | (ch & 0x3f));
1971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001973
Tim Peters602f7402002-04-27 18:03:26 +00001974 if (v == NULL) {
1975 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001976 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001977 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001978 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001979 }
1980 else {
1981 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001982 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001983 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001984 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001987
Tim Peters602f7402002-04-27 18:03:26 +00001988#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989}
1990
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1992{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 if (!PyUnicode_Check(unicode)) {
1994 PyErr_BadArgument();
1995 return NULL;
1996 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001997 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1998 PyUnicode_GET_SIZE(unicode),
1999 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000}
2001
2002/* --- UTF-16 Codec ------------------------------------------------------- */
2003
Tim Peters772747b2001-08-09 22:21:55 +00002004PyObject *
2005PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002007 const char *errors,
2008 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009{
Walter Dörwald69652032004-09-07 20:24:22 +00002010 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2011}
2012
2013PyObject *
2014PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002015 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002016 const char *errors,
2017 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002018 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002019{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002021 Py_ssize_t startinpos;
2022 Py_ssize_t endinpos;
2023 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024 PyUnicodeObject *unicode;
2025 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002026 const unsigned char *q, *e;
2027 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002028 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002029 /* Offsets from q for retrieving byte pairs in the right order. */
2030#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2031 int ihi = 1, ilo = 0;
2032#else
2033 int ihi = 0, ilo = 1;
2034#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 PyObject *errorHandler = NULL;
2036 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037
2038 /* Note: size will always be longer than the resulting Unicode
2039 character count */
2040 unicode = _PyUnicode_New(size);
2041 if (!unicode)
2042 return NULL;
2043 if (size == 0)
2044 return (PyObject *)unicode;
2045
2046 /* Unpack UTF-16 encoded data */
2047 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002048 q = (unsigned char *)s;
2049 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
2051 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002052 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002054 /* Check for BOM marks (U+FEFF) in the input and adjust current
2055 byte order setting accordingly. In native mode, the leading BOM
2056 mark is skipped, in all other modes, it is copied to the output
2057 stream as-is (giving a ZWNBSP character). */
2058 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002059 if (size >= 2) {
2060 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002061#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002062 if (bom == 0xFEFF) {
2063 q += 2;
2064 bo = -1;
2065 }
2066 else if (bom == 0xFFFE) {
2067 q += 2;
2068 bo = 1;
2069 }
Tim Petersced69f82003-09-16 20:30:58 +00002070#else
Walter Dörwald69652032004-09-07 20:24:22 +00002071 if (bom == 0xFEFF) {
2072 q += 2;
2073 bo = 1;
2074 }
2075 else if (bom == 0xFFFE) {
2076 q += 2;
2077 bo = -1;
2078 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002079#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002080 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Tim Peters772747b2001-08-09 22:21:55 +00002083 if (bo == -1) {
2084 /* force LE */
2085 ihi = 1;
2086 ilo = 0;
2087 }
2088 else if (bo == 1) {
2089 /* force BE */
2090 ihi = 0;
2091 ilo = 1;
2092 }
2093
2094 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002096 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002097 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002098 if (consumed)
2099 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100 errmsg = "truncated data";
2101 startinpos = ((const char *)q)-starts;
2102 endinpos = ((const char *)e)-starts;
2103 goto utf16Error;
2104 /* The remaining input chars are ignored if the callback
2105 chooses to skip the input */
2106 }
2107 ch = (q[ihi] << 8) | q[ilo];
2108
Tim Peters772747b2001-08-09 22:21:55 +00002109 q += 2;
2110
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 if (ch < 0xD800 || ch > 0xDFFF) {
2112 *p++ = ch;
2113 continue;
2114 }
2115
2116 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002117 if (q >= e) {
2118 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 startinpos = (((const char *)q)-2)-starts;
2120 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002121 goto utf16Error;
2122 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002123 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002124 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2125 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002126 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002127#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002128 *p++ = ch;
2129 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002130#else
2131 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002132#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002133 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002134 }
2135 else {
2136 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 startinpos = (((const char *)q)-4)-starts;
2138 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002139 goto utf16Error;
2140 }
2141
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002143 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002144 startinpos = (((const char *)q)-2)-starts;
2145 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002146 /* Fall through to report the error */
2147
2148 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002149 outpos = p-PyUnicode_AS_UNICODE(unicode);
2150 if (unicode_decode_call_errorhandler(
2151 errors, &errorHandler,
2152 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002153 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002154 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 }
2157
2158 if (byteorder)
2159 *byteorder = bo;
2160
Walter Dörwald69652032004-09-07 20:24:22 +00002161 if (consumed)
2162 *consumed = (const char *)q-starts;
2163
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002165 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166 goto onError;
2167
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002168 Py_XDECREF(errorHandler);
2169 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 return (PyObject *)unicode;
2171
2172onError:
2173 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 Py_XDECREF(errorHandler);
2175 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 return NULL;
2177}
2178
Tim Peters772747b2001-08-09 22:21:55 +00002179PyObject *
2180PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002181 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002182 const char *errors,
2183 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184{
2185 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002186 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002187#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002188 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002189#else
2190 const int pairs = 0;
2191#endif
Tim Peters772747b2001-08-09 22:21:55 +00002192 /* Offsets from p for storing byte pairs in the right order. */
2193#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2194 int ihi = 1, ilo = 0;
2195#else
2196 int ihi = 0, ilo = 1;
2197#endif
2198
2199#define STORECHAR(CH) \
2200 do { \
2201 p[ihi] = ((CH) >> 8) & 0xff; \
2202 p[ilo] = (CH) & 0xff; \
2203 p += 2; \
2204 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002206#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002207 for (i = pairs = 0; i < size; i++)
2208 if (s[i] >= 0x10000)
2209 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002210#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002211 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002212 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 if (v == NULL)
2214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215
Walter Dörwald3cc34522007-05-04 10:48:27 +00002216 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002218 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002219 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002220 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002221
2222 if (byteorder == -1) {
2223 /* force LE */
2224 ihi = 1;
2225 ilo = 0;
2226 }
2227 else if (byteorder == 1) {
2228 /* force BE */
2229 ihi = 0;
2230 ilo = 1;
2231 }
2232
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002233 while (size-- > 0) {
2234 Py_UNICODE ch = *s++;
2235 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002236#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002237 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002238 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2239 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002240 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002241#endif
Tim Peters772747b2001-08-09 22:21:55 +00002242 STORECHAR(ch);
2243 if (ch2)
2244 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002247#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248}
2249
2250PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2251{
2252 if (!PyUnicode_Check(unicode)) {
2253 PyErr_BadArgument();
2254 return NULL;
2255 }
2256 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2257 PyUnicode_GET_SIZE(unicode),
2258 NULL,
2259 0);
2260}
2261
2262/* --- Unicode Escape Codec ----------------------------------------------- */
2263
Fredrik Lundh06d12682001-01-24 07:59:11 +00002264static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002265
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002267 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 const char *errors)
2269{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002271 Py_ssize_t startinpos;
2272 Py_ssize_t endinpos;
2273 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002278 char* message;
2279 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002280 PyObject *errorHandler = NULL;
2281 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002282
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 /* Escaped strings will always be longer than the resulting
2284 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002285 length after conversion to the true value.
2286 (but if the error callback returns a long replacement string
2287 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 v = _PyUnicode_New(size);
2289 if (v == NULL)
2290 goto onError;
2291 if (size == 0)
2292 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002296
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 while (s < end) {
2298 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002299 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002300 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 /* Non-escape characters are interpreted as Unicode ordinals */
2303 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002304 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 continue;
2306 }
2307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002308 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 /* \ - Escapes */
2310 s++;
2311 switch (*s++) {
2312
2313 /* \x escapes */
2314 case '\n': break;
2315 case '\\': *p++ = '\\'; break;
2316 case '\'': *p++ = '\''; break;
2317 case '\"': *p++ = '\"'; break;
2318 case 'b': *p++ = '\b'; break;
2319 case 'f': *p++ = '\014'; break; /* FF */
2320 case 't': *p++ = '\t'; break;
2321 case 'n': *p++ = '\n'; break;
2322 case 'r': *p++ = '\r'; break;
2323 case 'v': *p++ = '\013'; break; /* VT */
2324 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2325
2326 /* \OOO (octal) escapes */
2327 case '0': case '1': case '2': case '3':
2328 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002329 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002331 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002332 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002333 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002335 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336 break;
2337
Fredrik Lundhccc74732001-02-18 22:13:49 +00002338 /* hex escapes */
2339 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002341 digits = 2;
2342 message = "truncated \\xXX escape";
2343 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344
Fredrik Lundhccc74732001-02-18 22:13:49 +00002345 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002347 digits = 4;
2348 message = "truncated \\uXXXX escape";
2349 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350
Fredrik Lundhccc74732001-02-18 22:13:49 +00002351 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002352 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002353 digits = 8;
2354 message = "truncated \\UXXXXXXXX escape";
2355 hexescape:
2356 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002357 outpos = p-PyUnicode_AS_UNICODE(v);
2358 if (s+digits>end) {
2359 endinpos = size;
2360 if (unicode_decode_call_errorhandler(
2361 errors, &errorHandler,
2362 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002363 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002364 (PyObject **)&v, &outpos, &p))
2365 goto onError;
2366 goto nextByte;
2367 }
2368 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002369 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002370 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 endinpos = (s+i+1)-starts;
2372 if (unicode_decode_call_errorhandler(
2373 errors, &errorHandler,
2374 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002375 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002376 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002377 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002378 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002379 }
2380 chr = (chr<<4) & ~0xF;
2381 if (c >= '0' && c <= '9')
2382 chr += c - '0';
2383 else if (c >= 'a' && c <= 'f')
2384 chr += 10 + c - 'a';
2385 else
2386 chr += 10 + c - 'A';
2387 }
2388 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002389 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002390 /* _decoding_error will have already written into the
2391 target buffer. */
2392 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002393 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002394 /* when we get here, chr is a 32-bit unicode character */
2395 if (chr <= 0xffff)
2396 /* UCS-2 character */
2397 *p++ = (Py_UNICODE) chr;
2398 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002399 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002400 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002401#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002402 *p++ = chr;
2403#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002404 chr -= 0x10000L;
2405 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002406 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002407#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002408 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002409 endinpos = s-starts;
2410 outpos = p-PyUnicode_AS_UNICODE(v);
2411 if (unicode_decode_call_errorhandler(
2412 errors, &errorHandler,
2413 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002414 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002415 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002416 goto onError;
2417 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002418 break;
2419
2420 /* \N{name} */
2421 case 'N':
2422 message = "malformed \\N character escape";
2423 if (ucnhash_CAPI == NULL) {
2424 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002425 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002426 m = PyImport_ImportModule("unicodedata");
2427 if (m == NULL)
2428 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002429 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002430 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002431 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002432 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002433 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002434 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002435 if (ucnhash_CAPI == NULL)
2436 goto ucnhashError;
2437 }
2438 if (*s == '{') {
2439 const char *start = s+1;
2440 /* look for the closing brace */
2441 while (*s != '}' && s < end)
2442 s++;
2443 if (s > start && s < end && *s == '}') {
2444 /* found a name. look it up in the unicode database */
2445 message = "unknown Unicode character name";
2446 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002447 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002448 goto store;
2449 }
2450 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002451 endinpos = s-starts;
2452 outpos = p-PyUnicode_AS_UNICODE(v);
2453 if (unicode_decode_call_errorhandler(
2454 errors, &errorHandler,
2455 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002456 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002458 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002459 break;
2460
2461 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002462 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 message = "\\ at end of string";
2464 s--;
2465 endinpos = s-starts;
2466 outpos = p-PyUnicode_AS_UNICODE(v);
2467 if (unicode_decode_call_errorhandler(
2468 errors, &errorHandler,
2469 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002470 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002471 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002472 goto onError;
2473 }
2474 else {
2475 *p++ = '\\';
2476 *p++ = (unsigned char)s[-1];
2477 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002478 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 nextByte:
2481 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002483 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002485 Py_XDECREF(errorHandler);
2486 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002488
Fredrik Lundhccc74732001-02-18 22:13:49 +00002489ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002490 PyErr_SetString(
2491 PyExc_UnicodeError,
2492 "\\N escapes not supported (can't load unicodedata module)"
2493 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002494 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495 Py_XDECREF(errorHandler);
2496 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002497 return NULL;
2498
Fredrik Lundhccc74732001-02-18 22:13:49 +00002499onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002501 Py_XDECREF(errorHandler);
2502 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 return NULL;
2504}
2505
2506/* Return a Unicode-Escape string version of the Unicode object.
2507
2508 If quotes is true, the string is enclosed in u"" or u'' quotes as
2509 appropriate.
2510
2511*/
2512
Thomas Wouters477c8d52006-05-27 19:21:47 +00002513Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2514 Py_ssize_t size,
2515 Py_UNICODE ch)
2516{
2517 /* like wcschr, but doesn't stop at NULL characters */
2518
2519 while (size-- > 0) {
2520 if (*s == ch)
2521 return s;
2522 s++;
2523 }
2524
2525 return NULL;
2526}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002527
Walter Dörwald79e913e2007-05-12 11:08:06 +00002528static const char *hexdigits = "0123456789abcdef";
2529
2530PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2531 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532{
2533 PyObject *repr;
2534 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535
Thomas Wouters89f507f2006-12-13 04:49:30 +00002536 /* XXX(nnorwitz): rather than over-allocating, it would be
2537 better to choose a different scheme. Perhaps scan the
2538 first N-chars of the string and allocate based on that size.
2539 */
2540 /* Initial allocation is based on the longest-possible unichr
2541 escape.
2542
2543 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2544 unichr, so in this case it's the longest unichr escape. In
2545 narrow (UTF-16) builds this is five chars per source unichr
2546 since there are two unichrs in the surrogate pair, so in narrow
2547 (UTF-16) builds it's not the longest unichr escape.
2548
2549 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2550 so in the narrow (UTF-16) build case it's the longest unichr
2551 escape.
2552 */
2553
Walter Dörwald79e913e2007-05-12 11:08:06 +00002554 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002555#ifdef Py_UNICODE_WIDE
2556 + 10*size
2557#else
2558 + 6*size
2559#endif
2560 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 if (repr == NULL)
2562 return NULL;
2563
Walter Dörwald79e913e2007-05-12 11:08:06 +00002564 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 while (size-- > 0) {
2567 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002568
Walter Dörwald79e913e2007-05-12 11:08:06 +00002569 /* Escape backslashes */
2570 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 *p++ = '\\';
2572 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002573 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002574 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002575
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002576#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002577 /* Map 21-bit characters to '\U00xxxxxx' */
2578 else if (ch >= 0x10000) {
2579 *p++ = '\\';
2580 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002581 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2582 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2583 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2584 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2585 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2586 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2587 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2588 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002589 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002590 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002591#else
2592 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002593 else if (ch >= 0xD800 && ch < 0xDC00) {
2594 Py_UNICODE ch2;
2595 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002596
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002597 ch2 = *s++;
2598 size--;
2599 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2600 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2601 *p++ = '\\';
2602 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002603 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2604 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2605 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2606 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2607 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2608 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2609 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2610 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002611 continue;
2612 }
2613 /* Fall through: isolated surrogates are copied as-is */
2614 s--;
2615 size++;
2616 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002617#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002618
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002620 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 *p++ = '\\';
2622 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002623 *p++ = hexdigits[(ch >> 12) & 0x000F];
2624 *p++ = hexdigits[(ch >> 8) & 0x000F];
2625 *p++ = hexdigits[(ch >> 4) & 0x000F];
2626 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002628
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002629 /* Map special whitespace to '\t', \n', '\r' */
2630 else if (ch == '\t') {
2631 *p++ = '\\';
2632 *p++ = 't';
2633 }
2634 else if (ch == '\n') {
2635 *p++ = '\\';
2636 *p++ = 'n';
2637 }
2638 else if (ch == '\r') {
2639 *p++ = '\\';
2640 *p++ = 'r';
2641 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002642
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002643 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002644 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002646 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002647 *p++ = hexdigits[(ch >> 4) & 0x000F];
2648 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002649 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002650
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 /* Copy everything else as-is */
2652 else
2653 *p++ = (char) ch;
2654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655
2656 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002657 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2658 Py_DECREF(repr);
2659 return NULL;
2660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 return repr;
2662}
2663
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2665{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002666 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667 if (!PyUnicode_Check(unicode)) {
2668 PyErr_BadArgument();
2669 return NULL;
2670 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002671 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2672 PyUnicode_GET_SIZE(unicode));
2673
2674 if (!s)
2675 return NULL;
2676 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2677 PyBytes_GET_SIZE(s));
2678 Py_DECREF(s);
2679 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680}
2681
2682/* --- Raw Unicode Escape Codec ------------------------------------------- */
2683
2684PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002685 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 const char *errors)
2687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002688 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002689 Py_ssize_t startinpos;
2690 Py_ssize_t endinpos;
2691 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 const char *end;
2695 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002696 PyObject *errorHandler = NULL;
2697 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 /* Escaped strings will always be longer than the resulting
2700 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 length after conversion to the true value. (But decoding error
2702 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 v = _PyUnicode_New(size);
2704 if (v == NULL)
2705 goto onError;
2706 if (size == 0)
2707 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002708 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 end = s + size;
2710 while (s < end) {
2711 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002712 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002714 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715
2716 /* Non-escape characters are interpreted as Unicode ordinals */
2717 if (*s != '\\') {
2718 *p++ = (unsigned char)*s++;
2719 continue;
2720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722
2723 /* \u-escapes are only interpreted iff the number of leading
2724 backslashes if odd */
2725 bs = s;
2726 for (;s < end;) {
2727 if (*s != '\\')
2728 break;
2729 *p++ = (unsigned char)*s++;
2730 }
2731 if (((s - bs) & 1) == 0 ||
2732 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002733 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 continue;
2735 }
2736 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002737 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 s++;
2739
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002740 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002742 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 endinpos = s-starts;
2746 if (unicode_decode_call_errorhandler(
2747 errors, &errorHandler,
2748 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002749 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 }
2754 x = (x<<4) & ~0xF;
2755 if (c >= '0' && c <= '9')
2756 x += c - '0';
2757 else if (c >= 'a' && c <= 'f')
2758 x += 10 + c - 'a';
2759 else
2760 x += 10 + c - 'A';
2761 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002762#ifndef Py_UNICODE_WIDE
2763 if (x > 0x10000) {
2764 if (unicode_decode_call_errorhandler(
2765 errors, &errorHandler,
2766 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002767 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002768 (PyObject **)&v, &outpos, &p))
2769 goto onError;
2770 }
2771#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 *p++ = x;
2773 nextByte:
2774 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002776 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002777 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002778 Py_XDECREF(errorHandler);
2779 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002781
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 onError:
2783 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 Py_XDECREF(errorHandler);
2785 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 return NULL;
2787}
2788
2789PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002790 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791{
2792 PyObject *repr;
2793 char *p;
2794 char *q;
2795
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002796#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002797 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002798#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002799 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002800#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 if (repr == NULL)
2802 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002803 if (size == 0)
2804 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805
Walter Dörwald711005d2007-05-12 12:03:26 +00002806 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 while (size-- > 0) {
2808 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002809#ifdef Py_UNICODE_WIDE
2810 /* Map 32-bit characters to '\Uxxxxxxxx' */
2811 if (ch >= 0x10000) {
2812 *p++ = '\\';
2813 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002814 *p++ = hexdigits[(ch >> 28) & 0xf];
2815 *p++ = hexdigits[(ch >> 24) & 0xf];
2816 *p++ = hexdigits[(ch >> 20) & 0xf];
2817 *p++ = hexdigits[(ch >> 16) & 0xf];
2818 *p++ = hexdigits[(ch >> 12) & 0xf];
2819 *p++ = hexdigits[(ch >> 8) & 0xf];
2820 *p++ = hexdigits[(ch >> 4) & 0xf];
2821 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002822 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002823 else
2824#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 /* Map 16-bit characters to '\uxxxx' */
2826 if (ch >= 256) {
2827 *p++ = '\\';
2828 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002829 *p++ = hexdigits[(ch >> 12) & 0xf];
2830 *p++ = hexdigits[(ch >> 8) & 0xf];
2831 *p++ = hexdigits[(ch >> 4) & 0xf];
2832 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 }
2834 /* Copy everything else as-is */
2835 else
2836 *p++ = (char) ch;
2837 }
2838 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002839 if (PyBytes_Resize(repr, p - q)) {
2840 Py_DECREF(repr);
2841 return NULL;
2842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 return repr;
2844}
2845
2846PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2847{
Walter Dörwald711005d2007-05-12 12:03:26 +00002848 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002850 PyErr_BadArgument();
2851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002853 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2854 PyUnicode_GET_SIZE(unicode));
2855
2856 if (!s)
2857 return NULL;
2858 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2859 PyBytes_GET_SIZE(s));
2860 Py_DECREF(s);
2861 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862}
2863
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002864/* --- Unicode Internal Codec ------------------------------------------- */
2865
2866PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002867 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002868 const char *errors)
2869{
2870 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002871 Py_ssize_t startinpos;
2872 Py_ssize_t endinpos;
2873 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002874 PyUnicodeObject *v;
2875 Py_UNICODE *p;
2876 const char *end;
2877 const char *reason;
2878 PyObject *errorHandler = NULL;
2879 PyObject *exc = NULL;
2880
Neal Norwitzd43069c2006-01-08 01:12:10 +00002881#ifdef Py_UNICODE_WIDE
2882 Py_UNICODE unimax = PyUnicode_GetMax();
2883#endif
2884
Thomas Wouters89f507f2006-12-13 04:49:30 +00002885 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002886 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2887 if (v == NULL)
2888 goto onError;
2889 if (PyUnicode_GetSize((PyObject *)v) == 0)
2890 return (PyObject *)v;
2891 p = PyUnicode_AS_UNICODE(v);
2892 end = s + size;
2893
2894 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002895 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002896 /* We have to sanity check the raw data, otherwise doom looms for
2897 some malformed UCS-4 data. */
2898 if (
2899 #ifdef Py_UNICODE_WIDE
2900 *p > unimax || *p < 0 ||
2901 #endif
2902 end-s < Py_UNICODE_SIZE
2903 )
2904 {
2905 startinpos = s - starts;
2906 if (end-s < Py_UNICODE_SIZE) {
2907 endinpos = end-starts;
2908 reason = "truncated input";
2909 }
2910 else {
2911 endinpos = s - starts + Py_UNICODE_SIZE;
2912 reason = "illegal code point (> 0x10FFFF)";
2913 }
2914 outpos = p - PyUnicode_AS_UNICODE(v);
2915 if (unicode_decode_call_errorhandler(
2916 errors, &errorHandler,
2917 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002918 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002919 (PyObject **)&v, &outpos, &p)) {
2920 goto onError;
2921 }
2922 }
2923 else {
2924 p++;
2925 s += Py_UNICODE_SIZE;
2926 }
2927 }
2928
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002929 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002930 goto onError;
2931 Py_XDECREF(errorHandler);
2932 Py_XDECREF(exc);
2933 return (PyObject *)v;
2934
2935 onError:
2936 Py_XDECREF(v);
2937 Py_XDECREF(errorHandler);
2938 Py_XDECREF(exc);
2939 return NULL;
2940}
2941
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942/* --- Latin-1 Codec ------------------------------------------------------ */
2943
2944PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002945 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 const char *errors)
2947{
2948 PyUnicodeObject *v;
2949 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002950
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002952 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002953 Py_UNICODE r = *(unsigned char*)s;
2954 return PyUnicode_FromUnicode(&r, 1);
2955 }
2956
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 v = _PyUnicode_New(size);
2958 if (v == NULL)
2959 goto onError;
2960 if (size == 0)
2961 return (PyObject *)v;
2962 p = PyUnicode_AS_UNICODE(v);
2963 while (size-- > 0)
2964 *p++ = (unsigned char)*s++;
2965 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 onError:
2968 Py_XDECREF(v);
2969 return NULL;
2970}
2971
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972/* create or adjust a UnicodeEncodeError */
2973static void make_encode_exception(PyObject **exceptionObject,
2974 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002975 const Py_UNICODE *unicode, Py_ssize_t size,
2976 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 if (*exceptionObject == NULL) {
2980 *exceptionObject = PyUnicodeEncodeError_Create(
2981 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
2983 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2985 goto onError;
2986 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2987 goto onError;
2988 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2989 goto onError;
2990 return;
2991 onError:
2992 Py_DECREF(*exceptionObject);
2993 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 }
2995}
2996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997/* raises a UnicodeEncodeError */
2998static void raise_encode_exception(PyObject **exceptionObject,
2999 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003000 const Py_UNICODE *unicode, Py_ssize_t size,
3001 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002 const char *reason)
3003{
3004 make_encode_exception(exceptionObject,
3005 encoding, unicode, size, startpos, endpos, reason);
3006 if (*exceptionObject != NULL)
3007 PyCodec_StrictErrors(*exceptionObject);
3008}
3009
3010/* error handling callback helper:
3011 build arguments, call the callback and check the arguments,
3012 put the result into newpos and return the replacement string, which
3013 has to be freed by the caller */
3014static PyObject *unicode_encode_call_errorhandler(const char *errors,
3015 PyObject **errorHandler,
3016 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003017 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3018 Py_ssize_t startpos, Py_ssize_t endpos,
3019 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003021 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022
3023 PyObject *restuple;
3024 PyObject *resunicode;
3025
3026 if (*errorHandler == NULL) {
3027 *errorHandler = PyCodec_LookupError(errors);
3028 if (*errorHandler == NULL)
3029 return NULL;
3030 }
3031
3032 make_encode_exception(exceptionObject,
3033 encoding, unicode, size, startpos, endpos, reason);
3034 if (*exceptionObject == NULL)
3035 return NULL;
3036
3037 restuple = PyObject_CallFunctionObjArgs(
3038 *errorHandler, *exceptionObject, NULL);
3039 if (restuple == NULL)
3040 return NULL;
3041 if (!PyTuple_Check(restuple)) {
3042 PyErr_Format(PyExc_TypeError, &argparse[4]);
3043 Py_DECREF(restuple);
3044 return NULL;
3045 }
3046 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3047 &resunicode, newpos)) {
3048 Py_DECREF(restuple);
3049 return NULL;
3050 }
3051 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003052 *newpos = size+*newpos;
3053 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003054 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003055 Py_DECREF(restuple);
3056 return NULL;
3057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 Py_INCREF(resunicode);
3059 Py_DECREF(restuple);
3060 return resunicode;
3061}
3062
3063static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003064 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 const char *errors,
3066 int limit)
3067{
3068 /* output object */
3069 PyObject *res;
3070 /* pointers to the beginning and end+1 of input */
3071 const Py_UNICODE *startp = p;
3072 const Py_UNICODE *endp = p + size;
3073 /* pointer to the beginning of the unencodable characters */
3074 /* const Py_UNICODE *badp = NULL; */
3075 /* pointer into the output */
3076 char *str;
3077 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003078 Py_ssize_t respos = 0;
3079 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003080 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3081 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082 PyObject *errorHandler = NULL;
3083 PyObject *exc = NULL;
3084 /* the following variable is used for caching string comparisons
3085 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3086 int known_errorHandler = -1;
3087
3088 /* allocate enough for a simple encoding without
3089 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003090 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 if (res == NULL)
3092 goto onError;
3093 if (size == 0)
3094 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003095 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003096 ressize = size;
3097
3098 while (p<endp) {
3099 Py_UNICODE c = *p;
3100
3101 /* can we encode this? */
3102 if (c<limit) {
3103 /* no overflow check, because we know that the space is enough */
3104 *str++ = (char)c;
3105 ++p;
3106 }
3107 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003108 Py_ssize_t unicodepos = p-startp;
3109 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003111 Py_ssize_t repsize;
3112 Py_ssize_t newpos;
3113 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 Py_UNICODE *uni2;
3115 /* startpos for collecting unencodable chars */
3116 const Py_UNICODE *collstart = p;
3117 const Py_UNICODE *collend = p;
3118 /* find all unecodable characters */
3119 while ((collend < endp) && ((*collend)>=limit))
3120 ++collend;
3121 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3122 if (known_errorHandler==-1) {
3123 if ((errors==NULL) || (!strcmp(errors, "strict")))
3124 known_errorHandler = 1;
3125 else if (!strcmp(errors, "replace"))
3126 known_errorHandler = 2;
3127 else if (!strcmp(errors, "ignore"))
3128 known_errorHandler = 3;
3129 else if (!strcmp(errors, "xmlcharrefreplace"))
3130 known_errorHandler = 4;
3131 else
3132 known_errorHandler = 0;
3133 }
3134 switch (known_errorHandler) {
3135 case 1: /* strict */
3136 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3137 goto onError;
3138 case 2: /* replace */
3139 while (collstart++<collend)
3140 *str++ = '?'; /* fall through */
3141 case 3: /* ignore */
3142 p = collend;
3143 break;
3144 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003145 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003146 /* determine replacement size (temporarily (mis)uses p) */
3147 for (p = collstart, repsize = 0; p < collend; ++p) {
3148 if (*p<10)
3149 repsize += 2+1+1;
3150 else if (*p<100)
3151 repsize += 2+2+1;
3152 else if (*p<1000)
3153 repsize += 2+3+1;
3154 else if (*p<10000)
3155 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003156#ifndef Py_UNICODE_WIDE
3157 else
3158 repsize += 2+5+1;
3159#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003160 else if (*p<100000)
3161 repsize += 2+5+1;
3162 else if (*p<1000000)
3163 repsize += 2+6+1;
3164 else
3165 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003166#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167 }
3168 requiredsize = respos+repsize+(endp-collend);
3169 if (requiredsize > ressize) {
3170 if (requiredsize<2*ressize)
3171 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003172 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003173 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003174 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 ressize = requiredsize;
3176 }
3177 /* generate replacement (temporarily (mis)uses p) */
3178 for (p = collstart; p < collend; ++p) {
3179 str += sprintf(str, "&#%d;", (int)*p);
3180 }
3181 p = collend;
3182 break;
3183 default:
3184 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3185 encoding, reason, startp, size, &exc,
3186 collstart-startp, collend-startp, &newpos);
3187 if (repunicode == NULL)
3188 goto onError;
3189 /* need more space? (at least enough for what we
3190 have+the replacement+the rest of the string, so
3191 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003192 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003193 repsize = PyUnicode_GET_SIZE(repunicode);
3194 requiredsize = respos+repsize+(endp-collend);
3195 if (requiredsize > ressize) {
3196 if (requiredsize<2*ressize)
3197 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003198 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003199 Py_DECREF(repunicode);
3200 goto onError;
3201 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003202 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203 ressize = requiredsize;
3204 }
3205 /* check if there is anything unencodable in the replacement
3206 and copy it to the output */
3207 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3208 c = *uni2;
3209 if (c >= limit) {
3210 raise_encode_exception(&exc, encoding, startp, size,
3211 unicodepos, unicodepos+1, reason);
3212 Py_DECREF(repunicode);
3213 goto onError;
3214 }
3215 *str = (char)c;
3216 }
3217 p = startp + newpos;
3218 Py_DECREF(repunicode);
3219 }
3220 }
3221 }
3222 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003223 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 if (respos<ressize)
3225 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003226 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227 Py_XDECREF(errorHandler);
3228 Py_XDECREF(exc);
3229 return res;
3230
3231 onError:
3232 Py_XDECREF(res);
3233 Py_XDECREF(errorHandler);
3234 Py_XDECREF(exc);
3235 return NULL;
3236}
3237
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003239 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 const char *errors)
3241{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243}
3244
3245PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3246{
3247 if (!PyUnicode_Check(unicode)) {
3248 PyErr_BadArgument();
3249 return NULL;
3250 }
3251 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3252 PyUnicode_GET_SIZE(unicode),
3253 NULL);
3254}
3255
3256/* --- 7-bit ASCII Codec -------------------------------------------------- */
3257
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003259 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 const char *errors)
3261{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 PyUnicodeObject *v;
3264 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003265 Py_ssize_t startinpos;
3266 Py_ssize_t endinpos;
3267 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 const char *e;
3269 PyObject *errorHandler = NULL;
3270 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003271
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003273 if (size == 1 && *(unsigned char*)s < 128) {
3274 Py_UNICODE r = *(unsigned char*)s;
3275 return PyUnicode_FromUnicode(&r, 1);
3276 }
Tim Petersced69f82003-09-16 20:30:58 +00003277
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 v = _PyUnicode_New(size);
3279 if (v == NULL)
3280 goto onError;
3281 if (size == 0)
3282 return (PyObject *)v;
3283 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284 e = s + size;
3285 while (s < e) {
3286 register unsigned char c = (unsigned char)*s;
3287 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 ++s;
3290 }
3291 else {
3292 startinpos = s-starts;
3293 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003294 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 if (unicode_decode_call_errorhandler(
3296 errors, &errorHandler,
3297 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003298 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003303 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003304 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003305 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306 Py_XDECREF(errorHandler);
3307 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003309
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 onError:
3311 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312 Py_XDECREF(errorHandler);
3313 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 return NULL;
3315}
3316
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003318 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319 const char *errors)
3320{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322}
3323
3324PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3325{
3326 if (!PyUnicode_Check(unicode)) {
3327 PyErr_BadArgument();
3328 return NULL;
3329 }
3330 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3331 PyUnicode_GET_SIZE(unicode),
3332 NULL);
3333}
3334
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003335#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003336
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003337/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003338
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003339#if SIZEOF_INT < SIZEOF_SSIZE_T
3340#define NEED_RETRY
3341#endif
3342
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see IsDBCSLeadByteEx documentation. */

/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  The byte must pass IsDBCSLeadByte(); in addition
   the character preceding it (as found by CharPrev) must either not
   exist, not start with a lead byte, or be exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3358
3359/*
3360 * Decode MBCS string into unicode object. If 'final' is set, converts
3361 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3362 */
3363static int decode_mbcs(PyUnicodeObject **v,
3364 const char *s, /* MBCS string */
3365 int size, /* sizeof MBCS string */
3366 int final)
3367{
3368 Py_UNICODE *p;
3369 Py_ssize_t n = 0;
3370 int usize = 0;
3371
3372 assert(size >= 0);
3373
3374 /* Skip trailing lead-byte unless 'final' is set */
3375 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3376 --size;
3377
3378 /* First get the size of the result */
3379 if (size > 0) {
3380 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3381 if (usize == 0) {
3382 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3383 return -1;
3384 }
3385 }
3386
3387 if (*v == NULL) {
3388 /* Create unicode object */
3389 *v = _PyUnicode_New(usize);
3390 if (*v == NULL)
3391 return -1;
3392 }
3393 else {
3394 /* Extend unicode object */
3395 n = PyUnicode_GET_SIZE(*v);
3396 if (_PyUnicode_Resize(v, n + usize) < 0)
3397 return -1;
3398 }
3399
3400 /* Do the conversion */
3401 if (size > 0) {
3402 p = PyUnicode_AS_UNICODE(*v) + n;
3403 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3404 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3405 return -1;
3406 }
3407 }
3408
3409 return size;
3410}
3411
3412PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3413 Py_ssize_t size,
3414 const char *errors,
3415 Py_ssize_t *consumed)
3416{
3417 PyUnicodeObject *v = NULL;
3418 int done;
3419
3420 if (consumed)
3421 *consumed = 0;
3422
3423#ifdef NEED_RETRY
3424 retry:
3425 if (size > INT_MAX)
3426 done = decode_mbcs(&v, s, INT_MAX, 0);
3427 else
3428#endif
3429 done = decode_mbcs(&v, s, (int)size, !consumed);
3430
3431 if (done < 0) {
3432 Py_XDECREF(v);
3433 return NULL;
3434 }
3435
3436 if (consumed)
3437 *consumed += done;
3438
3439#ifdef NEED_RETRY
3440 if (size > INT_MAX) {
3441 s += done;
3442 size -= done;
3443 goto retry;
3444 }
3445#endif
3446
3447 return (PyObject *)v;
3448}
3449
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003450PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003451 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003452 const char *errors)
3453{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003454 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3455}
3456
3457/*
3458 * Convert unicode into string object (MBCS).
3459 * Returns 0 if succeed, -1 otherwise.
3460 */
3461static int encode_mbcs(PyObject **repr,
3462 const Py_UNICODE *p, /* unicode */
3463 int size) /* size of unicode */
3464{
3465 int mbcssize = 0;
3466 Py_ssize_t n = 0;
3467
3468 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003469
3470 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003471 if (size > 0) {
3472 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3473 if (mbcssize == 0) {
3474 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3475 return -1;
3476 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003477 }
3478
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003479 if (*repr == NULL) {
3480 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003481 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003482 if (*repr == NULL)
3483 return -1;
3484 }
3485 else {
3486 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003487 n = PyBytes_Size(*repr);
3488 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003489 return -1;
3490 }
3491
3492 /* Do the conversion */
3493 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003494 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003495 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3496 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3497 return -1;
3498 }
3499 }
3500
3501 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003502}
3503
3504PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003505 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003506 const char *errors)
3507{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003508 PyObject *repr = NULL;
3509 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003510
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003511#ifdef NEED_RETRY
3512 retry:
3513 if (size > INT_MAX)
3514 ret = encode_mbcs(&repr, p, INT_MAX);
3515 else
3516#endif
3517 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003518
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003519 if (ret < 0) {
3520 Py_XDECREF(repr);
3521 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003522 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003523
3524#ifdef NEED_RETRY
3525 if (size > INT_MAX) {
3526 p += INT_MAX;
3527 size -= INT_MAX;
3528 goto retry;
3529 }
3530#endif
3531
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003532 return repr;
3533}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003534
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003535PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3536{
3537 if (!PyUnicode_Check(unicode)) {
3538 PyErr_BadArgument();
3539 return NULL;
3540 }
3541 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3542 PyUnicode_GET_SIZE(unicode),
3543 NULL);
3544}
3545
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003546#undef NEED_RETRY
3547
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003548#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003549
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550/* --- Character Mapping Codec -------------------------------------------- */
3551
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003553 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 PyObject *mapping,
3555 const char *errors)
3556{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003558 Py_ssize_t startinpos;
3559 Py_ssize_t endinpos;
3560 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 PyUnicodeObject *v;
3563 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003564 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 PyObject *errorHandler = NULL;
3566 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003567 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003568 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003569
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 /* Default to Latin-1 */
3571 if (mapping == NULL)
3572 return PyUnicode_DecodeLatin1(s, size, errors);
3573
3574 v = _PyUnicode_New(size);
3575 if (v == NULL)
3576 goto onError;
3577 if (size == 0)
3578 return (PyObject *)v;
3579 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003581 if (PyUnicode_CheckExact(mapping)) {
3582 mapstring = PyUnicode_AS_UNICODE(mapping);
3583 maplen = PyUnicode_GET_SIZE(mapping);
3584 while (s < e) {
3585 unsigned char ch = *s;
3586 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003588 if (ch < maplen)
3589 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003591 if (x == 0xfffe) {
3592 /* undefined mapping */
3593 outpos = p-PyUnicode_AS_UNICODE(v);
3594 startinpos = s-starts;
3595 endinpos = startinpos+1;
3596 if (unicode_decode_call_errorhandler(
3597 errors, &errorHandler,
3598 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003599 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003600 (PyObject **)&v, &outpos, &p)) {
3601 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003602 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003603 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003604 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003605 *p++ = x;
3606 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003608 }
3609 else {
3610 while (s < e) {
3611 unsigned char ch = *s;
3612 PyObject *w, *x;
3613
3614 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3615 w = PyInt_FromLong((long)ch);
3616 if (w == NULL)
3617 goto onError;
3618 x = PyObject_GetItem(mapping, w);
3619 Py_DECREF(w);
3620 if (x == NULL) {
3621 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3622 /* No mapping found means: mapping is undefined. */
3623 PyErr_Clear();
3624 x = Py_None;
3625 Py_INCREF(x);
3626 } else
3627 goto onError;
3628 }
3629
3630 /* Apply mapping */
3631 if (PyInt_Check(x)) {
3632 long value = PyInt_AS_LONG(x);
3633 if (value < 0 || value > 65535) {
3634 PyErr_SetString(PyExc_TypeError,
3635 "character mapping must be in range(65536)");
3636 Py_DECREF(x);
3637 goto onError;
3638 }
3639 *p++ = (Py_UNICODE)value;
3640 }
3641 else if (x == Py_None) {
3642 /* undefined mapping */
3643 outpos = p-PyUnicode_AS_UNICODE(v);
3644 startinpos = s-starts;
3645 endinpos = startinpos+1;
3646 if (unicode_decode_call_errorhandler(
3647 errors, &errorHandler,
3648 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003649 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003650 (PyObject **)&v, &outpos, &p)) {
3651 Py_DECREF(x);
3652 goto onError;
3653 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003654 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003655 continue;
3656 }
3657 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003658 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003659
3660 if (targetsize == 1)
3661 /* 1-1 mapping */
3662 *p++ = *PyUnicode_AS_UNICODE(x);
3663
3664 else if (targetsize > 1) {
3665 /* 1-n mapping */
3666 if (targetsize > extrachars) {
3667 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003668 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3669 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003670 (targetsize << 2);
3671 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003672 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003673 if (_PyUnicode_Resize(&v,
3674 PyUnicode_GET_SIZE(v) + needed) < 0) {
3675 Py_DECREF(x);
3676 goto onError;
3677 }
3678 p = PyUnicode_AS_UNICODE(v) + oldpos;
3679 }
3680 Py_UNICODE_COPY(p,
3681 PyUnicode_AS_UNICODE(x),
3682 targetsize);
3683 p += targetsize;
3684 extrachars -= targetsize;
3685 }
3686 /* 1-0 mapping: skip the character */
3687 }
3688 else {
3689 /* wrong return value */
3690 PyErr_SetString(PyExc_TypeError,
3691 "character mapping must return integer, None or unicode");
3692 Py_DECREF(x);
3693 goto onError;
3694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003696 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 }
3699 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003700 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 Py_XDECREF(errorHandler);
3703 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003705
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 Py_XDECREF(errorHandler);
3708 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 Py_XDECREF(v);
3710 return NULL;
3711}
3712
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003713/* Charmap encoding: the lookup table */
3714
3715struct encoding_map{
3716 PyObject_HEAD
3717 unsigned char level1[32];
3718 int count2, count3;
3719 unsigned char level23[1];
3720};
3721
3722static PyObject*
3723encoding_map_size(PyObject *obj, PyObject* args)
3724{
3725 struct encoding_map *map = (struct encoding_map*)obj;
3726 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3727 128*map->count3);
3728}
3729
3730static PyMethodDef encoding_map_methods[] = {
3731 {"size", encoding_map_size, METH_NOARGS,
3732 PyDoc_STR("Return the size (in bytes) of this object") },
3733 { 0 }
3734};
3735
3736static void
3737encoding_map_dealloc(PyObject* o)
3738{
3739 PyObject_FREE(o);
3740}
3741
3742static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003743 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003744 "EncodingMap", /*tp_name*/
3745 sizeof(struct encoding_map), /*tp_basicsize*/
3746 0, /*tp_itemsize*/
3747 /* methods */
3748 encoding_map_dealloc, /*tp_dealloc*/
3749 0, /*tp_print*/
3750 0, /*tp_getattr*/
3751 0, /*tp_setattr*/
3752 0, /*tp_compare*/
3753 0, /*tp_repr*/
3754 0, /*tp_as_number*/
3755 0, /*tp_as_sequence*/
3756 0, /*tp_as_mapping*/
3757 0, /*tp_hash*/
3758 0, /*tp_call*/
3759 0, /*tp_str*/
3760 0, /*tp_getattro*/
3761 0, /*tp_setattro*/
3762 0, /*tp_as_buffer*/
3763 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3764 0, /*tp_doc*/
3765 0, /*tp_traverse*/
3766 0, /*tp_clear*/
3767 0, /*tp_richcompare*/
3768 0, /*tp_weaklistoffset*/
3769 0, /*tp_iter*/
3770 0, /*tp_iternext*/
3771 encoding_map_methods, /*tp_methods*/
3772 0, /*tp_members*/
3773 0, /*tp_getset*/
3774 0, /*tp_base*/
3775 0, /*tp_dict*/
3776 0, /*tp_descr_get*/
3777 0, /*tp_descr_set*/
3778 0, /*tp_dictoffset*/
3779 0, /*tp_init*/
3780 0, /*tp_alloc*/
3781 0, /*tp_new*/
3782 0, /*tp_free*/
3783 0, /*tp_is_gc*/
3784};
3785
3786PyObject*
3787PyUnicode_BuildEncodingMap(PyObject* string)
3788{
3789 Py_UNICODE *decode;
3790 PyObject *result;
3791 struct encoding_map *mresult;
3792 int i;
3793 int need_dict = 0;
3794 unsigned char level1[32];
3795 unsigned char level2[512];
3796 unsigned char *mlevel1, *mlevel2, *mlevel3;
3797 int count2 = 0, count3 = 0;
3798
3799 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3800 PyErr_BadArgument();
3801 return NULL;
3802 }
3803 decode = PyUnicode_AS_UNICODE(string);
3804 memset(level1, 0xFF, sizeof level1);
3805 memset(level2, 0xFF, sizeof level2);
3806
3807 /* If there isn't a one-to-one mapping of NULL to \0,
3808 or if there are non-BMP characters, we need to use
3809 a mapping dictionary. */
3810 if (decode[0] != 0)
3811 need_dict = 1;
3812 for (i = 1; i < 256; i++) {
3813 int l1, l2;
3814 if (decode[i] == 0
3815 #ifdef Py_UNICODE_WIDE
3816 || decode[i] > 0xFFFF
3817 #endif
3818 ) {
3819 need_dict = 1;
3820 break;
3821 }
3822 if (decode[i] == 0xFFFE)
3823 /* unmapped character */
3824 continue;
3825 l1 = decode[i] >> 11;
3826 l2 = decode[i] >> 7;
3827 if (level1[l1] == 0xFF)
3828 level1[l1] = count2++;
3829 if (level2[l2] == 0xFF)
3830 level2[l2] = count3++;
3831 }
3832
3833 if (count2 >= 0xFF || count3 >= 0xFF)
3834 need_dict = 1;
3835
3836 if (need_dict) {
3837 PyObject *result = PyDict_New();
3838 PyObject *key, *value;
3839 if (!result)
3840 return NULL;
3841 for (i = 0; i < 256; i++) {
3842 key = value = NULL;
3843 key = PyInt_FromLong(decode[i]);
3844 value = PyInt_FromLong(i);
3845 if (!key || !value)
3846 goto failed1;
3847 if (PyDict_SetItem(result, key, value) == -1)
3848 goto failed1;
3849 Py_DECREF(key);
3850 Py_DECREF(value);
3851 }
3852 return result;
3853 failed1:
3854 Py_XDECREF(key);
3855 Py_XDECREF(value);
3856 Py_DECREF(result);
3857 return NULL;
3858 }
3859
3860 /* Create a three-level trie */
3861 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3862 16*count2 + 128*count3 - 1);
3863 if (!result)
3864 return PyErr_NoMemory();
3865 PyObject_Init(result, &EncodingMapType);
3866 mresult = (struct encoding_map*)result;
3867 mresult->count2 = count2;
3868 mresult->count3 = count3;
3869 mlevel1 = mresult->level1;
3870 mlevel2 = mresult->level23;
3871 mlevel3 = mresult->level23 + 16*count2;
3872 memcpy(mlevel1, level1, 32);
3873 memset(mlevel2, 0xFF, 16*count2);
3874 memset(mlevel3, 0, 128*count3);
3875 count3 = 0;
3876 for (i = 1; i < 256; i++) {
3877 int o1, o2, o3, i2, i3;
3878 if (decode[i] == 0xFFFE)
3879 /* unmapped character */
3880 continue;
3881 o1 = decode[i]>>11;
3882 o2 = (decode[i]>>7) & 0xF;
3883 i2 = 16*mlevel1[o1] + o2;
3884 if (mlevel2[i2] == 0xFF)
3885 mlevel2[i2] = count3++;
3886 o3 = decode[i] & 0x7F;
3887 i3 = 128*mlevel2[i2] + o3;
3888 mlevel3[i3] = i;
3889 }
3890 return result;
3891}
3892
3893static int
3894encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3895{
3896 struct encoding_map *map = (struct encoding_map*)mapping;
3897 int l1 = c>>11;
3898 int l2 = (c>>7) & 0xF;
3899 int l3 = c & 0x7F;
3900 int i;
3901
3902#ifdef Py_UNICODE_WIDE
3903 if (c > 0xFFFF) {
3904 return -1;
3905 }
3906#endif
3907 if (c == 0)
3908 return 0;
3909 /* level 1*/
3910 i = map->level1[l1];
3911 if (i == 0xFF) {
3912 return -1;
3913 }
3914 /* level 2*/
3915 i = map->level23[16*i+l2];
3916 if (i == 0xFF) {
3917 return -1;
3918 }
3919 /* level 3 */
3920 i = map->level23[16*map->count2 + 128*i + l3];
3921 if (i == 0) {
3922 return -1;
3923 }
3924 return i;
3925}
3926
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927/* Lookup the character ch in the mapping. If the character
3928 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003929 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932 PyObject *w = PyInt_FromLong((long)c);
3933 PyObject *x;
3934
3935 if (w == NULL)
3936 return NULL;
3937 x = PyObject_GetItem(mapping, w);
3938 Py_DECREF(w);
3939 if (x == NULL) {
3940 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3941 /* No mapping found means: mapping is undefined. */
3942 PyErr_Clear();
3943 x = Py_None;
3944 Py_INCREF(x);
3945 return x;
3946 } else
3947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003949 else if (x == Py_None)
3950 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 else if (PyInt_Check(x)) {
3952 long value = PyInt_AS_LONG(x);
3953 if (value < 0 || value > 255) {
3954 PyErr_SetString(PyExc_TypeError,
3955 "character mapping must be in range(256)");
3956 Py_DECREF(x);
3957 return NULL;
3958 }
3959 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 else if (PyString_Check(x))
3962 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003965 PyErr_Format(PyExc_TypeError,
3966 "character mapping must return integer, None or str8, not %.400s",
3967 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 Py_DECREF(x);
3969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 }
3971}
3972
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003973static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003974charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003975{
Walter Dörwald827b0552007-05-12 13:23:53 +00003976 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003977 /* exponentially overallocate to minimize reallocations */
3978 if (requiredsize < 2*outsize)
3979 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003980 if (PyBytes_Resize(outobj, requiredsize)) {
3981 Py_DECREF(outobj);
3982 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003983 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003984 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003985}
3986
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003991 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 space is available. Return a new reference to the object that
3993 was put in the output buffer, or Py_None, if the mapping was undefined
3994 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003995 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003997charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003998 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004000 PyObject *rep;
4001 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004002 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004004 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004005 int res = encoding_map_lookup(c, mapping);
4006 Py_ssize_t requiredsize = *outpos+1;
4007 if (res == -1)
4008 return enc_FAILED;
4009 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004010 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004011 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004012 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004013 outstart[(*outpos)++] = (char)res;
4014 return enc_SUCCESS;
4015 }
4016
4017 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004019 return enc_EXCEPTION;
4020 else if (rep==Py_None) {
4021 Py_DECREF(rep);
4022 return enc_FAILED;
4023 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004025 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004026 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004027 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004029 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004031 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4033 }
4034 else {
4035 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004036 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4037 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004038 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004039 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004041 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004043 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044 memcpy(outstart + *outpos, repchars, repsize);
4045 *outpos += repsize;
4046 }
4047 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004048 Py_DECREF(rep);
4049 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050}
4051
4052/* handle an error in PyUnicode_EncodeCharmap
4053 Return 0 on success, -1 on error */
4054static
4055int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004056 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004058 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004059 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060{
4061 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004062 Py_ssize_t repsize;
4063 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 Py_UNICODE *uni2;
4065 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004066 Py_ssize_t collstartpos = *inpos;
4067 Py_ssize_t collendpos = *inpos+1;
4068 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 char *encoding = "charmap";
4070 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004071 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 /* find all unencodable characters */
4074 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004075 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004076 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004077 int res = encoding_map_lookup(p[collendpos], mapping);
4078 if (res != -1)
4079 break;
4080 ++collendpos;
4081 continue;
4082 }
4083
4084 rep = charmapencode_lookup(p[collendpos], mapping);
4085 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004087 else if (rep!=Py_None) {
4088 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 break;
4090 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004091 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 ++collendpos;
4093 }
4094 /* cache callback name lookup
4095 * (if not done yet, i.e. it's the first error) */
4096 if (*known_errorHandler==-1) {
4097 if ((errors==NULL) || (!strcmp(errors, "strict")))
4098 *known_errorHandler = 1;
4099 else if (!strcmp(errors, "replace"))
4100 *known_errorHandler = 2;
4101 else if (!strcmp(errors, "ignore"))
4102 *known_errorHandler = 3;
4103 else if (!strcmp(errors, "xmlcharrefreplace"))
4104 *known_errorHandler = 4;
4105 else
4106 *known_errorHandler = 0;
4107 }
4108 switch (*known_errorHandler) {
4109 case 1: /* strict */
4110 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4111 return -1;
4112 case 2: /* replace */
4113 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4114 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004115 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 return -1;
4117 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004118 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4120 return -1;
4121 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 }
4123 /* fall through */
4124 case 3: /* ignore */
4125 *inpos = collendpos;
4126 break;
4127 case 4: /* xmlcharrefreplace */
4128 /* generate replacement (temporarily (mis)uses p) */
4129 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4130 char buffer[2+29+1+1];
4131 char *cp;
4132 sprintf(buffer, "&#%d;", (int)p[collpos]);
4133 for (cp = buffer; *cp; ++cp) {
4134 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004135 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004137 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4139 return -1;
4140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 }
4142 }
4143 *inpos = collendpos;
4144 break;
4145 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004146 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 encoding, reason, p, size, exceptionObject,
4148 collstartpos, collendpos, &newpos);
4149 if (repunicode == NULL)
4150 return -1;
4151 /* generate replacement */
4152 repsize = PyUnicode_GET_SIZE(repunicode);
4153 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4154 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004155 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 return -1;
4157 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004158 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4161 return -1;
4162 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 }
4164 *inpos = newpos;
4165 Py_DECREF(repunicode);
4166 }
4167 return 0;
4168}
4169
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004171 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 PyObject *mapping,
4173 const char *errors)
4174{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 /* output object */
4176 PyObject *res = NULL;
4177 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004178 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004180 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 PyObject *errorHandler = NULL;
4182 PyObject *exc = NULL;
4183 /* the following variable is used for caching string comparisons
4184 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4185 * 3=ignore, 4=xmlcharrefreplace */
4186 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187
4188 /* Default to Latin-1 */
4189 if (mapping == NULL)
4190 return PyUnicode_EncodeLatin1(p, size, errors);
4191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 /* allocate enough for a simple encoding without
4193 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004194 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 if (res == NULL)
4196 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004197 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 while (inpos<size) {
4201 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004202 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004203 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004205 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 if (charmap_encoding_error(p, size, &inpos, mapping,
4207 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004208 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004209 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004210 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 else
4214 /* done with this character => adjust input position */
4215 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004219 if (respos<PyBytes_GET_SIZE(res)) {
4220 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 goto onError;
4222 }
4223 Py_XDECREF(exc);
4224 Py_XDECREF(errorHandler);
4225 return res;
4226
4227 onError:
4228 Py_XDECREF(res);
4229 Py_XDECREF(exc);
4230 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231 return NULL;
4232}
4233
4234PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4235 PyObject *mapping)
4236{
4237 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4238 PyErr_BadArgument();
4239 return NULL;
4240 }
4241 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4242 PyUnicode_GET_SIZE(unicode),
4243 mapping,
4244 NULL);
4245}
4246
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247/* create or adjust a UnicodeTranslateError */
4248static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004249 const Py_UNICODE *unicode, Py_ssize_t size,
4250 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 if (*exceptionObject == NULL) {
4254 *exceptionObject = PyUnicodeTranslateError_Create(
4255 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 }
4257 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4259 goto onError;
4260 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4261 goto onError;
4262 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4263 goto onError;
4264 return;
4265 onError:
4266 Py_DECREF(*exceptionObject);
4267 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 }
4269}
4270
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271/* raises a UnicodeTranslateError */
4272static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004273 const Py_UNICODE *unicode, Py_ssize_t size,
4274 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 const char *reason)
4276{
4277 make_translate_exception(exceptionObject,
4278 unicode, size, startpos, endpos, reason);
4279 if (*exceptionObject != NULL)
4280 PyCodec_StrictErrors(*exceptionObject);
4281}
4282
4283/* error handling callback helper:
4284 build arguments, call the callback and check the arguments,
4285 put the result into newpos and return the replacement string, which
4286 has to be freed by the caller */
4287static PyObject *unicode_translate_call_errorhandler(const char *errors,
4288 PyObject **errorHandler,
4289 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004290 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4291 Py_ssize_t startpos, Py_ssize_t endpos,
4292 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004294 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004296 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 PyObject *restuple;
4298 PyObject *resunicode;
4299
4300 if (*errorHandler == NULL) {
4301 *errorHandler = PyCodec_LookupError(errors);
4302 if (*errorHandler == NULL)
4303 return NULL;
4304 }
4305
4306 make_translate_exception(exceptionObject,
4307 unicode, size, startpos, endpos, reason);
4308 if (*exceptionObject == NULL)
4309 return NULL;
4310
4311 restuple = PyObject_CallFunctionObjArgs(
4312 *errorHandler, *exceptionObject, NULL);
4313 if (restuple == NULL)
4314 return NULL;
4315 if (!PyTuple_Check(restuple)) {
4316 PyErr_Format(PyExc_TypeError, &argparse[4]);
4317 Py_DECREF(restuple);
4318 return NULL;
4319 }
4320 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 Py_DECREF(restuple);
4323 return NULL;
4324 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004325 if (i_newpos<0)
4326 *newpos = size+i_newpos;
4327 else
4328 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004329 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004330 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004331 Py_DECREF(restuple);
4332 return NULL;
4333 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 Py_INCREF(resunicode);
4335 Py_DECREF(restuple);
4336 return resunicode;
4337}
4338
4339/* Lookup the character ch in the mapping and put the result in result,
4340 which must be decrefed by the caller.
4341 Return 0 on success, -1 on error */
4342static
4343int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4344{
4345 PyObject *w = PyInt_FromLong((long)c);
4346 PyObject *x;
4347
4348 if (w == NULL)
4349 return -1;
4350 x = PyObject_GetItem(mapping, w);
4351 Py_DECREF(w);
4352 if (x == NULL) {
4353 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4354 /* No mapping found means: use 1:1 mapping. */
4355 PyErr_Clear();
4356 *result = NULL;
4357 return 0;
4358 } else
4359 return -1;
4360 }
4361 else if (x == Py_None) {
4362 *result = x;
4363 return 0;
4364 }
4365 else if (PyInt_Check(x)) {
4366 long value = PyInt_AS_LONG(x);
4367 long max = PyUnicode_GetMax();
4368 if (value < 0 || value > max) {
4369 PyErr_Format(PyExc_TypeError,
4370 "character mapping must be in range(0x%lx)", max+1);
4371 Py_DECREF(x);
4372 return -1;
4373 }
4374 *result = x;
4375 return 0;
4376 }
4377 else if (PyUnicode_Check(x)) {
4378 *result = x;
4379 return 0;
4380 }
4381 else {
4382 /* wrong return value */
4383 PyErr_SetString(PyExc_TypeError,
4384 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004385 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 return -1;
4387 }
4388}
4389/* ensure that *outobj is at least requiredsize characters long,
4390if not reallocate and adjust various state variables.
4391Return 0 on success, -1 on error */
4392static
Walter Dörwald4894c302003-10-24 14:25:28 +00004393int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004394 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004396 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004397 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004399 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004401 if (requiredsize < 2 * oldsize)
4402 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004403 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 return -1;
4405 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 }
4407 return 0;
4408}
4409/* lookup the character, put the result in the output string and adjust
4410 various state variables. Return a new reference to the object that
4411 was put in the output buffer in *result, or Py_None, if the mapping was
4412 undefined (in which case no character was written).
4413 The called must decref result.
4414 Return 0 on success, -1 on error. */
4415static
Walter Dörwald4894c302003-10-24 14:25:28 +00004416int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004417 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004418 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419{
Walter Dörwald4894c302003-10-24 14:25:28 +00004420 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 return -1;
4422 if (*res==NULL) {
4423 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004424 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 }
4426 else if (*res==Py_None)
4427 ;
4428 else if (PyInt_Check(*res)) {
4429 /* no overflow check, because we know that the space is enough */
4430 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4431 }
4432 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 if (repsize==1) {
4435 /* no overflow check, because we know that the space is enough */
4436 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4437 }
4438 else if (repsize!=0) {
4439 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004441 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004442 repsize - 1;
4443 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 return -1;
4445 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4446 *outp += repsize;
4447 }
4448 }
4449 else
4450 return -1;
4451 return 0;
4452}
4453
4454PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004455 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 PyObject *mapping,
4457 const char *errors)
4458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 /* output object */
4460 PyObject *res = NULL;
4461 /* pointers to the beginning and end+1 of input */
4462 const Py_UNICODE *startp = p;
4463 const Py_UNICODE *endp = p + size;
4464 /* pointer into the output */
4465 Py_UNICODE *str;
4466 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004467 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 char *reason = "character maps to <undefined>";
4469 PyObject *errorHandler = NULL;
4470 PyObject *exc = NULL;
4471 /* the following variable is used for caching string comparisons
4472 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4473 * 3=ignore, 4=xmlcharrefreplace */
4474 int known_errorHandler = -1;
4475
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 if (mapping == NULL) {
4477 PyErr_BadArgument();
4478 return NULL;
4479 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480
4481 /* allocate enough for a simple 1:1 translation without
4482 replacements, if we need more, we'll resize */
4483 res = PyUnicode_FromUnicode(NULL, size);
4484 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004485 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 return res;
4488 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 while (p<endp) {
4491 /* try to encode it */
4492 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004493 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 goto onError;
4496 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004497 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 if (x!=Py_None) /* it worked => adjust input pointer */
4499 ++p;
4500 else { /* untranslatable character */
4501 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 Py_ssize_t repsize;
4503 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 Py_UNICODE *uni2;
4505 /* startpos for collecting untranslatable chars */
4506 const Py_UNICODE *collstart = p;
4507 const Py_UNICODE *collend = p+1;
4508 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 /* find all untranslatable characters */
4511 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004512 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 goto onError;
4514 Py_XDECREF(x);
4515 if (x!=Py_None)
4516 break;
4517 ++collend;
4518 }
4519 /* cache callback name lookup
4520 * (if not done yet, i.e. it's the first error) */
4521 if (known_errorHandler==-1) {
4522 if ((errors==NULL) || (!strcmp(errors, "strict")))
4523 known_errorHandler = 1;
4524 else if (!strcmp(errors, "replace"))
4525 known_errorHandler = 2;
4526 else if (!strcmp(errors, "ignore"))
4527 known_errorHandler = 3;
4528 else if (!strcmp(errors, "xmlcharrefreplace"))
4529 known_errorHandler = 4;
4530 else
4531 known_errorHandler = 0;
4532 }
4533 switch (known_errorHandler) {
4534 case 1: /* strict */
4535 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4536 goto onError;
4537 case 2: /* replace */
4538 /* No need to check for space, this is a 1:1 replacement */
4539 for (coll = collstart; coll<collend; ++coll)
4540 *str++ = '?';
4541 /* fall through */
4542 case 3: /* ignore */
4543 p = collend;
4544 break;
4545 case 4: /* xmlcharrefreplace */
4546 /* generate replacement (temporarily (mis)uses p) */
4547 for (p = collstart; p < collend; ++p) {
4548 char buffer[2+29+1+1];
4549 char *cp;
4550 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004551 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4553 goto onError;
4554 for (cp = buffer; *cp; ++cp)
4555 *str++ = *cp;
4556 }
4557 p = collend;
4558 break;
4559 default:
4560 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4561 reason, startp, size, &exc,
4562 collstart-startp, collend-startp, &newpos);
4563 if (repunicode == NULL)
4564 goto onError;
4565 /* generate replacement */
4566 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004567 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4569 Py_DECREF(repunicode);
4570 goto onError;
4571 }
4572 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4573 *str++ = *uni2;
4574 p = startp + newpos;
4575 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 }
4577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 /* Resize if we allocated to much */
4580 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004581 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004582 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004583 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 }
4585 Py_XDECREF(exc);
4586 Py_XDECREF(errorHandler);
4587 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 onError:
4590 Py_XDECREF(res);
4591 Py_XDECREF(exc);
4592 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593 return NULL;
4594}
4595
4596PyObject *PyUnicode_Translate(PyObject *str,
4597 PyObject *mapping,
4598 const char *errors)
4599{
4600 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004601
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602 str = PyUnicode_FromObject(str);
4603 if (str == NULL)
4604 goto onError;
4605 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4606 PyUnicode_GET_SIZE(str),
4607 mapping,
4608 errors);
4609 Py_DECREF(str);
4610 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004611
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 onError:
4613 Py_XDECREF(str);
4614 return NULL;
4615}
Tim Petersced69f82003-09-16 20:30:58 +00004616
Guido van Rossum9e896b32000-04-05 20:11:21 +00004617/* --- Decimal Encoder ---------------------------------------------------- */
4618
4619int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004620 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004621 char *output,
4622 const char *errors)
4623{
4624 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 PyObject *errorHandler = NULL;
4626 PyObject *exc = NULL;
4627 const char *encoding = "decimal";
4628 const char *reason = "invalid decimal Unicode string";
4629 /* the following variable is used for caching string comparisons
4630 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4631 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004632
4633 if (output == NULL) {
4634 PyErr_BadArgument();
4635 return -1;
4636 }
4637
4638 p = s;
4639 end = s + length;
4640 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004642 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004644 Py_ssize_t repsize;
4645 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 Py_UNICODE *uni2;
4647 Py_UNICODE *collstart;
4648 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004649
Guido van Rossum9e896b32000-04-05 20:11:21 +00004650 if (Py_UNICODE_ISSPACE(ch)) {
4651 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004653 continue;
4654 }
4655 decimal = Py_UNICODE_TODECIMAL(ch);
4656 if (decimal >= 0) {
4657 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004659 continue;
4660 }
Guido van Rossumba477042000-04-06 18:18:10 +00004661 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004662 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004664 continue;
4665 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 /* All other characters are considered unencodable */
4667 collstart = p;
4668 collend = p+1;
4669 while (collend < end) {
4670 if ((0 < *collend && *collend < 256) ||
4671 !Py_UNICODE_ISSPACE(*collend) ||
4672 Py_UNICODE_TODECIMAL(*collend))
4673 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004674 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 /* cache callback name lookup
4676 * (if not done yet, i.e. it's the first error) */
4677 if (known_errorHandler==-1) {
4678 if ((errors==NULL) || (!strcmp(errors, "strict")))
4679 known_errorHandler = 1;
4680 else if (!strcmp(errors, "replace"))
4681 known_errorHandler = 2;
4682 else if (!strcmp(errors, "ignore"))
4683 known_errorHandler = 3;
4684 else if (!strcmp(errors, "xmlcharrefreplace"))
4685 known_errorHandler = 4;
4686 else
4687 known_errorHandler = 0;
4688 }
4689 switch (known_errorHandler) {
4690 case 1: /* strict */
4691 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4692 goto onError;
4693 case 2: /* replace */
4694 for (p = collstart; p < collend; ++p)
4695 *output++ = '?';
4696 /* fall through */
4697 case 3: /* ignore */
4698 p = collend;
4699 break;
4700 case 4: /* xmlcharrefreplace */
4701 /* generate replacement (temporarily (mis)uses p) */
4702 for (p = collstart; p < collend; ++p)
4703 output += sprintf(output, "&#%d;", (int)*p);
4704 p = collend;
4705 break;
4706 default:
4707 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4708 encoding, reason, s, length, &exc,
4709 collstart-s, collend-s, &newpos);
4710 if (repunicode == NULL)
4711 goto onError;
4712 /* generate replacement */
4713 repsize = PyUnicode_GET_SIZE(repunicode);
4714 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4715 Py_UNICODE ch = *uni2;
4716 if (Py_UNICODE_ISSPACE(ch))
4717 *output++ = ' ';
4718 else {
4719 decimal = Py_UNICODE_TODECIMAL(ch);
4720 if (decimal >= 0)
4721 *output++ = '0' + decimal;
4722 else if (0 < ch && ch < 256)
4723 *output++ = (char)ch;
4724 else {
4725 Py_DECREF(repunicode);
4726 raise_encode_exception(&exc, encoding,
4727 s, length, collstart-s, collend-s, reason);
4728 goto onError;
4729 }
4730 }
4731 }
4732 p = s + newpos;
4733 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004734 }
4735 }
4736 /* 0-terminate the output string */
4737 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004738 Py_XDECREF(exc);
4739 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004740 return 0;
4741
4742 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 Py_XDECREF(exc);
4744 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004745 return -1;
4746}
4747
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748/* --- Helpers ------------------------------------------------------------ */
4749
Thomas Wouters477c8d52006-05-27 19:21:47 +00004750#define STRINGLIB_CHAR Py_UNICODE
4751
4752#define STRINGLIB_LEN PyUnicode_GET_SIZE
4753#define STRINGLIB_NEW PyUnicode_FromUnicode
4754#define STRINGLIB_STR PyUnicode_AS_UNICODE
4755
4756Py_LOCAL_INLINE(int)
4757STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004759 if (str[0] != other[0])
4760 return 1;
4761 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762}
4763
Thomas Wouters477c8d52006-05-27 19:21:47 +00004764#define STRINGLIB_EMPTY unicode_empty
4765
4766#include "stringlib/fastsearch.h"
4767
4768#include "stringlib/count.h"
4769#include "stringlib/find.h"
4770#include "stringlib/partition.h"
4771
/* Helper macro to fix up start/end slice values.

   Operates on lvalues named `start` and `end` in the calling scope and on
   `obj`, a pointer to something with a `length` member:
     - a negative `start` is offset by the length, then clamped to 0;
     - `end` is capped at the length; a negative `end` is offset by the
       length, then clamped to 0.

   The body is wrapped in do { ... } while (0) so that the expansion is a
   single statement and remains correct under an unbraced if/else.  Invoke
   it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4784
Martin v. Löwis18e16552006-02-15 17:27:45 +00004785Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004786 PyObject *substr,
4787 Py_ssize_t start,
4788 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004790 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004791 PyUnicodeObject* str_obj;
4792 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004793
Thomas Wouters477c8d52006-05-27 19:21:47 +00004794 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4795 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004797 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4798 if (!sub_obj) {
4799 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 return -1;
4801 }
Tim Petersced69f82003-09-16 20:30:58 +00004802
Thomas Wouters477c8d52006-05-27 19:21:47 +00004803 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004804
Thomas Wouters477c8d52006-05-27 19:21:47 +00004805 result = stringlib_count(
4806 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4807 );
4808
4809 Py_DECREF(sub_obj);
4810 Py_DECREF(str_obj);
4811
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 return result;
4813}
4814
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004816 PyObject *sub,
4817 Py_ssize_t start,
4818 Py_ssize_t end,
4819 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004821 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004822
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004824 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004825 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004826 sub = PyUnicode_FromObject(sub);
4827 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004828 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004829 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 }
Tim Petersced69f82003-09-16 20:30:58 +00004831
Thomas Wouters477c8d52006-05-27 19:21:47 +00004832 if (direction > 0)
4833 result = stringlib_find_slice(
4834 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4835 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4836 start, end
4837 );
4838 else
4839 result = stringlib_rfind_slice(
4840 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4841 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4842 start, end
4843 );
4844
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004846 Py_DECREF(sub);
4847
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 return result;
4849}
4850
Tim Petersced69f82003-09-16 20:30:58 +00004851static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852int tailmatch(PyUnicodeObject *self,
4853 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004854 Py_ssize_t start,
4855 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 int direction)
4857{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 if (substring->length == 0)
4859 return 1;
4860
Thomas Wouters477c8d52006-05-27 19:21:47 +00004861 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862
4863 end -= substring->length;
4864 if (end < start)
4865 return 0;
4866
4867 if (direction > 0) {
4868 if (Py_UNICODE_MATCH(self, end, substring))
4869 return 1;
4870 } else {
4871 if (Py_UNICODE_MATCH(self, start, substring))
4872 return 1;
4873 }
4874
4875 return 0;
4876}
4877
Martin v. Löwis18e16552006-02-15 17:27:45 +00004878Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004880 Py_ssize_t start,
4881 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 int direction)
4883{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004884 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004885
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 str = PyUnicode_FromObject(str);
4887 if (str == NULL)
4888 return -1;
4889 substr = PyUnicode_FromObject(substr);
4890 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004891 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 return -1;
4893 }
Tim Petersced69f82003-09-16 20:30:58 +00004894
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 result = tailmatch((PyUnicodeObject *)str,
4896 (PyUnicodeObject *)substr,
4897 start, end, direction);
4898 Py_DECREF(str);
4899 Py_DECREF(substr);
4900 return result;
4901}
4902
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903/* Apply fixfct filter to the Unicode object self and return a
4904 reference to the modified object */
4905
Tim Petersced69f82003-09-16 20:30:58 +00004906static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907PyObject *fixup(PyUnicodeObject *self,
4908 int (*fixfct)(PyUnicodeObject *s))
4909{
4910
4911 PyUnicodeObject *u;
4912
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004913 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 if (u == NULL)
4915 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004916
4917 Py_UNICODE_COPY(u->str, self->str, self->length);
4918
Tim Peters7a29bd52001-09-12 03:03:31 +00004919 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 /* fixfct should return TRUE if it modified the buffer. If
4921 FALSE, return a reference to the original buffer instead
4922 (to save space, not time) */
4923 Py_INCREF(self);
4924 Py_DECREF(u);
4925 return (PyObject*) self;
4926 }
4927 return (PyObject*) u;
4928}
4929
Tim Petersced69f82003-09-16 20:30:58 +00004930static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931int fixupper(PyUnicodeObject *self)
4932{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004933 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 Py_UNICODE *s = self->str;
4935 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004936
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 while (len-- > 0) {
4938 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004939
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 ch = Py_UNICODE_TOUPPER(*s);
4941 if (ch != *s) {
4942 status = 1;
4943 *s = ch;
4944 }
4945 s++;
4946 }
4947
4948 return status;
4949}
4950
Tim Petersced69f82003-09-16 20:30:58 +00004951static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952int fixlower(PyUnicodeObject *self)
4953{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004954 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 Py_UNICODE *s = self->str;
4956 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004957
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 while (len-- > 0) {
4959 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004960
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 ch = Py_UNICODE_TOLOWER(*s);
4962 if (ch != *s) {
4963 status = 1;
4964 *s = ch;
4965 }
4966 s++;
4967 }
4968
4969 return status;
4970}
4971
Tim Petersced69f82003-09-16 20:30:58 +00004972static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973int fixswapcase(PyUnicodeObject *self)
4974{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004975 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976 Py_UNICODE *s = self->str;
4977 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004978
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 while (len-- > 0) {
4980 if (Py_UNICODE_ISUPPER(*s)) {
4981 *s = Py_UNICODE_TOLOWER(*s);
4982 status = 1;
4983 } else if (Py_UNICODE_ISLOWER(*s)) {
4984 *s = Py_UNICODE_TOUPPER(*s);
4985 status = 1;
4986 }
4987 s++;
4988 }
4989
4990 return status;
4991}
4992
Tim Petersced69f82003-09-16 20:30:58 +00004993static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994int fixcapitalize(PyUnicodeObject *self)
4995{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004996 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004997 Py_UNICODE *s = self->str;
4998 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004999
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005000 if (len == 0)
5001 return 0;
5002 if (Py_UNICODE_ISLOWER(*s)) {
5003 *s = Py_UNICODE_TOUPPER(*s);
5004 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005006 s++;
5007 while (--len > 0) {
5008 if (Py_UNICODE_ISUPPER(*s)) {
5009 *s = Py_UNICODE_TOLOWER(*s);
5010 status = 1;
5011 }
5012 s++;
5013 }
5014 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015}
5016
5017static
5018int fixtitle(PyUnicodeObject *self)
5019{
5020 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5021 register Py_UNICODE *e;
5022 int previous_is_cased;
5023
5024 /* Shortcut for single character strings */
5025 if (PyUnicode_GET_SIZE(self) == 1) {
5026 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5027 if (*p != ch) {
5028 *p = ch;
5029 return 1;
5030 }
5031 else
5032 return 0;
5033 }
Tim Petersced69f82003-09-16 20:30:58 +00005034
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 e = p + PyUnicode_GET_SIZE(self);
5036 previous_is_cased = 0;
5037 for (; p < e; p++) {
5038 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005039
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 if (previous_is_cased)
5041 *p = Py_UNICODE_TOLOWER(ch);
5042 else
5043 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005044
5045 if (Py_UNICODE_ISLOWER(ch) ||
5046 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 Py_UNICODE_ISTITLE(ch))
5048 previous_is_cased = 1;
5049 else
5050 previous_is_cased = 0;
5051 }
5052 return 1;
5053}
5054
Tim Peters8ce9f162004-08-27 01:49:32 +00005055PyObject *
5056PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057{
Tim Peters8ce9f162004-08-27 01:49:32 +00005058 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005059 const Py_UNICODE blank = ' ';
5060 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005061 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005062 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005063 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5064 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005065 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5066 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005067 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005068 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005069 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070
Tim Peters05eba1f2004-08-27 21:32:02 +00005071 fseq = PySequence_Fast(seq, "");
5072 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005073 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005074 }
5075
Tim Peters91879ab2004-08-27 22:35:44 +00005076 /* Grrrr. A codec may be invoked to convert str objects to
5077 * Unicode, and so it's possible to call back into Python code
5078 * during PyUnicode_FromObject(), and so it's possible for a sick
5079 * codec to change the size of fseq (if seq is a list). Therefore
5080 * we have to keep refetching the size -- can't assume seqlen
5081 * is invariant.
5082 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005083 seqlen = PySequence_Fast_GET_SIZE(fseq);
5084 /* If empty sequence, return u"". */
5085 if (seqlen == 0) {
5086 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5087 goto Done;
5088 }
5089 /* If singleton sequence with an exact Unicode, return that. */
5090 if (seqlen == 1) {
5091 item = PySequence_Fast_GET_ITEM(fseq, 0);
5092 if (PyUnicode_CheckExact(item)) {
5093 Py_INCREF(item);
5094 res = (PyUnicodeObject *)item;
5095 goto Done;
5096 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005097 }
5098
Tim Peters05eba1f2004-08-27 21:32:02 +00005099 /* At least two items to join, or one that isn't exact Unicode. */
5100 if (seqlen > 1) {
5101 /* Set up sep and seplen -- they're needed. */
5102 if (separator == NULL) {
5103 sep = &blank;
5104 seplen = 1;
5105 }
5106 else {
5107 internal_separator = PyUnicode_FromObject(separator);
5108 if (internal_separator == NULL)
5109 goto onError;
5110 sep = PyUnicode_AS_UNICODE(internal_separator);
5111 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005112 /* In case PyUnicode_FromObject() mutated seq. */
5113 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005114 }
5115 }
5116
5117 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005118 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005119 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005120 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005121 res_p = PyUnicode_AS_UNICODE(res);
5122 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005123
Tim Peters05eba1f2004-08-27 21:32:02 +00005124 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005125 Py_ssize_t itemlen;
5126 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005127
5128 item = PySequence_Fast_GET_ITEM(fseq, i);
5129 /* Convert item to Unicode. */
5130 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5131 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005132 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005133 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005134 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005135 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005136 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005137 item = PyUnicode_FromObject(item);
5138 if (item == NULL)
5139 goto onError;
5140 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005141
Tim Peters91879ab2004-08-27 22:35:44 +00005142 /* In case PyUnicode_FromObject() mutated seq. */
5143 seqlen = PySequence_Fast_GET_SIZE(fseq);
5144
Tim Peters8ce9f162004-08-27 01:49:32 +00005145 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005147 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005148 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005149 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005150 if (i < seqlen - 1) {
5151 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005152 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005153 goto Overflow;
5154 }
5155 if (new_res_used > res_alloc) {
5156 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005157 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005158 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005159 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005160 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005161 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005162 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005163 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005165 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005166 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005168
5169 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005170 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005171 res_p += itemlen;
5172 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005173 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005174 res_p += seplen;
5175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005177 res_used = new_res_used;
5178 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005179
Tim Peters05eba1f2004-08-27 21:32:02 +00005180 /* Shrink res to match the used area; this probably can't fail,
5181 * but it's cheap to check.
5182 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005183 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005184 goto onError;
5185
5186 Done:
5187 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005188 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189 return (PyObject *)res;
5190
Tim Peters8ce9f162004-08-27 01:49:32 +00005191 Overflow:
5192 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005193 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005194 Py_DECREF(item);
5195 /* fall through */
5196
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005198 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005199 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005200 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 return NULL;
5202}
5203
Tim Petersced69f82003-09-16 20:30:58 +00005204static
5205PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t left,
5207 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 Py_UNICODE fill)
5209{
5210 PyUnicodeObject *u;
5211
5212 if (left < 0)
5213 left = 0;
5214 if (right < 0)
5215 right = 0;
5216
Tim Peters7a29bd52001-09-12 03:03:31 +00005217 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 Py_INCREF(self);
5219 return self;
5220 }
5221
5222 u = _PyUnicode_New(left + self->length + right);
5223 if (u) {
5224 if (left)
5225 Py_UNICODE_FILL(u->str, fill, left);
5226 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5227 if (right)
5228 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5229 }
5230
5231 return u;
5232}
5233
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the calling scope for: a `PyObject *str` scratch variable, a
   `list` object, and an `onError` label that is jumped to when either the
   slice cannot be created or PyList_Append() fails.  The temporary
   reference in `str` is always released.

   Wrapped in do { ... } while (0) so the expansion is a single statement
   and is safe under an unbraced if/else; invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5244
5245static
5246PyObject *split_whitespace(PyUnicodeObject *self,
5247 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005248 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005250 register Py_ssize_t i;
5251 register Py_ssize_t j;
5252 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 PyObject *str;
5254
5255 for (i = j = 0; i < len; ) {
5256 /* find a token */
5257 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5258 i++;
5259 j = i;
5260 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5261 i++;
5262 if (j < i) {
5263 if (maxcount-- <= 0)
5264 break;
5265 SPLIT_APPEND(self->str, j, i);
5266 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5267 i++;
5268 j = i;
5269 }
5270 }
5271 if (j < len) {
5272 SPLIT_APPEND(self->str, j, len);
5273 }
5274 return list;
5275
5276 onError:
5277 Py_DECREF(list);
5278 return NULL;
5279}
5280
5281PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005282 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005284 register Py_ssize_t i;
5285 register Py_ssize_t j;
5286 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 PyObject *list;
5288 PyObject *str;
5289 Py_UNICODE *data;
5290
5291 string = PyUnicode_FromObject(string);
5292 if (string == NULL)
5293 return NULL;
5294 data = PyUnicode_AS_UNICODE(string);
5295 len = PyUnicode_GET_SIZE(string);
5296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 list = PyList_New(0);
5298 if (!list)
5299 goto onError;
5300
5301 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005302 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005303
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005305 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
5308 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005309 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 if (i < len) {
5311 if (data[i] == '\r' && i + 1 < len &&
5312 data[i+1] == '\n')
5313 i += 2;
5314 else
5315 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005316 if (keepends)
5317 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 }
Guido van Rossum86662912000-04-11 15:38:46 +00005319 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320 j = i;
5321 }
5322 if (j < len) {
5323 SPLIT_APPEND(data, j, len);
5324 }
5325
5326 Py_DECREF(string);
5327 return list;
5328
5329 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005330 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 Py_DECREF(string);
5332 return NULL;
5333}
5334
Tim Petersced69f82003-09-16 20:30:58 +00005335static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336PyObject *split_char(PyUnicodeObject *self,
5337 PyObject *list,
5338 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005339 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005341 register Py_ssize_t i;
5342 register Py_ssize_t j;
5343 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 PyObject *str;
5345
5346 for (i = j = 0; i < len; ) {
5347 if (self->str[i] == ch) {
5348 if (maxcount-- <= 0)
5349 break;
5350 SPLIT_APPEND(self->str, j, i);
5351 i = j = i + 1;
5352 } else
5353 i++;
5354 }
5355 if (j <= len) {
5356 SPLIT_APPEND(self->str, j, len);
5357 }
5358 return list;
5359
5360 onError:
5361 Py_DECREF(list);
5362 return NULL;
5363}
5364
Tim Petersced69f82003-09-16 20:30:58 +00005365static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366PyObject *split_substring(PyUnicodeObject *self,
5367 PyObject *list,
5368 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005371 register Py_ssize_t i;
5372 register Py_ssize_t j;
5373 Py_ssize_t len = self->length;
5374 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 PyObject *str;
5376
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005377 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 if (Py_UNICODE_MATCH(self, i, substring)) {
5379 if (maxcount-- <= 0)
5380 break;
5381 SPLIT_APPEND(self->str, j, i);
5382 i = j = i + sublen;
5383 } else
5384 i++;
5385 }
5386 if (j <= len) {
5387 SPLIT_APPEND(self->str, j, len);
5388 }
5389 return list;
5390
5391 onError:
5392 Py_DECREF(list);
5393 return NULL;
5394}
5395
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005396static
5397PyObject *rsplit_whitespace(PyUnicodeObject *self,
5398 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005399 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005400{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 register Py_ssize_t i;
5402 register Py_ssize_t j;
5403 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005404 PyObject *str;
5405
5406 for (i = j = len - 1; i >= 0; ) {
5407 /* find a token */
5408 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5409 i--;
5410 j = i;
5411 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5412 i--;
5413 if (j > i) {
5414 if (maxcount-- <= 0)
5415 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005416 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005417 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5418 i--;
5419 j = i;
5420 }
5421 }
5422 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005423 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005424 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005425 if (PyList_Reverse(list) < 0)
5426 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005427 return list;
5428
5429 onError:
5430 Py_DECREF(list);
5431 return NULL;
5432}
5433
5434static
5435PyObject *rsplit_char(PyUnicodeObject *self,
5436 PyObject *list,
5437 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005438 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005439{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005440 register Py_ssize_t i;
5441 register Py_ssize_t j;
5442 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005443 PyObject *str;
5444
5445 for (i = j = len - 1; i >= 0; ) {
5446 if (self->str[i] == ch) {
5447 if (maxcount-- <= 0)
5448 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005449 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005450 j = i = i - 1;
5451 } else
5452 i--;
5453 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005454 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005455 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005456 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005457 if (PyList_Reverse(list) < 0)
5458 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005459 return list;
5460
5461 onError:
5462 Py_DECREF(list);
5463 return NULL;
5464}
5465
5466static
5467PyObject *rsplit_substring(PyUnicodeObject *self,
5468 PyObject *list,
5469 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005470 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005471{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005472 register Py_ssize_t i;
5473 register Py_ssize_t j;
5474 Py_ssize_t len = self->length;
5475 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005476 PyObject *str;
5477
5478 for (i = len - sublen, j = len; i >= 0; ) {
5479 if (Py_UNICODE_MATCH(self, i, substring)) {
5480 if (maxcount-- <= 0)
5481 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005482 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005483 j = i;
5484 i -= sublen;
5485 } else
5486 i--;
5487 }
5488 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005489 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005490 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005491 if (PyList_Reverse(list) < 0)
5492 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005493 return list;
5494
5495 onError:
5496 Py_DECREF(list);
5497 return NULL;
5498}
5499
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500#undef SPLIT_APPEND
5501
5502static
5503PyObject *split(PyUnicodeObject *self,
5504 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005505 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506{
5507 PyObject *list;
5508
5509 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005510 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511
5512 list = PyList_New(0);
5513 if (!list)
5514 return NULL;
5515
5516 if (substring == NULL)
5517 return split_whitespace(self,list,maxcount);
5518
5519 else if (substring->length == 1)
5520 return split_char(self,list,substring->str[0],maxcount);
5521
5522 else if (substring->length == 0) {
5523 Py_DECREF(list);
5524 PyErr_SetString(PyExc_ValueError, "empty separator");
5525 return NULL;
5526 }
5527 else
5528 return split_substring(self,list,substring,maxcount);
5529}
5530
Tim Petersced69f82003-09-16 20:30:58 +00005531static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005532PyObject *rsplit(PyUnicodeObject *self,
5533 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005534 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005535{
5536 PyObject *list;
5537
5538 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005539 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005540
5541 list = PyList_New(0);
5542 if (!list)
5543 return NULL;
5544
5545 if (substring == NULL)
5546 return rsplit_whitespace(self,list,maxcount);
5547
5548 else if (substring->length == 1)
5549 return rsplit_char(self,list,substring->str[0],maxcount);
5550
5551 else if (substring->length == 0) {
5552 Py_DECREF(list);
5553 PyErr_SetString(PyExc_ValueError, "empty separator");
5554 return NULL;
5555 }
5556 else
5557 return rsplit_substring(self,list,substring,maxcount);
5558}
5559
5560static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561PyObject *replace(PyUnicodeObject *self,
5562 PyUnicodeObject *str1,
5563 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005564 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565{
5566 PyUnicodeObject *u;
5567
5568 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005569 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570
Thomas Wouters477c8d52006-05-27 19:21:47 +00005571 if (str1->length == str2->length) {
5572 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005573 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005574 if (str1->length == 1) {
5575 /* replace characters */
5576 Py_UNICODE u1, u2;
5577 if (!findchar(self->str, self->length, str1->str[0]))
5578 goto nothing;
5579 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5580 if (!u)
5581 return NULL;
5582 Py_UNICODE_COPY(u->str, self->str, self->length);
5583 u1 = str1->str[0];
5584 u2 = str2->str[0];
5585 for (i = 0; i < u->length; i++)
5586 if (u->str[i] == u1) {
5587 if (--maxcount < 0)
5588 break;
5589 u->str[i] = u2;
5590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005592 i = fastsearch(
5593 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005595 if (i < 0)
5596 goto nothing;
5597 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5598 if (!u)
5599 return NULL;
5600 Py_UNICODE_COPY(u->str, self->str, self->length);
5601 while (i <= self->length - str1->length)
5602 if (Py_UNICODE_MATCH(self, i, str1)) {
5603 if (--maxcount < 0)
5604 break;
5605 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5606 i += str1->length;
5607 } else
5608 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005611
5612 Py_ssize_t n, i, j, e;
5613 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 Py_UNICODE *p;
5615
5616 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005617 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 if (n > maxcount)
5619 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005620 if (n == 0)
5621 goto nothing;
5622 /* new_size = self->length + n * (str2->length - str1->length)); */
5623 delta = (str2->length - str1->length);
5624 if (delta == 0) {
5625 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005627 product = n * (str2->length - str1->length);
5628 if ((product / (str2->length - str1->length)) != n) {
5629 PyErr_SetString(PyExc_OverflowError,
5630 "replace string is too long");
5631 return NULL;
5632 }
5633 new_size = self->length + product;
5634 if (new_size < 0) {
5635 PyErr_SetString(PyExc_OverflowError,
5636 "replace string is too long");
5637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 }
5639 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005640 u = _PyUnicode_New(new_size);
5641 if (!u)
5642 return NULL;
5643 i = 0;
5644 p = u->str;
5645 e = self->length - str1->length;
5646 if (str1->length > 0) {
5647 while (n-- > 0) {
5648 /* look for next match */
5649 j = i;
5650 while (j <= e) {
5651 if (Py_UNICODE_MATCH(self, j, str1))
5652 break;
5653 j++;
5654 }
5655 if (j > i) {
5656 if (j > e)
5657 break;
5658 /* copy unchanged part [i:j] */
5659 Py_UNICODE_COPY(p, self->str+i, j-i);
5660 p += j - i;
5661 }
5662 /* copy substitution string */
5663 if (str2->length > 0) {
5664 Py_UNICODE_COPY(p, str2->str, str2->length);
5665 p += str2->length;
5666 }
5667 i = j + str1->length;
5668 }
5669 if (i < self->length)
5670 /* copy tail [i:] */
5671 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5672 } else {
5673 /* interleave */
5674 while (n > 0) {
5675 Py_UNICODE_COPY(p, str2->str, str2->length);
5676 p += str2->length;
5677 if (--n <= 0)
5678 break;
5679 *p++ = self->str[i++];
5680 }
5681 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005685
5686nothing:
5687 /* nothing to replace; return original string (when possible) */
5688 if (PyUnicode_CheckExact(self)) {
5689 Py_INCREF(self);
5690 return (PyObject *) self;
5691 }
5692 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693}
5694
5695/* --- Unicode Object Methods --------------------------------------------- */
5696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005697PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698"S.title() -> unicode\n\
5699\n\
5700Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005701characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702
5703static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005704unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 return fixup(self, fixtitle);
5707}
5708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005709PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710"S.capitalize() -> unicode\n\
5711\n\
5712Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005713have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714
5715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005716unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 return fixup(self, fixcapitalize);
5719}
5720
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, replaces every word in the resulting list by
   its fixup(..., fixcapitalize) version, and joins the words again with
   PyUnicode_Join(NULL, ...).  Returns the joined string, or NULL if any
   step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the previous entry, so drop it
           by hand before overwriting the slot. */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5758
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005759/* Argument converter. Coerces to a single unicode character */
5760
5761static int
5762convert_uc(PyObject *obj, void *addr)
5763{
5764 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5765 PyObject *uniobj;
5766 Py_UNICODE *unistr;
5767
5768 uniobj = PyUnicode_FromObject(obj);
5769 if (uniobj == NULL) {
5770 PyErr_SetString(PyExc_TypeError,
5771 "The fill character cannot be converted to Unicode");
5772 return 0;
5773 }
5774 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5775 PyErr_SetString(PyExc_TypeError,
5776 "The fill character must be exactly one character long");
5777 Py_DECREF(uniobj);
5778 return 0;
5779 }
5780 unistr = PyUnicode_AS_UNICODE(uniobj);
5781 *fillcharloc = unistr[0];
5782 Py_DECREF(uniobj);
5783 return 1;
5784}
5785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005786PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005787"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005789Return S centered in a Unicode string of length width. Padding is\n\
5790done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791
5792static PyObject *
5793unicode_center(PyUnicodeObject *self, PyObject *args)
5794{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005795 Py_ssize_t marg, left;
5796 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005797 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798
Thomas Woutersde017742006-02-16 19:34:37 +00005799 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 return NULL;
5801
Tim Peters7a29bd52001-09-12 03:03:31 +00005802 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 Py_INCREF(self);
5804 return (PyObject*) self;
5805 }
5806
5807 marg = width - self->length;
5808 left = marg / 2 + (marg & width & 1);
5809
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005810 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811}
5812
Marc-André Lemburge5034372000-08-08 08:04:29 +00005813#if 0
5814
5815/* This code should go into some future Unicode collation support
5816 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005817 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005818
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005819/* speedy UTF-16 code point order comparison */
5820/* gleaned from: */
5821/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5822
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005823static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005824{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005825 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005826 0, 0, 0, 0, 0, 0, 0, 0,
5827 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005828 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005829};
5830
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831static int
5832unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5833{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005834 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005835
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 Py_UNICODE *s1 = str1->str;
5837 Py_UNICODE *s2 = str2->str;
5838
5839 len1 = str1->length;
5840 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005841
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005843 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005844
5845 c1 = *s1++;
5846 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005847
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005848 if (c1 > (1<<11) * 26)
5849 c1 += utf16Fixup[c1>>11];
5850 if (c2 > (1<<11) * 26)
5851 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005852 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005853
5854 if (c1 != c2)
5855 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005856
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005857 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 }
5859
5860 return (len1 < len2) ? -1 : (len1 != len2);
5861}
5862
Marc-André Lemburge5034372000-08-08 08:04:29 +00005863#else
5864
5865static int
5866unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005869
5870 Py_UNICODE *s1 = str1->str;
5871 Py_UNICODE *s2 = str2->str;
5872
5873 len1 = str1->length;
5874 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005875
Marc-André Lemburge5034372000-08-08 08:04:29 +00005876 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005877 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005878
Fredrik Lundh45714e92001-06-26 16:39:36 +00005879 c1 = *s1++;
5880 c2 = *s2++;
5881
5882 if (c1 != c2)
5883 return (c1 < c2) ? -1 : 1;
5884
Marc-André Lemburge5034372000-08-08 08:04:29 +00005885 len1--; len2--;
5886 }
5887
5888 return (len1 < len2) ? -1 : (len1 != len2);
5889}
5890
5891#endif
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893int PyUnicode_Compare(PyObject *left,
5894 PyObject *right)
5895{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005896 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5897 return unicode_compare((PyUnicodeObject *)left,
5898 (PyUnicodeObject *)right);
5899 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5900 (PyUnicode_Check(left) && PyString_Check(right))) {
5901 if (PyUnicode_Check(left))
5902 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5903 if (PyUnicode_Check(right))
5904 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5905 assert(PyString_Check(left));
5906 assert(PyString_Check(right));
5907 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005909 PyErr_Format(PyExc_TypeError,
5910 "Can't compare %.100s and %.100s",
5911 left->ob_type->tp_name,
5912 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 return -1;
5914}
5915
Martin v. Löwis5b222132007-06-10 09:51:05 +00005916int
5917PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5918{
5919 int i;
5920 Py_UNICODE *id;
5921 assert(PyUnicode_Check(uni));
5922 id = PyUnicode_AS_UNICODE(uni);
5923 /* Compare Unicode string and source character set string */
5924 for (i = 0; id[i] && str[i]; i++)
5925 if (id[i] != str[i])
5926 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5927 if (id[i])
5928 return 1; /* uni is longer */
5929 if (str[i])
5930 return -1; /* str is longer */
5931 return 0;
5932}
5933
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005934PyObject *PyUnicode_RichCompare(PyObject *left,
5935 PyObject *right,
5936 int op)
5937{
5938 int result;
5939
5940 result = PyUnicode_Compare(left, right);
5941 if (result == -1 && PyErr_Occurred())
5942 goto onError;
5943
5944 /* Convert the return value to a Boolean */
5945 switch (op) {
5946 case Py_EQ:
5947 result = (result == 0);
5948 break;
5949 case Py_NE:
5950 result = (result != 0);
5951 break;
5952 case Py_LE:
5953 result = (result <= 0);
5954 break;
5955 case Py_GE:
5956 result = (result >= 0);
5957 break;
5958 case Py_LT:
5959 result = (result == -1);
5960 break;
5961 case Py_GT:
5962 result = (result == 1);
5963 break;
5964 }
5965 return PyBool_FromLong(result);
5966
5967 onError:
5968
5969 /* Standard case
5970
5971 Type errors mean that PyUnicode_FromObject() could not convert
5972 one of the arguments (usually the right hand side) to Unicode,
5973 ie. we can't handle the comparison request. However, it is
5974 possible that the other object knows a comparison method, which
5975 is why we return Py_NotImplemented to give the other object a
5976 chance.
5977
5978 */
5979 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5980 PyErr_Clear();
5981 Py_INCREF(Py_NotImplemented);
5982 return Py_NotImplemented;
5983 }
5984 if (op != Py_EQ && op != Py_NE)
5985 return NULL;
5986
5987 /* Equality comparison.
5988
5989 This is a special case: we silence any PyExc_UnicodeDecodeError
5990 and instead turn it into a PyErr_UnicodeWarning.
5991
5992 */
5993 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5994 return NULL;
5995 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00005996 if (PyErr_WarnEx(PyExc_UnicodeWarning,
5997 (op == Py_EQ) ?
5998 "Unicode equal comparison "
5999 "failed to convert both arguments to Unicode - "
6000 "interpreting them as being unequal"
6001 :
6002 "Unicode unequal comparison "
6003 "failed to convert both arguments to Unicode - "
6004 "interpreting them as being unequal",
6005 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006006 return NULL;
6007 result = (op == Py_NE);
6008 return PyBool_FromLong(result);
6009}
6010
Guido van Rossum403d68b2000-03-13 15:55:09 +00006011int PyUnicode_Contains(PyObject *container,
6012 PyObject *element)
6013{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006014 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006015 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006016
6017 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006018 sub = PyUnicode_FromObject(element);
6019 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006020 PyErr_Format(PyExc_TypeError,
6021 "'in <string>' requires string as left operand, not %s",
6022 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006023 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006024 }
6025
Thomas Wouters477c8d52006-05-27 19:21:47 +00006026 str = PyUnicode_FromObject(container);
6027 if (!str) {
6028 Py_DECREF(sub);
6029 return -1;
6030 }
6031
6032 result = stringlib_contains_obj(str, sub);
6033
6034 Py_DECREF(str);
6035 Py_DECREF(sub);
6036
Guido van Rossum403d68b2000-03-13 15:55:09 +00006037 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006038}
6039
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040/* Concat to string or Unicode object giving a new Unicode object. */
6041
6042PyObject *PyUnicode_Concat(PyObject *left,
6043 PyObject *right)
6044{
6045 PyUnicodeObject *u = NULL, *v = NULL, *w;
6046
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006047 if (PyBytes_Check(left) || PyBytes_Check(right))
6048 return PyBytes_Concat(left, right);
6049
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 /* Coerce the two arguments */
6051 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6052 if (u == NULL)
6053 goto onError;
6054 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6055 if (v == NULL)
6056 goto onError;
6057
6058 /* Shortcuts */
6059 if (v == unicode_empty) {
6060 Py_DECREF(v);
6061 return (PyObject *)u;
6062 }
6063 if (u == unicode_empty) {
6064 Py_DECREF(u);
6065 return (PyObject *)v;
6066 }
6067
6068 /* Concat the two Unicode strings */
6069 w = _PyUnicode_New(u->length + v->length);
6070 if (w == NULL)
6071 goto onError;
6072 Py_UNICODE_COPY(w->str, u->str, u->length);
6073 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6074
6075 Py_DECREF(u);
6076 Py_DECREF(v);
6077 return (PyObject *)w;
6078
6079onError:
6080 Py_XDECREF(u);
6081 Py_XDECREF(v);
6082 return NULL;
6083}
6084
Walter Dörwald1ab83302007-05-18 17:15:44 +00006085void
6086PyUnicode_Append(PyObject **pleft, PyObject *right)
6087{
6088 PyObject *new;
6089 if (*pleft == NULL)
6090 return;
6091 if (right == NULL || !PyUnicode_Check(*pleft)) {
6092 Py_DECREF(*pleft);
6093 *pleft = NULL;
6094 return;
6095 }
6096 new = PyUnicode_Concat(*pleft, right);
6097 Py_DECREF(*pleft);
6098 *pleft = new;
6099}
6100
6101void
6102PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6103{
6104 PyUnicode_Append(pleft, right);
6105 Py_XDECREF(right);
6106}
6107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006108PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109"S.count(sub[, start[, end]]) -> int\n\
6110\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006111Return the number of non-overlapping occurrences of substring sub in\n\
6112Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006113interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114
6115static PyObject *
6116unicode_count(PyUnicodeObject *self, PyObject *args)
6117{
6118 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006119 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006120 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 PyObject *result;
6122
Guido van Rossumb8872e62000-05-09 14:14:27 +00006123 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6124 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 return NULL;
6126
6127 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006128 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 if (substring == NULL)
6130 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006131
Thomas Wouters477c8d52006-05-27 19:21:47 +00006132 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
Thomas Wouters477c8d52006-05-27 19:21:47 +00006134 result = PyInt_FromSsize_t(
6135 stringlib_count(self->str + start, end - start,
6136 substring->str, substring->length)
6137 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
6139 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006140
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 return result;
6142}
6143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006144PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006145"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006147Encodes S using the codec registered for encoding. encoding defaults\n\
6148to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006149handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6151'xmlcharrefreplace' as well as any other name registered with\n\
6152codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153
6154static PyObject *
6155unicode_encode(PyUnicodeObject *self, PyObject *args)
6156{
6157 char *encoding = NULL;
6158 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006159 PyObject *v;
6160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6162 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006163 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006164 if (v == NULL)
6165 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006166 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006167 if (PyString_Check(v)) {
6168 /* Old codec, turn it into bytes */
6169 PyObject *b = PyBytes_FromObject(v);
6170 Py_DECREF(v);
6171 return b;
6172 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006173 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006174 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006175 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006176 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006177 Py_DECREF(v);
6178 return NULL;
6179 }
6180 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006181
6182 onError:
6183 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006184}
6185
6186PyDoc_STRVAR(decode__doc__,
6187"S.decode([encoding[,errors]]) -> string or unicode\n\
6188\n\
6189Decodes S using the codec registered for encoding. encoding defaults\n\
6190to the default encoding. errors may be given to set a different error\n\
6191handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6192a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6193as well as any other name registerd with codecs.register_error that is\n\
6194able to handle UnicodeDecodeErrors.");
6195
6196static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006197unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006198{
6199 char *encoding = NULL;
6200 char *errors = NULL;
6201 PyObject *v;
6202
6203 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6204 return NULL;
6205 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006206 if (v == NULL)
6207 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006208 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6209 PyErr_Format(PyExc_TypeError,
6210 "decoder did not return a string/unicode object "
6211 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006212 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006213 Py_DECREF(v);
6214 return NULL;
6215 }
6216 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006217
6218 onError:
6219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220}
6221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006222PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223"S.expandtabs([tabsize]) -> unicode\n\
6224\n\
6225Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006226If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227
6228static PyObject*
6229unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6230{
6231 Py_UNICODE *e;
6232 Py_UNICODE *p;
6233 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006234 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 PyUnicodeObject *u;
6236 int tabsize = 8;
6237
6238 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6239 return NULL;
6240
Thomas Wouters7e474022000-07-16 12:04:32 +00006241 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006242 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 e = self->str + self->length;
6244 for (p = self->str; p < e; p++)
6245 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006246 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006248 if (old_j > j) {
6249 PyErr_SetString(PyExc_OverflowError,
6250 "new string is too long");
6251 return NULL;
6252 }
6253 old_j = j;
6254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 }
6256 else {
6257 j++;
6258 if (*p == '\n' || *p == '\r') {
6259 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006260 old_j = j = 0;
6261 if (i < 0) {
6262 PyErr_SetString(PyExc_OverflowError,
6263 "new string is too long");
6264 return NULL;
6265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 }
6267 }
6268
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006269 if ((i + j) < 0) {
6270 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6271 return NULL;
6272 }
6273
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 /* Second pass: create output string and fill it */
6275 u = _PyUnicode_New(i + j);
6276 if (!u)
6277 return NULL;
6278
6279 j = 0;
6280 q = u->str;
6281
6282 for (p = self->str; p < e; p++)
6283 if (*p == '\t') {
6284 if (tabsize > 0) {
6285 i = tabsize - (j % tabsize);
6286 j += i;
6287 while (i--)
6288 *q++ = ' ';
6289 }
6290 }
6291 else {
6292 j++;
6293 *q++ = *p;
6294 if (*p == '\n' || *p == '\r')
6295 j = 0;
6296 }
6297
6298 return (PyObject*) u;
6299}
6300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006301PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302"S.find(sub [,start [,end]]) -> int\n\
6303\n\
6304Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006305such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306arguments start and end are interpreted as in slice notation.\n\
6307\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006308Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309
6310static PyObject *
6311unicode_find(PyUnicodeObject *self, PyObject *args)
6312{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006313 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006314 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006315 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006316 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317
Guido van Rossumb8872e62000-05-09 14:14:27 +00006318 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6319 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006321 substring = PyUnicode_FromObject(substring);
6322 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 return NULL;
6324
Thomas Wouters477c8d52006-05-27 19:21:47 +00006325 result = stringlib_find_slice(
6326 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6327 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6328 start, end
6329 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330
6331 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006332
6333 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334}
6335
6336static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006337unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338{
6339 if (index < 0 || index >= self->length) {
6340 PyErr_SetString(PyExc_IndexError, "string index out of range");
6341 return NULL;
6342 }
6343
6344 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6345}
6346
6347static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006348unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006350 /* Since Unicode objects compare equal to their UTF-8 string
6351 counterparts, we hash the UTF-8 string. */
6352 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6353 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354}
6355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006356PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357"S.index(sub [,start [,end]]) -> int\n\
6358\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006359Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360
6361static PyObject *
6362unicode_index(PyUnicodeObject *self, PyObject *args)
6363{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006364 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006365 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006366 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006367 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368
Guido van Rossumb8872e62000-05-09 14:14:27 +00006369 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6370 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006372 substring = PyUnicode_FromObject(substring);
6373 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 return NULL;
6375
Thomas Wouters477c8d52006-05-27 19:21:47 +00006376 result = stringlib_find_slice(
6377 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6378 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6379 start, end
6380 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381
6382 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006383
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 if (result < 0) {
6385 PyErr_SetString(PyExc_ValueError, "substring not found");
6386 return NULL;
6387 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006388
Martin v. Löwis18e16552006-02-15 17:27:45 +00006389 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390}
6391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006392PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006393"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006395Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006396at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
6398static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006399unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400{
6401 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6402 register const Py_UNICODE *e;
6403 int cased;
6404
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 /* Shortcut for single character strings */
6406 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006407 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006409 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006410 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006411 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006412
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 e = p + PyUnicode_GET_SIZE(self);
6414 cased = 0;
6415 for (; p < e; p++) {
6416 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006419 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 else if (!cased && Py_UNICODE_ISLOWER(ch))
6421 cased = 1;
6422 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006423 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424}
6425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006426PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006427"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006429Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006430at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431
6432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006433unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
6435 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6436 register const Py_UNICODE *e;
6437 int cased;
6438
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 /* Shortcut for single character strings */
6440 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006441 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006443 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006444 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006446
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 e = p + PyUnicode_GET_SIZE(self);
6448 cased = 0;
6449 for (; p < e; p++) {
6450 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006451
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006453 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 else if (!cased && Py_UNICODE_ISUPPER(ch))
6455 cased = 1;
6456 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006457 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458}
6459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006460PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006461"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006463Return True if S is a titlecased string and there is at least one\n\
6464character in S, i.e. upper- and titlecase characters may only\n\
6465follow uncased characters and lowercase characters only cased ones.\n\
6466Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
6468static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006469unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470{
6471 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6472 register const Py_UNICODE *e;
6473 int cased, previous_is_cased;
6474
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 /* Shortcut for single character strings */
6476 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006477 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6478 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006480 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006481 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006482 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006483
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 e = p + PyUnicode_GET_SIZE(self);
6485 cased = 0;
6486 previous_is_cased = 0;
6487 for (; p < e; p++) {
6488 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006489
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6491 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006492 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 previous_is_cased = 1;
6494 cased = 1;
6495 }
6496 else if (Py_UNICODE_ISLOWER(ch)) {
6497 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006498 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 previous_is_cased = 1;
6500 cased = 1;
6501 }
6502 else
6503 previous_is_cased = 0;
6504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006505 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506}
6507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006508PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006509"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006511Return True if all characters in S are whitespace\n\
6512and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006515unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516{
6517 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6518 register const Py_UNICODE *e;
6519
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 /* Shortcut for single character strings */
6521 if (PyUnicode_GET_SIZE(self) == 1 &&
6522 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006523 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006525 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006526 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006527 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006528
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 e = p + PyUnicode_GET_SIZE(self);
6530 for (; p < e; p++) {
6531 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006532 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006534 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535}
6536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006537PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006538"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006539\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006540Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006541and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006542
6543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006544unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006545{
6546 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6547 register const Py_UNICODE *e;
6548
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006549 /* Shortcut for single character strings */
6550 if (PyUnicode_GET_SIZE(self) == 1 &&
6551 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006552 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006553
6554 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006555 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006556 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006557
6558 e = p + PyUnicode_GET_SIZE(self);
6559 for (; p < e; p++) {
6560 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006561 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006562 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006563 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006564}
6565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006566PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006567"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006568\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006569Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006570and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006571
6572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006573unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006574{
6575 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6576 register const Py_UNICODE *e;
6577
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006578 /* Shortcut for single character strings */
6579 if (PyUnicode_GET_SIZE(self) == 1 &&
6580 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006581 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006582
6583 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006584 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006585 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006586
6587 e = p + PyUnicode_GET_SIZE(self);
6588 for (; p < e; p++) {
6589 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006590 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006591 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006592 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006593}
6594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006595PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006596"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006598Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006599False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600
6601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006602unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603{
6604 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6605 register const Py_UNICODE *e;
6606
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 /* Shortcut for single character strings */
6608 if (PyUnicode_GET_SIZE(self) == 1 &&
6609 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006610 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006612 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006613 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006615
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 e = p + PyUnicode_GET_SIZE(self);
6617 for (; p < e; p++) {
6618 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006619 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006621 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622}
6623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006624PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006625"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006627Return True if all characters in S are digits\n\
6628and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629
6630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006631unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632{
6633 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6634 register const Py_UNICODE *e;
6635
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 /* Shortcut for single character strings */
6637 if (PyUnicode_GET_SIZE(self) == 1 &&
6638 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006639 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006641 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006642 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006643 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006644
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 e = p + PyUnicode_GET_SIZE(self);
6646 for (; p < e; p++) {
6647 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006648 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006650 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651}
6652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006653PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006654"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006656Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006657False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658
6659static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006660unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661{
6662 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6663 register const Py_UNICODE *e;
6664
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 /* Shortcut for single character strings */
6666 if (PyUnicode_GET_SIZE(self) == 1 &&
6667 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006668 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006670 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006671 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006672 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006673
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 e = p + PyUnicode_GET_SIZE(self);
6675 for (; p < e; p++) {
6676 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006677 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006679 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680}
6681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006682PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683"S.join(sequence) -> unicode\n\
6684\n\
6685Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006686sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
6688static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006689unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006691 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692}
6693
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695unicode_length(PyUnicodeObject *self)
6696{
6697 return self->length;
6698}
6699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006700PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006701"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702\n\
6703Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006704done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705
6706static PyObject *
6707unicode_ljust(PyUnicodeObject *self, PyObject *args)
6708{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006709 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006710 Py_UNICODE fillchar = ' ';
6711
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006712 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 return NULL;
6714
Tim Peters7a29bd52001-09-12 03:03:31 +00006715 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 Py_INCREF(self);
6717 return (PyObject*) self;
6718 }
6719
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006720 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721}
6722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006723PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724"S.lower() -> unicode\n\
6725\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006726Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
6728static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006729unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 return fixup(self, fixlower);
6732}
6733
/* Strip modes understood by the strip helpers below */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
6742
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006743/* externally visible for str.strip(unicode) */
6744PyObject *
6745_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6746{
6747 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006748 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006749 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006750 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6751 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006752
Thomas Wouters477c8d52006-05-27 19:21:47 +00006753 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6754
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006755 i = 0;
6756 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006757 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6758 i++;
6759 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006760 }
6761
6762 j = len;
6763 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006764 do {
6765 j--;
6766 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6767 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006768 }
6769
6770 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006771 Py_INCREF(self);
6772 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006773 }
6774 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006775 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006776}
6777
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778
6779static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006780do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006782 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006783 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006784
6785 i = 0;
6786 if (striptype != RIGHTSTRIP) {
6787 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6788 i++;
6789 }
6790 }
6791
6792 j = len;
6793 if (striptype != LEFTSTRIP) {
6794 do {
6795 j--;
6796 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6797 j++;
6798 }
6799
6800 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6801 Py_INCREF(self);
6802 return (PyObject*)self;
6803 }
6804 else
6805 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806}
6807
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006808
6809static PyObject *
6810do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6811{
6812 PyObject *sep = NULL;
6813
6814 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6815 return NULL;
6816
6817 if (sep != NULL && sep != Py_None) {
6818 if (PyUnicode_Check(sep))
6819 return _PyUnicode_XStrip(self, striptype, sep);
6820 else if (PyString_Check(sep)) {
6821 PyObject *res;
6822 sep = PyUnicode_FromObject(sep);
6823 if (sep==NULL)
6824 return NULL;
6825 res = _PyUnicode_XStrip(self, striptype, sep);
6826 Py_DECREF(sep);
6827 return res;
6828 }
6829 else {
6830 PyErr_Format(PyExc_TypeError,
6831 "%s arg must be None, unicode or str",
6832 STRIPNAME(striptype));
6833 return NULL;
6834 }
6835 }
6836
6837 return do_strip(self, striptype);
6838}
6839
6840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006841PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006842"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006843\n\
6844Return a copy of the string S with leading and trailing\n\
6845whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006846If chars is given and not None, remove characters in chars instead.\n\
6847If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006848
6849static PyObject *
6850unicode_strip(PyUnicodeObject *self, PyObject *args)
6851{
6852 if (PyTuple_GET_SIZE(args) == 0)
6853 return do_strip(self, BOTHSTRIP); /* Common case */
6854 else
6855 return do_argstrip(self, BOTHSTRIP, args);
6856}
6857
6858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006859PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006860"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006861\n\
6862Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006863If chars is given and not None, remove characters in chars instead.\n\
6864If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006865
6866static PyObject *
6867unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6868{
6869 if (PyTuple_GET_SIZE(args) == 0)
6870 return do_strip(self, LEFTSTRIP); /* Common case */
6871 else
6872 return do_argstrip(self, LEFTSTRIP, args);
6873}
6874
6875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006876PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006877"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006878\n\
6879Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006880If chars is given and not None, remove characters in chars instead.\n\
6881If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006882
6883static PyObject *
6884unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6885{
6886 if (PyTuple_GET_SIZE(args) == 0)
6887 return do_strip(self, RIGHTSTRIP); /* Common case */
6888 else
6889 return do_argstrip(self, RIGHTSTRIP, args);
6890}
6891
6892
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006894unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895{
6896 PyUnicodeObject *u;
6897 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006899 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
6901 if (len < 0)
6902 len = 0;
6903
Tim Peters7a29bd52001-09-12 03:03:31 +00006904 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 /* no repeat, return original string */
6906 Py_INCREF(str);
6907 return (PyObject*) str;
6908 }
Tim Peters8f422462000-09-09 06:13:41 +00006909
6910 /* ensure # of chars needed doesn't overflow int and # of bytes
6911 * needed doesn't overflow size_t
6912 */
6913 nchars = len * str->length;
6914 if (len && nchars / len != str->length) {
6915 PyErr_SetString(PyExc_OverflowError,
6916 "repeated string is too long");
6917 return NULL;
6918 }
6919 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6920 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6921 PyErr_SetString(PyExc_OverflowError,
6922 "repeated string is too long");
6923 return NULL;
6924 }
6925 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 if (!u)
6927 return NULL;
6928
6929 p = u->str;
6930
Thomas Wouters477c8d52006-05-27 19:21:47 +00006931 if (str->length == 1 && len > 0) {
6932 Py_UNICODE_FILL(p, str->str[0], len);
6933 } else {
6934 Py_ssize_t done = 0; /* number of characters copied this far */
6935 if (done < nchars) {
6936 Py_UNICODE_COPY(p, str->str, str->length);
6937 done = str->length;
6938 }
6939 while (done < nchars) {
6940 int n = (done <= nchars-done) ? done : nchars-done;
6941 Py_UNICODE_COPY(p+done, p, n);
6942 done += n;
6943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 }
6945
6946 return (PyObject*) u;
6947}
6948
6949PyObject *PyUnicode_Replace(PyObject *obj,
6950 PyObject *subobj,
6951 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006952 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953{
6954 PyObject *self;
6955 PyObject *str1;
6956 PyObject *str2;
6957 PyObject *result;
6958
6959 self = PyUnicode_FromObject(obj);
6960 if (self == NULL)
6961 return NULL;
6962 str1 = PyUnicode_FromObject(subobj);
6963 if (str1 == NULL) {
6964 Py_DECREF(self);
6965 return NULL;
6966 }
6967 str2 = PyUnicode_FromObject(replobj);
6968 if (str2 == NULL) {
6969 Py_DECREF(self);
6970 Py_DECREF(str1);
6971 return NULL;
6972 }
Tim Petersced69f82003-09-16 20:30:58 +00006973 result = replace((PyUnicodeObject *)self,
6974 (PyUnicodeObject *)str1,
6975 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 maxcount);
6977 Py_DECREF(self);
6978 Py_DECREF(str1);
6979 Py_DECREF(str2);
6980 return result;
6981}
6982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006983PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984"S.replace (old, new[, maxsplit]) -> unicode\n\
6985\n\
6986Return a copy of S with all occurrences of substring\n\
6987old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006988given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989
6990static PyObject*
6991unicode_replace(PyUnicodeObject *self, PyObject *args)
6992{
6993 PyUnicodeObject *str1;
6994 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006995 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 PyObject *result;
6997
Martin v. Löwis18e16552006-02-15 17:27:45 +00006998 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 return NULL;
7000 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7001 if (str1 == NULL)
7002 return NULL;
7003 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007004 if (str2 == NULL) {
7005 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008
7009 result = replace(self, str1, str2, maxcount);
7010
7011 Py_DECREF(str1);
7012 Py_DECREF(str2);
7013 return result;
7014}
7015
7016static
7017PyObject *unicode_repr(PyObject *unicode)
7018{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007019 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007020 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007021 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7022 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7023
7024 /* XXX(nnorwitz): rather than over-allocating, it would be
7025 better to choose a different scheme. Perhaps scan the
7026 first N-chars of the string and allocate based on that size.
7027 */
7028 /* Initial allocation is based on the longest-possible unichr
7029 escape.
7030
7031 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7032 unichr, so in this case it's the longest unichr escape. In
7033 narrow (UTF-16) builds this is five chars per source unichr
7034 since there are two unichrs in the surrogate pair, so in narrow
7035 (UTF-16) builds it's not the longest unichr escape.
7036
7037 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7038 so in the narrow (UTF-16) build case it's the longest unichr
7039 escape.
7040 */
7041
Walter Dörwald1ab83302007-05-18 17:15:44 +00007042 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007043 2 /* quotes */
7044#ifdef Py_UNICODE_WIDE
7045 + 10*size
7046#else
7047 + 6*size
7048#endif
7049 + 1);
7050 if (repr == NULL)
7051 return NULL;
7052
Walter Dörwald1ab83302007-05-18 17:15:44 +00007053 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007054
7055 /* Add quote */
7056 *p++ = (findchar(s, size, '\'') &&
7057 !findchar(s, size, '"')) ? '"' : '\'';
7058 while (size-- > 0) {
7059 Py_UNICODE ch = *s++;
7060
7061 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007062 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007063 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007064 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007065 continue;
7066 }
7067
7068#ifdef Py_UNICODE_WIDE
7069 /* Map 21-bit characters to '\U00xxxxxx' */
7070 else if (ch >= 0x10000) {
7071 *p++ = '\\';
7072 *p++ = 'U';
7073 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7074 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7075 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7076 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7077 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7078 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7079 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7080 *p++ = hexdigits[ch & 0x0000000F];
7081 continue;
7082 }
7083#else
7084 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7085 else if (ch >= 0xD800 && ch < 0xDC00) {
7086 Py_UNICODE ch2;
7087 Py_UCS4 ucs;
7088
7089 ch2 = *s++;
7090 size--;
7091 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7092 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7093 *p++ = '\\';
7094 *p++ = 'U';
7095 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7096 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7097 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7098 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7099 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7100 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7101 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7102 *p++ = hexdigits[ucs & 0x0000000F];
7103 continue;
7104 }
7105 /* Fall through: isolated surrogates are copied as-is */
7106 s--;
7107 size++;
7108 }
7109#endif
7110
7111 /* Map 16-bit characters to '\uxxxx' */
7112 if (ch >= 256) {
7113 *p++ = '\\';
7114 *p++ = 'u';
7115 *p++ = hexdigits[(ch >> 12) & 0x000F];
7116 *p++ = hexdigits[(ch >> 8) & 0x000F];
7117 *p++ = hexdigits[(ch >> 4) & 0x000F];
7118 *p++ = hexdigits[ch & 0x000F];
7119 }
7120
7121 /* Map special whitespace to '\t', \n', '\r' */
7122 else if (ch == '\t') {
7123 *p++ = '\\';
7124 *p++ = 't';
7125 }
7126 else if (ch == '\n') {
7127 *p++ = '\\';
7128 *p++ = 'n';
7129 }
7130 else if (ch == '\r') {
7131 *p++ = '\\';
7132 *p++ = 'r';
7133 }
7134
7135 /* Map non-printable US ASCII to '\xhh' */
7136 else if (ch < ' ' || ch >= 0x7F) {
7137 *p++ = '\\';
7138 *p++ = 'x';
7139 *p++ = hexdigits[(ch >> 4) & 0x000F];
7140 *p++ = hexdigits[ch & 0x000F];
7141 }
7142
7143 /* Copy everything else as-is */
7144 else
7145 *p++ = (char) ch;
7146 }
7147 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007148 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007149
7150 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007151 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007152 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153}
7154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007155PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156"S.rfind(sub [,start [,end]]) -> int\n\
7157\n\
7158Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007159such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160arguments start and end are interpreted as in slice notation.\n\
7161\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007162Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163
7164static PyObject *
7165unicode_rfind(PyUnicodeObject *self, PyObject *args)
7166{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007168 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007169 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007170 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171
Guido van Rossumb8872e62000-05-09 14:14:27 +00007172 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7173 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007175 substring = PyUnicode_FromObject(substring);
7176 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 return NULL;
7178
Thomas Wouters477c8d52006-05-27 19:21:47 +00007179 result = stringlib_rfind_slice(
7180 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7181 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7182 start, end
7183 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184
7185 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007186
7187 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188}
7189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007190PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191"S.rindex(sub [,start [,end]]) -> int\n\
7192\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194
7195static PyObject *
7196unicode_rindex(PyUnicodeObject *self, PyObject *args)
7197{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007198 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007199 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007200 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007201 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202
Guido van Rossumb8872e62000-05-09 14:14:27 +00007203 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7204 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007206 substring = PyUnicode_FromObject(substring);
7207 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 return NULL;
7209
Thomas Wouters477c8d52006-05-27 19:21:47 +00007210 result = stringlib_rfind_slice(
7211 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7212 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7213 start, end
7214 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
7216 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007217
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 if (result < 0) {
7219 PyErr_SetString(PyExc_ValueError, "substring not found");
7220 return NULL;
7221 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007222 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223}
7224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007225PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007226"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227\n\
7228Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007229done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
7231static PyObject *
7232unicode_rjust(PyUnicodeObject *self, PyObject *args)
7233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007234 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007235 Py_UNICODE fillchar = ' ';
7236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007237 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 return NULL;
7239
Tim Peters7a29bd52001-09-12 03:03:31 +00007240 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 Py_INCREF(self);
7242 return (PyObject*) self;
7243 }
7244
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007245 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246}
7247
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007249unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250{
7251 /* standard clamping */
7252 if (start < 0)
7253 start = 0;
7254 if (end < 0)
7255 end = 0;
7256 if (end > self->length)
7257 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007258 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 /* full slice, return original string */
7260 Py_INCREF(self);
7261 return (PyObject*) self;
7262 }
7263 if (start > end)
7264 start = end;
7265 /* copy slice */
7266 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7267 end - start);
7268}
7269
7270PyObject *PyUnicode_Split(PyObject *s,
7271 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007272 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273{
7274 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007275
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 s = PyUnicode_FromObject(s);
7277 if (s == NULL)
7278 return NULL;
7279 if (sep != NULL) {
7280 sep = PyUnicode_FromObject(sep);
7281 if (sep == NULL) {
7282 Py_DECREF(s);
7283 return NULL;
7284 }
7285 }
7286
7287 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7288
7289 Py_DECREF(s);
7290 Py_XDECREF(sep);
7291 return result;
7292}
7293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007294PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295"S.split([sep [,maxsplit]]) -> list of strings\n\
7296\n\
7297Return a list of the words in S, using sep as the\n\
7298delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007299splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007300any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301
7302static PyObject*
7303unicode_split(PyUnicodeObject *self, PyObject *args)
7304{
7305 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307
Martin v. Löwis18e16552006-02-15 17:27:45 +00007308 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 return NULL;
7310
7311 if (substring == Py_None)
7312 return split(self, NULL, maxcount);
7313 else if (PyUnicode_Check(substring))
7314 return split(self, (PyUnicodeObject *)substring, maxcount);
7315 else
7316 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7317}
7318
Thomas Wouters477c8d52006-05-27 19:21:47 +00007319PyObject *
7320PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7321{
7322 PyObject* str_obj;
7323 PyObject* sep_obj;
7324 PyObject* out;
7325
7326 str_obj = PyUnicode_FromObject(str_in);
7327 if (!str_obj)
7328 return NULL;
7329 sep_obj = PyUnicode_FromObject(sep_in);
7330 if (!sep_obj) {
7331 Py_DECREF(str_obj);
7332 return NULL;
7333 }
7334
7335 out = stringlib_partition(
7336 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7337 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7338 );
7339
7340 Py_DECREF(sep_obj);
7341 Py_DECREF(str_obj);
7342
7343 return out;
7344}
7345
7346
7347PyObject *
7348PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7349{
7350 PyObject* str_obj;
7351 PyObject* sep_obj;
7352 PyObject* out;
7353
7354 str_obj = PyUnicode_FromObject(str_in);
7355 if (!str_obj)
7356 return NULL;
7357 sep_obj = PyUnicode_FromObject(sep_in);
7358 if (!sep_obj) {
7359 Py_DECREF(str_obj);
7360 return NULL;
7361 }
7362
7363 out = stringlib_rpartition(
7364 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7365 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7366 );
7367
7368 Py_DECREF(sep_obj);
7369 Py_DECREF(str_obj);
7370
7371 return out;
7372}
7373
7374PyDoc_STRVAR(partition__doc__,
7375"S.partition(sep) -> (head, sep, tail)\n\
7376\n\
7377Searches for the separator sep in S, and returns the part before it,\n\
7378the separator itself, and the part after it. If the separator is not\n\
7379found, returns S and two empty strings.");
7380
7381static PyObject*
7382unicode_partition(PyUnicodeObject *self, PyObject *separator)
7383{
7384 return PyUnicode_Partition((PyObject *)self, separator);
7385}
7386
7387PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007388"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007389\n\
7390Searches for the separator sep in S, starting at the end of S, and returns\n\
7391the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007392separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007393
7394static PyObject*
7395unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7396{
7397 return PyUnicode_RPartition((PyObject *)self, separator);
7398}
7399
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007400PyObject *PyUnicode_RSplit(PyObject *s,
7401 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007402 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007403{
7404 PyObject *result;
7405
7406 s = PyUnicode_FromObject(s);
7407 if (s == NULL)
7408 return NULL;
7409 if (sep != NULL) {
7410 sep = PyUnicode_FromObject(sep);
7411 if (sep == NULL) {
7412 Py_DECREF(s);
7413 return NULL;
7414 }
7415 }
7416
7417 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7418
7419 Py_DECREF(s);
7420 Py_XDECREF(sep);
7421 return result;
7422}
7423
7424PyDoc_STRVAR(rsplit__doc__,
7425"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7426\n\
7427Return a list of the words in S, using sep as the\n\
7428delimiter string, starting at the end of the string and\n\
7429working to the front. If maxsplit is given, at most maxsplit\n\
7430splits are done. If sep is not specified, any whitespace string\n\
7431is a separator.");
7432
7433static PyObject*
7434unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7435{
7436 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007437 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007438
Martin v. Löwis18e16552006-02-15 17:27:45 +00007439 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007440 return NULL;
7441
7442 if (substring == Py_None)
7443 return rsplit(self, NULL, maxcount);
7444 else if (PyUnicode_Check(substring))
7445 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7446 else
7447 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7448}
7449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007450PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007451"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452\n\
7453Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007454Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007455is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456
7457static PyObject*
7458unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7459{
Guido van Rossum86662912000-04-11 15:38:46 +00007460 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461
Guido van Rossum86662912000-04-11 15:38:46 +00007462 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 return NULL;
7464
Guido van Rossum86662912000-04-11 15:38:46 +00007465 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466}
7467
7468static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007469PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470{
Walter Dörwald346737f2007-05-31 10:44:43 +00007471 if (PyUnicode_CheckExact(self)) {
7472 Py_INCREF(self);
7473 return self;
7474 } else
7475 /* Subtype -- return genuine unicode string with the same value. */
7476 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7477 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478}
7479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007480PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481"S.swapcase() -> unicode\n\
7482\n\
7483Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007484and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
7486static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007487unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 return fixup(self, fixswapcase);
7490}
7491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493"S.translate(table) -> unicode\n\
7494\n\
7495Return a copy of the string S, where all characters have been mapped\n\
7496through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007497Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7498Unmapped characters are left untouched. Characters mapped to None\n\
7499are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500
7501static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007502unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503{
Tim Petersced69f82003-09-16 20:30:58 +00007504 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007506 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507 "ignore");
7508}
7509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007510PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511"S.upper() -> unicode\n\
7512\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007513Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514
7515static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007516unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 return fixup(self, fixupper);
7519}
7520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007521PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522"S.zfill(width) -> unicode\n\
7523\n\
7524Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007525of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526
7527static PyObject *
7528unicode_zfill(PyUnicodeObject *self, PyObject *args)
7529{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007530 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 PyUnicodeObject *u;
7532
Martin v. Löwis18e16552006-02-15 17:27:45 +00007533 Py_ssize_t width;
7534 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535 return NULL;
7536
7537 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007538 if (PyUnicode_CheckExact(self)) {
7539 Py_INCREF(self);
7540 return (PyObject*) self;
7541 }
7542 else
7543 return PyUnicode_FromUnicode(
7544 PyUnicode_AS_UNICODE(self),
7545 PyUnicode_GET_SIZE(self)
7546 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 }
7548
7549 fill = width - self->length;
7550
7551 u = pad(self, fill, 0, '0');
7552
Walter Dörwald068325e2002-04-15 13:36:47 +00007553 if (u == NULL)
7554 return NULL;
7555
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 if (u->str[fill] == '+' || u->str[fill] == '-') {
7557 /* move sign to beginning of string */
7558 u->str[0] = u->str[fill];
7559 u->str[fill] = '0';
7560 }
7561
7562 return (PyObject*) u;
7563}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564
#if 0
/* Compiled out.  Would return the current value of
   unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007573PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007574"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007576Return True if S starts with the specified prefix, False otherwise.\n\
7577With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007578With optional end, stop comparing S at that position.\n\
7579prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
7581static PyObject *
7582unicode_startswith(PyUnicodeObject *self,
7583 PyObject *args)
7584{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007585 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007587 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007588 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007589 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007591 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007592 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007594 if (PyTuple_Check(subobj)) {
7595 Py_ssize_t i;
7596 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7597 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7598 PyTuple_GET_ITEM(subobj, i));
7599 if (substring == NULL)
7600 return NULL;
7601 result = tailmatch(self, substring, start, end, -1);
7602 Py_DECREF(substring);
7603 if (result) {
7604 Py_RETURN_TRUE;
7605 }
7606 }
7607 /* nothing matched */
7608 Py_RETURN_FALSE;
7609 }
7610 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007612 return NULL;
7613 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007615 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616}
7617
7618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007619PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007620"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007622Return True if S ends with the specified suffix, False otherwise.\n\
7623With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007624With optional end, stop comparing S at that position.\n\
7625suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
7627static PyObject *
7628unicode_endswith(PyUnicodeObject *self,
7629 PyObject *args)
7630{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007631 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007633 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007634 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007635 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007637 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7638 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007640 if (PyTuple_Check(subobj)) {
7641 Py_ssize_t i;
7642 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7643 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7644 PyTuple_GET_ITEM(subobj, i));
7645 if (substring == NULL)
7646 return NULL;
7647 result = tailmatch(self, substring, start, end, +1);
7648 Py_DECREF(substring);
7649 if (result) {
7650 Py_RETURN_TRUE;
7651 }
7652 }
7653 Py_RETURN_FALSE;
7654 }
7655 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007659 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007661 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662}
7663
7664
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007665
7666static PyObject *
7667unicode_getnewargs(PyUnicodeObject *v)
7668{
7669 return Py_BuildValue("(u#)", v->str, v->length);
7670}
7671
7672
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673static PyMethodDef unicode_methods[] = {
7674
7675 /* Order is according to common usage: often used methods should
7676 appear first, since lookup is done sequentially. */
7677
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007678 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7679 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7680 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007681 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007682 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7683 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7684 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7685 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7686 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7687 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7688 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007689 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007690 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7691 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7692 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007693 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007694 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007695/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7696 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7697 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7698 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007699 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007700 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007701 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007702 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007703 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7704 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7705 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7706 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7707 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7708 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7709 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7710 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7711 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7712 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7713 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7714 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7715 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7716 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007717 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007718#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007719 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720#endif
7721
7722#if 0
7723 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007724 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725#endif
7726
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007727 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 {NULL, NULL}
7729};
7730
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007731static PyObject *
7732unicode_mod(PyObject *v, PyObject *w)
7733{
7734 if (!PyUnicode_Check(v)) {
7735 Py_INCREF(Py_NotImplemented);
7736 return Py_NotImplemented;
7737 }
7738 return PyUnicode_Format(v, w);
7739}
7740
7741static PyNumberMethods unicode_as_number = {
7742 0, /*nb_add*/
7743 0, /*nb_subtract*/
7744 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007745 unicode_mod, /*nb_remainder*/
7746};
7747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007749 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007750 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007751 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7752 (ssizeargfunc) unicode_getitem, /* sq_item */
7753 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 0, /* sq_ass_item */
7755 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007756 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757};
7758
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007759static PyObject*
7760unicode_subscript(PyUnicodeObject* self, PyObject* item)
7761{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007762 if (PyIndex_Check(item)) {
7763 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007764 if (i == -1 && PyErr_Occurred())
7765 return NULL;
7766 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007767 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007768 return unicode_getitem(self, i);
7769 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007770 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007771 Py_UNICODE* source_buf;
7772 Py_UNICODE* result_buf;
7773 PyObject* result;
7774
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007775 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007776 &start, &stop, &step, &slicelength) < 0) {
7777 return NULL;
7778 }
7779
7780 if (slicelength <= 0) {
7781 return PyUnicode_FromUnicode(NULL, 0);
7782 } else {
7783 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007784 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7785 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007786
7787 if (result_buf == NULL)
7788 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007789
7790 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7791 result_buf[i] = source_buf[cur];
7792 }
Tim Petersced69f82003-09-16 20:30:58 +00007793
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007794 result = PyUnicode_FromUnicode(result_buf, slicelength);
7795 PyMem_FREE(result_buf);
7796 return result;
7797 }
7798 } else {
7799 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7800 return NULL;
7801 }
7802}
7803
7804static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007805 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007806 (binaryfunc)unicode_subscript, /* mp_subscript */
7807 (objobjargproc)0, /* mp_ass_subscript */
7808};
7809
Martin v. Löwis18e16552006-02-15 17:27:45 +00007810static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007812 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813 const void **ptr)
7814{
7815 if (index != 0) {
7816 PyErr_SetString(PyExc_SystemError,
7817 "accessing non-existent unicode segment");
7818 return -1;
7819 }
7820 *ptr = (void *) self->str;
7821 return PyUnicode_GET_DATA_SIZE(self);
7822}
7823
Martin v. Löwis18e16552006-02-15 17:27:45 +00007824static Py_ssize_t
7825unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 const void **ptr)
7827{
7828 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007829 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 return -1;
7831}
7832
7833static int
7834unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007835 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836{
7837 if (lenp)
7838 *lenp = PyUnicode_GET_DATA_SIZE(self);
7839 return 1;
7840}
7841
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007842static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007844 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 const void **ptr)
7846{
7847 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007848
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 if (index != 0) {
7850 PyErr_SetString(PyExc_SystemError,
7851 "accessing non-existent unicode segment");
7852 return -1;
7853 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007854 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 if (str == NULL)
7856 return -1;
7857 *ptr = (void *) PyString_AS_STRING(str);
7858 return PyString_GET_SIZE(str);
7859}
7860
7861/* Helpers for PyUnicode_Format() */
7862
7863static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007864getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007866 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 if (argidx < arglen) {
7868 (*p_argidx)++;
7869 if (arglen < 0)
7870 return args;
7871 else
7872 return PyTuple_GetItem(args, argidx);
7873 }
7874 PyErr_SetString(PyExc_TypeError,
7875 "not enough arguments for format string");
7876 return NULL;
7877}
7878
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7884
Martin v. Löwis18e16552006-02-15 17:27:45 +00007885static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007886strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007888 register Py_ssize_t i;
7889 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 for (i = len - 1; i >= 0; i--)
7891 buffer[i] = (Py_UNICODE) charbuffer[i];
7892
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 return len;
7894}
7895
Neal Norwitzfc76d632006-01-10 06:03:13 +00007896static int
7897doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7898{
Tim Peters15231542006-02-16 01:08:01 +00007899 Py_ssize_t result;
7900
Neal Norwitzfc76d632006-01-10 06:03:13 +00007901 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007902 result = strtounicode(buffer, (char *)buffer);
7903 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007904}
7905
7906static int
7907longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7908{
Tim Peters15231542006-02-16 01:08:01 +00007909 Py_ssize_t result;
7910
Neal Norwitzfc76d632006-01-10 06:03:13 +00007911 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007912 result = strtounicode(buffer, (char *)buffer);
7913 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007914}
7915
Guido van Rossum078151d2002-08-11 04:24:12 +00007916/* XXX To save some code duplication, formatfloat/long/int could have been
7917 shared with stringobject.c, converting from 8-bit to Unicode after the
7918 formatting is done. */
7919
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920static int
7921formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007922 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 int flags,
7924 int prec,
7925 int type,
7926 PyObject *v)
7927{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007928 /* fmt = '%#.' + `prec` + `type`
7929 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 char fmt[20];
7931 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007932
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 x = PyFloat_AsDouble(v);
7934 if (x == -1.0 && PyErr_Occurred())
7935 return -1;
7936 if (prec < 0)
7937 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7939 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007940 /* Worst case length calc to ensure no buffer overrun:
7941
7942 'g' formats:
7943 fmt = %#.<prec>g
7944 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7945 for any double rep.)
7946 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7947
7948 'f' formats:
7949 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7950 len = 1 + 50 + 1 + prec = 52 + prec
7951
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007952 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007953 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007954
7955 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007956 if (((type == 'g' || type == 'G') &&
7957 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007958 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007959 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007960 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007961 return -1;
7962 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007963 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7964 (flags&F_ALT) ? "#" : "",
7965 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007966 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967}
7968
Tim Peters38fd5b62000-09-21 05:43:11 +00007969static PyObject*
7970formatlong(PyObject *val, int flags, int prec, int type)
7971{
7972 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007973 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007974 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007975 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007976
7977 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7978 if (!str)
7979 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007980 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007981 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007982 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007983}
7984
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985static int
7986formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007987 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 int flags,
7989 int prec,
7990 int type,
7991 PyObject *v)
7992{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007993 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007994 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7995 * + 1 + 1
7996 * = 24
7997 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007998 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007999 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 long x;
8001
8002 x = PyInt_AsLong(v);
8003 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008004 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008005 if (x < 0 && type == 'u') {
8006 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008007 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008008 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8009 sign = "-";
8010 else
8011 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008013 prec = 1;
8014
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008015 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8016 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008017 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008018 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008019 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008020 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008021 return -1;
8022 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008023
8024 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008025 (type == 'x' || type == 'X' || type == 'o')) {
8026 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008027 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008028 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008029 * - when 0 is being converted, the C standard leaves off
8030 * the '0x' or '0X', which is inconsistent with other
8031 * %#x/%#X conversions and inconsistent with Python's
8032 * hex() function
8033 * - there are platforms that violate the standard and
8034 * convert 0 with the '0x' or '0X'
8035 * (Metrowerks, Compaq Tru64)
8036 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008037 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008038 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008039 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008040 * We can achieve the desired consistency by inserting our
8041 * own '0x' or '0X' prefix, and substituting %x/%X in place
8042 * of %#x/%#X.
8043 *
8044 * Note that this is the same approach as used in
8045 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008046 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008047 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8048 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008049 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008050 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008051 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8052 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008053 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008054 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008055 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008056 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008057 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008058 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059}
8060
8061static int
8062formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008063 size_t buflen,
8064 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008066 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008067 if (PyUnicode_Check(v)) {
8068 if (PyUnicode_GET_SIZE(v) != 1)
8069 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008073 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008074 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008075 goto onError;
8076 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078
8079 else {
8080 /* Integer input truncated to a character */
8081 long x;
8082 x = PyInt_AsLong(v);
8083 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008084 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008085#ifdef Py_UNICODE_WIDE
8086 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008087 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008088 "%c arg not in range(0x110000) "
8089 "(wide Python build)");
8090 return -1;
8091 }
8092#else
8093 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008094 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008095 "%c arg not in range(0x10000) "
8096 "(narrow Python build)");
8097 return -1;
8098 }
8099#endif
8100 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 }
8102 buf[1] = '\0';
8103 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008104
8105 onError:
8106 PyErr_SetString(PyExc_TypeError,
8107 "%c requires int or char");
8108 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109}
8110
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used (e.g. after a unary operator such as
   sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8120
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121PyObject *PyUnicode_Format(PyObject *format,
8122 PyObject *args)
8123{
8124 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008125 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 int args_owned = 0;
8127 PyUnicodeObject *result = NULL;
8128 PyObject *dict = NULL;
8129 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008130
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 if (format == NULL || args == NULL) {
8132 PyErr_BadInternalCall();
8133 return NULL;
8134 }
8135 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008136 if (uformat == NULL)
8137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 fmt = PyUnicode_AS_UNICODE(uformat);
8139 fmtcnt = PyUnicode_GET_SIZE(uformat);
8140
8141 reslen = rescnt = fmtcnt + 100;
8142 result = _PyUnicode_New(reslen);
8143 if (result == NULL)
8144 goto onError;
8145 res = PyUnicode_AS_UNICODE(result);
8146
8147 if (PyTuple_Check(args)) {
8148 arglen = PyTuple_Size(args);
8149 argidx = 0;
8150 }
8151 else {
8152 arglen = -1;
8153 argidx = -2;
8154 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008155 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008156 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 dict = args;
8158
8159 while (--fmtcnt >= 0) {
8160 if (*fmt != '%') {
8161 if (--rescnt < 0) {
8162 rescnt = fmtcnt + 100;
8163 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008164 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008165 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8167 --rescnt;
8168 }
8169 *res++ = *fmt++;
8170 }
8171 else {
8172 /* Got a format specifier */
8173 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008174 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176 Py_UNICODE c = '\0';
8177 Py_UNICODE fill;
8178 PyObject *v = NULL;
8179 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008180 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008182 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008183 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184
8185 fmt++;
8186 if (*fmt == '(') {
8187 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008188 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189 PyObject *key;
8190 int pcount = 1;
8191
8192 if (dict == NULL) {
8193 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008194 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 goto onError;
8196 }
8197 ++fmt;
8198 --fmtcnt;
8199 keystart = fmt;
8200 /* Skip over balanced parentheses */
8201 while (pcount > 0 && --fmtcnt >= 0) {
8202 if (*fmt == ')')
8203 --pcount;
8204 else if (*fmt == '(')
8205 ++pcount;
8206 fmt++;
8207 }
8208 keylen = fmt - keystart - 1;
8209 if (fmtcnt < 0 || pcount > 0) {
8210 PyErr_SetString(PyExc_ValueError,
8211 "incomplete format key");
8212 goto onError;
8213 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008214#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008215 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 then looked up since Python uses strings to hold
8217 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008218 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 key = PyUnicode_EncodeUTF8(keystart,
8220 keylen,
8221 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008222#else
8223 key = PyUnicode_FromUnicode(keystart, keylen);
8224#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 if (key == NULL)
8226 goto onError;
8227 if (args_owned) {
8228 Py_DECREF(args);
8229 args_owned = 0;
8230 }
8231 args = PyObject_GetItem(dict, key);
8232 Py_DECREF(key);
8233 if (args == NULL) {
8234 goto onError;
8235 }
8236 args_owned = 1;
8237 arglen = -1;
8238 argidx = -2;
8239 }
8240 while (--fmtcnt >= 0) {
8241 switch (c = *fmt++) {
8242 case '-': flags |= F_LJUST; continue;
8243 case '+': flags |= F_SIGN; continue;
8244 case ' ': flags |= F_BLANK; continue;
8245 case '#': flags |= F_ALT; continue;
8246 case '0': flags |= F_ZERO; continue;
8247 }
8248 break;
8249 }
8250 if (c == '*') {
8251 v = getnextarg(args, arglen, &argidx);
8252 if (v == NULL)
8253 goto onError;
8254 if (!PyInt_Check(v)) {
8255 PyErr_SetString(PyExc_TypeError,
8256 "* wants int");
8257 goto onError;
8258 }
8259 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008260 if (width == -1 && PyErr_Occurred())
8261 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 if (width < 0) {
8263 flags |= F_LJUST;
8264 width = -width;
8265 }
8266 if (--fmtcnt >= 0)
8267 c = *fmt++;
8268 }
8269 else if (c >= '0' && c <= '9') {
8270 width = c - '0';
8271 while (--fmtcnt >= 0) {
8272 c = *fmt++;
8273 if (c < '0' || c > '9')
8274 break;
8275 if ((width*10) / 10 != width) {
8276 PyErr_SetString(PyExc_ValueError,
8277 "width too big");
8278 goto onError;
8279 }
8280 width = width*10 + (c - '0');
8281 }
8282 }
8283 if (c == '.') {
8284 prec = 0;
8285 if (--fmtcnt >= 0)
8286 c = *fmt++;
8287 if (c == '*') {
8288 v = getnextarg(args, arglen, &argidx);
8289 if (v == NULL)
8290 goto onError;
8291 if (!PyInt_Check(v)) {
8292 PyErr_SetString(PyExc_TypeError,
8293 "* wants int");
8294 goto onError;
8295 }
8296 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008297 if (prec == -1 && PyErr_Occurred())
8298 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 if (prec < 0)
8300 prec = 0;
8301 if (--fmtcnt >= 0)
8302 c = *fmt++;
8303 }
8304 else if (c >= '0' && c <= '9') {
8305 prec = c - '0';
8306 while (--fmtcnt >= 0) {
8307 c = Py_CHARMASK(*fmt++);
8308 if (c < '0' || c > '9')
8309 break;
8310 if ((prec*10) / 10 != prec) {
8311 PyErr_SetString(PyExc_ValueError,
8312 "prec too big");
8313 goto onError;
8314 }
8315 prec = prec*10 + (c - '0');
8316 }
8317 }
8318 } /* prec */
8319 if (fmtcnt >= 0) {
8320 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 if (--fmtcnt >= 0)
8322 c = *fmt++;
8323 }
8324 }
8325 if (fmtcnt < 0) {
8326 PyErr_SetString(PyExc_ValueError,
8327 "incomplete format");
8328 goto onError;
8329 }
8330 if (c != '%') {
8331 v = getnextarg(args, arglen, &argidx);
8332 if (v == NULL)
8333 goto onError;
8334 }
8335 sign = 0;
8336 fill = ' ';
8337 switch (c) {
8338
8339 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008340 pbuf = formatbuf;
8341 /* presume that buffer length is at least 1 */
8342 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 len = 1;
8344 break;
8345
8346 case 's':
8347 case 'r':
8348 if (PyUnicode_Check(v) && c == 's') {
8349 temp = v;
8350 Py_INCREF(temp);
8351 }
8352 else {
8353 PyObject *unicode;
8354 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008355 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 else
8357 temp = PyObject_Repr(v);
8358 if (temp == NULL)
8359 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008360 if (PyUnicode_Check(temp))
8361 /* nothing to do */;
8362 else if (PyString_Check(temp)) {
8363 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008364 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008366 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008368 Py_DECREF(temp);
8369 temp = unicode;
8370 if (temp == NULL)
8371 goto onError;
8372 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008373 else {
8374 Py_DECREF(temp);
8375 PyErr_SetString(PyExc_TypeError,
8376 "%s argument has non-string str()");
8377 goto onError;
8378 }
8379 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008380 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 len = PyUnicode_GET_SIZE(temp);
8382 if (prec >= 0 && len > prec)
8383 len = prec;
8384 break;
8385
8386 case 'i':
8387 case 'd':
8388 case 'u':
8389 case 'o':
8390 case 'x':
8391 case 'X':
8392 if (c == 'i')
8393 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008394 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008395 temp = formatlong(v, flags, prec, c);
8396 if (!temp)
8397 goto onError;
8398 pbuf = PyUnicode_AS_UNICODE(temp);
8399 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008400 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008402 else {
8403 pbuf = formatbuf;
8404 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8405 flags, prec, c, v);
8406 if (len < 0)
8407 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008408 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008409 }
8410 if (flags & F_ZERO)
8411 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 break;
8413
8414 case 'e':
8415 case 'E':
8416 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008417 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418 case 'g':
8419 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008420 if (c == 'F')
8421 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008422 pbuf = formatbuf;
8423 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8424 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 if (len < 0)
8426 goto onError;
8427 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008428 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 fill = '0';
8430 break;
8431
8432 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008433 pbuf = formatbuf;
8434 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 if (len < 0)
8436 goto onError;
8437 break;
8438
8439 default:
8440 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008441 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008442 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008443 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008444 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008445 (Py_ssize_t)(fmt - 1 -
8446 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 goto onError;
8448 }
8449 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008450 if (*pbuf == '-' || *pbuf == '+') {
8451 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 len--;
8453 }
8454 else if (flags & F_SIGN)
8455 sign = '+';
8456 else if (flags & F_BLANK)
8457 sign = ' ';
8458 else
8459 sign = 0;
8460 }
8461 if (width < len)
8462 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008463 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 reslen -= rescnt;
8465 rescnt = width + fmtcnt + 100;
8466 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008467 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008468 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008469 PyErr_NoMemory();
8470 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008471 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008472 if (_PyUnicode_Resize(&result, reslen) < 0) {
8473 Py_XDECREF(temp);
8474 goto onError;
8475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 res = PyUnicode_AS_UNICODE(result)
8477 + reslen - rescnt;
8478 }
8479 if (sign) {
8480 if (fill != ' ')
8481 *res++ = sign;
8482 rescnt--;
8483 if (width > len)
8484 width--;
8485 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008486 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008487 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008488 assert(pbuf[1] == c);
8489 if (fill != ' ') {
8490 *res++ = *pbuf++;
8491 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008492 }
Tim Petersfff53252001-04-12 18:38:48 +00008493 rescnt -= 2;
8494 width -= 2;
8495 if (width < 0)
8496 width = 0;
8497 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 if (width > len && !(flags & F_LJUST)) {
8500 do {
8501 --rescnt;
8502 *res++ = fill;
8503 } while (--width > len);
8504 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008505 if (fill == ' ') {
8506 if (sign)
8507 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008508 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008509 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008510 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008511 *res++ = *pbuf++;
8512 *res++ = *pbuf++;
8513 }
8514 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008515 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 res += len;
8517 rescnt -= len;
8518 while (--width >= len) {
8519 --rescnt;
8520 *res++ = ' ';
8521 }
8522 if (dict && (argidx < arglen) && c != '%') {
8523 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008524 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008525 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 goto onError;
8527 }
8528 Py_XDECREF(temp);
8529 } /* '%' */
8530 } /* until end */
8531 if (argidx < arglen && !dict) {
8532 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008533 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 goto onError;
8535 }
8536
Thomas Woutersa96affe2006-03-12 00:29:36 +00008537 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8538 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 if (args_owned) {
8540 Py_DECREF(args);
8541 }
8542 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 return (PyObject *)result;
8544
8545 onError:
8546 Py_XDECREF(result);
8547 Py_DECREF(uformat);
8548 if (args_owned) {
8549 Py_DECREF(args);
8550 }
8551 return NULL;
8552}
8553
8554static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008555 (readbufferproc) unicode_buffer_getreadbuf,
8556 (writebufferproc) unicode_buffer_getwritebuf,
8557 (segcountproc) unicode_buffer_getsegcount,
8558 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559};
8560
Jeremy Hylton938ace62002-07-17 16:30:39 +00008561static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008562unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8563
Tim Peters6d6c1a32001-08-02 04:15:00 +00008564static PyObject *
8565unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8566{
8567 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008568 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008569 char *encoding = NULL;
8570 char *errors = NULL;
8571
Guido van Rossume023fe02001-08-30 03:12:59 +00008572 if (type != &PyUnicode_Type)
8573 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008574 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8575 kwlist, &x, &encoding, &errors))
8576 return NULL;
8577 if (x == NULL)
8578 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008579 if (encoding == NULL && errors == NULL)
8580 return PyObject_Unicode(x);
8581 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008582 return PyUnicode_FromEncodedObject(x, encoding, errors);
8583}
8584
Guido van Rossume023fe02001-08-30 03:12:59 +00008585static PyObject *
8586unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8587{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008588 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008589 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008590
8591 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8592 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8593 if (tmp == NULL)
8594 return NULL;
8595 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008596 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008597 if (pnew == NULL) {
8598 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008599 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008600 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008601 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8602 if (pnew->str == NULL) {
8603 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008604 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008605 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008606 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008607 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008608 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8609 pnew->length = n;
8610 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008611 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008612 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008613}
8614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008615PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008616"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008617\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008618Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008619encoding defaults to the current default string encoding.\n\
8620errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008621
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008622static PyObject *unicode_iter(PyObject *seq);
8623
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008625 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008626 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 sizeof(PyUnicodeObject), /* tp_size */
8628 0, /* tp_itemsize */
8629 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008630 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008632 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008634 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008635 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008636 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008638 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 (hashfunc) unicode_hash, /* tp_hash*/
8640 0, /* tp_call*/
8641 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008642 PyObject_GenericGetAttr, /* tp_getattro */
8643 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008645 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8646 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008647 unicode_doc, /* tp_doc */
8648 0, /* tp_traverse */
8649 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008650 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008651 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008652 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008653 0, /* tp_iternext */
8654 unicode_methods, /* tp_methods */
8655 0, /* tp_members */
8656 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008657 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008658 0, /* tp_dict */
8659 0, /* tp_descr_get */
8660 0, /* tp_descr_set */
8661 0, /* tp_dictoffset */
8662 0, /* tp_init */
8663 0, /* tp_alloc */
8664 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008665 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666};
8667
8668/* Initialize the Unicode implementation */
8669
Thomas Wouters78890102000-07-22 19:25:51 +00008670void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008672 int i;
8673
Thomas Wouters477c8d52006-05-27 19:21:47 +00008674 /* XXX - move this array to unicodectype.c ? */
8675 Py_UNICODE linebreak[] = {
8676 0x000A, /* LINE FEED */
8677 0x000D, /* CARRIAGE RETURN */
8678 0x001C, /* FILE SEPARATOR */
8679 0x001D, /* GROUP SEPARATOR */
8680 0x001E, /* RECORD SEPARATOR */
8681 0x0085, /* NEXT LINE */
8682 0x2028, /* LINE SEPARATOR */
8683 0x2029, /* PARAGRAPH SEPARATOR */
8684 };
8685
Fred Drakee4315f52000-05-09 19:53:39 +00008686 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008687 unicode_freelist = NULL;
8688 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008690 if (!unicode_empty)
8691 return;
8692
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008693 for (i = 0; i < 256; i++)
8694 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008695 if (PyType_Ready(&PyUnicode_Type) < 0)
8696 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008697
8698 /* initialize the linebreak bloom filter */
8699 bloom_linebreak = make_bloom_mask(
8700 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8701 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008702
8703 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704}
8705
8706/* Finalize the Unicode implementation */
8707
8708void
Thomas Wouters78890102000-07-22 19:25:51 +00008709_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008711 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008712 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008714 Py_XDECREF(unicode_empty);
8715 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008716
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008717 for (i = 0; i < 256; i++) {
8718 if (unicode_latin1[i]) {
8719 Py_DECREF(unicode_latin1[i]);
8720 unicode_latin1[i] = NULL;
8721 }
8722 }
8723
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008724 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 PyUnicodeObject *v = u;
8726 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008727 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008728 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008729 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008730 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008732 unicode_freelist = NULL;
8733 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008735
Walter Dörwald16807132007-05-25 13:52:07 +00008736void
8737PyUnicode_InternInPlace(PyObject **p)
8738{
8739 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8740 PyObject *t;
8741 if (s == NULL || !PyUnicode_Check(s))
8742 Py_FatalError(
8743 "PyUnicode_InternInPlace: unicode strings only please!");
8744 /* If it's a subclass, we don't really know what putting
8745 it in the interned dict might do. */
8746 if (!PyUnicode_CheckExact(s))
8747 return;
8748 if (PyUnicode_CHECK_INTERNED(s))
8749 return;
8750 if (interned == NULL) {
8751 interned = PyDict_New();
8752 if (interned == NULL) {
8753 PyErr_Clear(); /* Don't leave an exception */
8754 return;
8755 }
8756 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008757 /* It might be that the GetItem call fails even
8758 though the key is present in the dictionary,
8759 namely when this happens during a stack overflow. */
8760 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008761 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008762 Py_END_ALLOW_RECURSION
8763
Walter Dörwald16807132007-05-25 13:52:07 +00008764 if (t) {
8765 Py_INCREF(t);
8766 Py_DECREF(*p);
8767 *p = t;
8768 return;
8769 }
8770
Martin v. Löwis5b222132007-06-10 09:51:05 +00008771 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008772 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8773 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008774 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008775 return;
8776 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008777 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008778 /* The two references in interned are not counted by refcnt.
8779 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008780 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008781 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8782}
8783
8784void
8785PyUnicode_InternImmortal(PyObject **p)
8786{
8787 PyUnicode_InternInPlace(p);
8788 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8789 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8790 Py_INCREF(*p);
8791 }
8792}
8793
8794PyObject *
8795PyUnicode_InternFromString(const char *cp)
8796{
8797 PyObject *s = PyUnicode_FromString(cp);
8798 if (s == NULL)
8799 return NULL;
8800 PyUnicode_InternInPlace(&s);
8801 return s;
8802}
8803
8804void _Py_ReleaseInternedUnicodeStrings(void)
8805{
8806 PyObject *keys;
8807 PyUnicodeObject *s;
8808 Py_ssize_t i, n;
8809 Py_ssize_t immortal_size = 0, mortal_size = 0;
8810
8811 if (interned == NULL || !PyDict_Check(interned))
8812 return;
8813 keys = PyDict_Keys(interned);
8814 if (keys == NULL || !PyList_Check(keys)) {
8815 PyErr_Clear();
8816 return;
8817 }
8818
8819 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8820 detector, interned unicode strings are not forcibly deallocated;
8821 rather, we give them their stolen references back, and then clear
8822 and DECREF the interned dict. */
8823
8824 n = PyList_GET_SIZE(keys);
8825 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8826 n);
8827 for (i = 0; i < n; i++) {
8828 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8829 switch (s->state) {
8830 case SSTATE_NOT_INTERNED:
8831 /* XXX Shouldn't happen */
8832 break;
8833 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008834 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008835 immortal_size += s->length;
8836 break;
8837 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008838 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008839 mortal_size += s->length;
8840 break;
8841 default:
8842 Py_FatalError("Inconsistent interned string state.");
8843 }
8844 s->state = SSTATE_NOT_INTERNED;
8845 }
8846 fprintf(stderr, "total size of all interned strings: "
8847 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8848 "mortal/immortal\n", mortal_size, immortal_size);
8849 Py_DECREF(keys);
8850 PyDict_Clear(interned);
8851 Py_DECREF(interned);
8852 interned = NULL;
8853}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008854
8855
8856/********************* Unicode Iterator **************************/
8857
8858typedef struct {
8859 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008860 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008861 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8862} unicodeiterobject;
8863
8864static void
8865unicodeiter_dealloc(unicodeiterobject *it)
8866{
8867 _PyObject_GC_UNTRACK(it);
8868 Py_XDECREF(it->it_seq);
8869 PyObject_GC_Del(it);
8870}
8871
8872static int
8873unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8874{
8875 Py_VISIT(it->it_seq);
8876 return 0;
8877}
8878
8879static PyObject *
8880unicodeiter_next(unicodeiterobject *it)
8881{
8882 PyUnicodeObject *seq;
8883 PyObject *item;
8884
8885 assert(it != NULL);
8886 seq = it->it_seq;
8887 if (seq == NULL)
8888 return NULL;
8889 assert(PyUnicode_Check(seq));
8890
8891 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008892 item = PyUnicode_FromUnicode(
8893 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008894 if (item != NULL)
8895 ++it->it_index;
8896 return item;
8897 }
8898
8899 Py_DECREF(seq);
8900 it->it_seq = NULL;
8901 return NULL;
8902}
8903
8904static PyObject *
8905unicodeiter_len(unicodeiterobject *it)
8906{
8907 Py_ssize_t len = 0;
8908 if (it->it_seq)
8909 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8910 return PyInt_FromSsize_t(len);
8911}
8912
8913PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8914
8915static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008916 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8917 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008918 {NULL, NULL} /* sentinel */
8919};
8920
8921PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008922 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008923 "unicodeiterator", /* tp_name */
8924 sizeof(unicodeiterobject), /* tp_basicsize */
8925 0, /* tp_itemsize */
8926 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008927 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008928 0, /* tp_print */
8929 0, /* tp_getattr */
8930 0, /* tp_setattr */
8931 0, /* tp_compare */
8932 0, /* tp_repr */
8933 0, /* tp_as_number */
8934 0, /* tp_as_sequence */
8935 0, /* tp_as_mapping */
8936 0, /* tp_hash */
8937 0, /* tp_call */
8938 0, /* tp_str */
8939 PyObject_GenericGetAttr, /* tp_getattro */
8940 0, /* tp_setattro */
8941 0, /* tp_as_buffer */
8942 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8943 0, /* tp_doc */
8944 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8945 0, /* tp_clear */
8946 0, /* tp_richcompare */
8947 0, /* tp_weaklistoffset */
8948 PyObject_SelfIter, /* tp_iter */
8949 (iternextfunc)unicodeiter_next, /* tp_iternext */
8950 unicodeiter_methods, /* tp_methods */
8951 0,
8952};
8953
8954static PyObject *
8955unicode_iter(PyObject *seq)
8956{
8957 unicodeiterobject *it;
8958
8959 if (!PyUnicode_Check(seq)) {
8960 PyErr_BadInternalCall();
8961 return NULL;
8962 }
8963 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8964 if (it == NULL)
8965 return NULL;
8966 it->it_index = 0;
8967 Py_INCREF(seq);
8968 it->it_seq = (PyUnicodeObject *)seq;
8969 _PyObject_GC_TRACK(it);
8970 return (PyObject *)it;
8971}
8972
Martin v. Löwis5b222132007-06-10 09:51:05 +00008973size_t
8974Py_UNICODE_strlen(const Py_UNICODE *u)
8975{
8976 int res = 0;
8977 while(*u++)
8978 res++;
8979 return res;
8980}
8981
8982Py_UNICODE*
8983Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8984{
8985 Py_UNICODE *u = s1;
8986 while ((*u++ = *s2++));
8987 return s1;
8988}
8989
8990Py_UNICODE*
8991Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8992{
8993 Py_UNICODE *u = s1;
8994 while ((*u++ = *s2++))
8995 if (n-- == 0)
8996 break;
8997 return s1;
8998}
8999
9000int
9001Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9002{
9003 while (*s1 && *s2 && *s1 == *s2)
9004 s1++, s2++;
9005 if (*s1 && *s2)
9006 return (*s1 < *s2) ? -1 : +1;
9007 if (*s1)
9008 return 1;
9009 if (*s2)
9010 return -1;
9011 return 0;
9012}
9013
9014Py_UNICODE*
9015Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9016{
9017 const Py_UNICODE *p;
9018 for (p = s; *p; p++)
9019 if (*p == c)
9020 return (Py_UNICODE*)p;
9021 return NULL;
9022}
9023
9024
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009025#ifdef __cplusplus
9026}
9027#endif
9028
9029
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009030/*
9031Local variables:
9032c-basic-offset: 4
9033indent-tabs-mode: nil
9034End:
9035*/