blob: 172c61caba05776eee2ad57f0ad1c0d5fae983de [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
/* Free list for Unicode objects: head of the list of recycled objects
   and the number of objects currently on it.  The list is linked
   through the first pointer-sized word of each recycled object (see
   _PyUnicode_New() and _PyUnicode_Free()). */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by the character's ordinal. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
287/* Internal API for use in unicodeobject.c only ! */
288#define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 const char *s;
402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
419 }
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
425 }
426 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000428 v = obj;
429 if (encoding) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoding Unicode is not supported");
432 return NULL;
433 }
434 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else if (PyString_Check(obj)) {
437 s = PyString_AS_STRING(obj);
438 len = PyString_GET_SIZE(obj);
439 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000440 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000444 PyErr_Format(PyExc_TypeError,
445 "coercing to Unicode: need string or buffer, "
446 "%.80s found",
447 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000448 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450
451 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 if (len == 0) {
453 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000454 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 else
457 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000458
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000460 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463 return v;
464
465 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000466 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000468 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470}
471
472PyObject *PyUnicode_Decode(const char *s,
473 int size,
474 const char *encoding,
475 const char *errors)
476{
477 PyObject *buffer = NULL, *unicode;
478
Fred Drakee4315f52000-05-09 19:53:39 +0000479 if (encoding == NULL)
480 encoding = PyUnicode_GetDefaultEncoding();
481
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000485 else if (strcmp(encoding, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s, size, errors);
487 else if (strcmp(encoding, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489
490 /* Decode via the codec registry */
491 buffer = PyBuffer_FromMemory((void *)s, size);
492 if (buffer == NULL)
493 goto onError;
494 unicode = PyCodec_Decode(buffer, encoding, errors);
495 if (unicode == NULL)
496 goto onError;
497 if (!PyUnicode_Check(unicode)) {
498 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000499 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 unicode->ob_type->tp_name);
501 Py_DECREF(unicode);
502 goto onError;
503 }
504 Py_DECREF(buffer);
505 return unicode;
506
507 onError:
508 Py_XDECREF(buffer);
509 return NULL;
510}
511
512PyObject *PyUnicode_Encode(const Py_UNICODE *s,
513 int size,
514 const char *encoding,
515 const char *errors)
516{
517 PyObject *v, *unicode;
518
519 unicode = PyUnicode_FromUnicode(s, size);
520 if (unicode == NULL)
521 return NULL;
522 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
523 Py_DECREF(unicode);
524 return v;
525}
526
527PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v;
532
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
536 }
Fred Drakee4315f52000-05-09 19:53:39 +0000537
538 if (encoding == NULL)
539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (errors == NULL) {
543 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000544 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000545 else if (strcmp(encoding, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode);
547 else if (strcmp(encoding, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode);
549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550
551 /* Encode via the codec registry */
552 v = PyCodec_Encode(unicode, encoding, errors);
553 if (v == NULL)
554 goto onError;
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v)) {
557 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000558 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 v->ob_type->tp_name);
560 Py_DECREF(v);
561 goto onError;
562 }
563 return v;
564
565 onError:
566 return NULL;
567}
568
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000569/* Return a Python string holding the default encoded value of the
570 Unicode object.
571
572 The resulting string is cached in the Unicode object for subsequent
573 usage by this function. The cached version is needed to implement
574 the character buffer interface and will live (at least) as long as
575 the Unicode object itself.
576
577 The refcount of the string is *not* incremented.
578
579 *** Exported for internal use by the interpreter only !!! ***
580
581*/
582
583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644/* --- UTF-8 Codec -------------------------------------------------------- */
645
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   introduces.  Zero means illegal prefix (continuation bytes
   0x80..0xBF and the bytes 0xFE/0xFF).  See RFC 2279 for details.

   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFF: four, five and six byte sequences, then illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
667
668static
669int utf8_decoding_error(const char **source,
670 Py_UNICODE **dest,
671 const char *errors,
672 const char *details)
673{
674 if ((errors == NULL) ||
675 (strcmp(errors,"strict") == 0)) {
676 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000677 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 details);
679 return -1;
680 }
681 else if (strcmp(errors,"ignore") == 0) {
682 (*source)++;
683 return 0;
684 }
685 else if (strcmp(errors,"replace") == 0) {
686 (*source)++;
687 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
688 (*dest)++;
689 return 0;
690 }
691 else {
692 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000693 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 errors);
695 return -1;
696 }
697}
698
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699PyObject *PyUnicode_DecodeUTF8(const char *s,
700 int size,
701 const char *errors)
702{
703 int n;
704 const char *e;
705 PyUnicodeObject *unicode;
706 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000707 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708
709 /* Note: size will always be longer than the resulting Unicode
710 character count */
711 unicode = _PyUnicode_New(size);
712 if (!unicode)
713 return NULL;
714 if (size == 0)
715 return (PyObject *)unicode;
716
717 /* Unpack UTF-8 encoded data */
718 p = unicode->str;
719 e = s + size;
720
721 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000722 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723
724 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000725 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 s++;
727 continue;
728 }
729
730 n = utf8_code_length[ch];
731
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000732 if (s + n > e) {
733 errmsg = "unexpected end of data";
734 goto utf8Error;
735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000736
737 switch (n) {
738
739 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000740 errmsg = "unexpected code byte";
741 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742
743 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 errmsg = "internal error";
745 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000746
747 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000748 if ((s[1] & 0xc0) != 0x80) {
749 errmsg = "invalid data";
750 goto utf8Error;
751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000752 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000753 if (ch < 0x80) {
754 errmsg = "illegal encoding";
755 goto utf8Error;
756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000758 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000759 break;
760
761 case 3:
762 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000763 (s[2] & 0xc0) != 0x80) {
764 errmsg = "invalid data";
765 goto utf8Error;
766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000768 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
769 errmsg = "illegal encoding";
770 goto utf8Error;
771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000773 *p++ = (Py_UNICODE)ch;
774 break;
775
776 case 4:
777 if ((s[1] & 0xc0) != 0x80 ||
778 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000779 (s[3] & 0xc0) != 0x80) {
780 errmsg = "invalid data";
781 goto utf8Error;
782 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000783 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
784 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
785 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000786 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000787 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000788 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000789 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000790 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000791 errmsg = "illegal encoding";
792 goto utf8Error;
793 }
Fredrik Lundh8f455852001-06-27 18:59:43 +0000794#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000795 *p++ = (Py_UNICODE)ch;
796#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000797 /* compute and append the two surrogates: */
798
799 /* translate from 10000..10FFFF to 0..FFFF */
800 ch -= 0x10000;
801
802 /* high surrogate = top 10 bits added to D800 */
803 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
804
805 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000806 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000807#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 break;
809
810 default:
811 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000812 errmsg = "unsupported Unicode code range";
813 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 }
815 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000816 continue;
817
818 utf8Error:
819 if (utf8_decoding_error(&s, &p, errors, errmsg))
820 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821 }
822
823 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000824 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 goto onError;
826
827 return (PyObject *)unicode;
828
829onError:
830 Py_DECREF(unicode);
831 return NULL;
832}
833
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000834/* Not used anymore, now that the encoder supports UTF-16
835 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000836#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837static
838int utf8_encoding_error(const Py_UNICODE **source,
839 char **dest,
840 const char *errors,
841 const char *details)
842{
843 if ((errors == NULL) ||
844 (strcmp(errors,"strict") == 0)) {
845 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000846 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000847 details);
848 return -1;
849 }
850 else if (strcmp(errors,"ignore") == 0) {
851 return 0;
852 }
853 else if (strcmp(errors,"replace") == 0) {
854 **dest = '?';
855 (*dest)++;
856 return 0;
857 }
858 else {
859 PyErr_Format(PyExc_ValueError,
860 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000861 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 errors);
863 return -1;
864 }
865}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000866#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867
868PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
869 int size,
870 const char *errors)
871{
872 PyObject *v;
873 char *p;
874 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000875 Py_UCS4 ch2;
876 unsigned int cbAllocated = 3 * size;
877 unsigned int cbWritten = 0;
878 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000880 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881 if (v == NULL)
882 return NULL;
883 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000884 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885
886 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000887 while (i < size) {
888 Py_UCS4 ch = s[i++];
889 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000891 cbWritten++;
892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 else if (ch < 0x0800) {
894 *p++ = 0xc0 | (ch >> 6);
895 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000896 cbWritten += 2;
897 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000898 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000899 /* Check for high surrogate */
900 if (0xD800 <= ch && ch <= 0xDBFF) {
901 if (i != size) {
902 ch2 = s[i];
903 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
904
905 if (cbWritten >= (cbAllocated - 4)) {
906 /* Provide enough room for some more
907 surrogates */
908 cbAllocated += 4*10;
909 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000910 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000911 }
912
913 /* combine the two values */
914 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
915
916 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000917 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000918 i++;
919 cbWritten += 4;
920 }
921 }
922 }
923 else {
924 *p++ = (char)(0xe0 | (ch >> 12));
925 cbWritten += 3;
926 }
927 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
928 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000929 } else {
930 *p++ = 0xf0 | (ch>>18);
931 *p++ = 0x80 | ((ch>>12) & 0x3f);
932 *p++ = 0x80 | ((ch>>6) & 0x3f);
933 *p++ = 0x80 | (ch & 0x3f);
934 cbWritten += 4;
935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 }
937 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000938 if (_PyString_Resize(&v, p - q))
939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 return v;
941
942 onError:
943 Py_DECREF(v);
944 return NULL;
945}
946
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
948{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 if (!PyUnicode_Check(unicode)) {
950 PyErr_BadArgument();
951 return NULL;
952 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000953 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
954 PyUnicode_GET_SIZE(unicode),
955 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000956}
957
958/* --- UTF-16 Codec ------------------------------------------------------- */
959
960static
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000961int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962 Py_UNICODE **dest,
963 const char *errors,
964 const char *details)
965{
966 if ((errors == NULL) ||
967 (strcmp(errors,"strict") == 0)) {
968 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000969 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000970 details);
971 return -1;
972 }
973 else if (strcmp(errors,"ignore") == 0) {
974 return 0;
975 }
976 else if (strcmp(errors,"replace") == 0) {
977 if (dest) {
978 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
979 (*dest)++;
980 }
981 return 0;
982 }
983 else {
984 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000985 "UTF-16 decoding error; "
986 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987 errors);
988 return -1;
989 }
990}
991
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992PyObject *PyUnicode_DecodeUTF16(const char *s,
993 int size,
994 const char *errors,
995 int *byteorder)
996{
997 PyUnicodeObject *unicode;
998 Py_UNICODE *p;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000999 const Py_UCS2 *q, *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001001 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002
1003 /* size should be an even number */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001004 if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
1006 return NULL;
1007 /* The remaining input chars are ignored if we fall through
1008 here... */
1009 }
1010
1011 /* Note: size will always be longer than the resulting Unicode
1012 character count */
1013 unicode = _PyUnicode_New(size);
1014 if (!unicode)
1015 return NULL;
1016 if (size == 0)
1017 return (PyObject *)unicode;
1018
1019 /* Unpack UTF-16 encoded data */
1020 p = unicode->str;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001021 q = (Py_UCS2 *)s;
1022 e = q + (size / sizeof(Py_UCS2));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (byteorder)
1025 bo = *byteorder;
1026
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001027 /* Check for BOM marks (U+FEFF) in the input and adjust current
1028 byte order setting accordingly. In native mode, the leading BOM
1029 mark is skipped, in all other modes, it is copied to the output
1030 stream as-is (giving a ZWNBSP character). */
1031 if (bo == 0) {
1032#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1033 if (*q == 0xFEFF) {
1034 q++;
1035 bo = -1;
1036 } else if (*q == 0xFFFE) {
1037 q++;
1038 bo = 1;
1039 }
1040#else
1041 if (*q == 0xFEFF) {
1042 q++;
1043 bo = 1;
1044 } else if (*q == 0xFFFE) {
1045 q++;
1046 bo = -1;
1047 }
1048#endif
1049 }
1050
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 while (q < e) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001052 register Py_UCS2 ch = *q++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001054 /* Swap input bytes if needed. (This assumes
1055 sizeof(Py_UNICODE) == 2 !) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 if (bo == 1)
1058 ch = (ch >> 8) | (ch << 8);
1059#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 if (bo == -1)
1061 ch = (ch >> 8) | (ch << 8);
1062#endif
1063 if (ch < 0xD800 || ch > 0xDFFF) {
1064 *p++ = ch;
1065 continue;
1066 }
1067
1068 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001069 if (q >= e) {
1070 errmsg = "unexpected end of data";
1071 goto utf16Error;
1072 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001073 if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001074 Py_UCS2 ch2 = *q++;
1075#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1076 if (bo == 1)
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001077 ch2 = (ch2 >> 8) | (ch2 << 8);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001078#else
1079 if (bo == -1)
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001080 ch2 = (ch2 >> 8) | (ch2 << 8);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001081#endif
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001082 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001083#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001084 *p++ = ch;
1085 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001086#else
1087 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001088#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001089 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090 }
1091 else {
1092 errmsg = "illegal UTF-16 surrogate";
1093 goto utf16Error;
1094 }
1095
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001097 errmsg = "illegal encoding";
1098 /* Fall through to report the error */
1099
1100 utf16Error:
1101 if (utf16_decoding_error(&q, &p, errors, errmsg))
1102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104
1105 if (byteorder)
1106 *byteorder = bo;
1107
1108 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001109 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 goto onError;
1111
1112 return (PyObject *)unicode;
1113
1114onError:
1115 Py_DECREF(unicode);
1116 return NULL;
1117}
1118
1119#undef UTF16_ERROR
1120
1121PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1122 int size,
1123 const char *errors,
1124 int byteorder)
1125{
1126 PyObject *v;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001127 Py_UCS2 *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 char *q;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001129 int i, pairs, doswap = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001131 for (i = pairs = 0; i < size; i++)
1132 if (s[i] >= 0x10000)
1133 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 v = PyString_FromStringAndSize(NULL,
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001135 sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 if (v == NULL)
1137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139 q = PyString_AS_STRING(v);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001140 p = (Py_UCS2 *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141 if (byteorder == 0)
1142 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001143 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001144 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145 if (byteorder == 0 ||
1146#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1147 byteorder == -1
1148#else
1149 byteorder == 1
1150#endif
1151 )
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001152 doswap = 0;
1153 while (size-- > 0) {
1154 Py_UNICODE ch = *s++;
1155 Py_UNICODE ch2 = 0;
1156 if (ch >= 0x10000) {
1157 ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
1158 ch = 0xD800|((ch-0x10000)>>10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001160 if (doswap){
1161 *p++ = (ch >> 8) | (ch << 8);
1162 if (ch2)
1163 *p++ = (ch2 >> 8) | (ch2 << 8);
1164 }else{
1165 *p++ = ch;
1166 if(ch2)
1167 *p++ = ch2;
1168 }
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 return v;
1171}
1172
1173PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1174{
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 return NULL;
1178 }
1179 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1180 PyUnicode_GET_SIZE(unicode),
1181 NULL,
1182 0);
1183}
1184
1185/* --- Unicode Escape Codec ----------------------------------------------- */
1186
1187static
1188int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001189 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 const char *errors,
1191 const char *details)
1192{
1193 if ((errors == NULL) ||
1194 (strcmp(errors,"strict") == 0)) {
1195 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001196 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 details);
1198 return -1;
1199 }
1200 else if (strcmp(errors,"ignore") == 0) {
1201 return 0;
1202 }
1203 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001204 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 return 0;
1206 }
1207 else {
1208 PyErr_Format(PyExc_ValueError,
1209 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001210 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 errors);
1212 return -1;
1213 }
1214}
1215
Fredrik Lundh06d12682001-01-24 07:59:11 +00001216static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1219 int size,
1220 const char *errors)
1221{
1222 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001223 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001225 char* message;
1226 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1227
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 /* Escaped strings will always be longer than the resulting
1229 Unicode string, so we start with size here and then reduce the
1230 length after conversion to the true value. */
1231 v = _PyUnicode_New(size);
1232 if (v == NULL)
1233 goto onError;
1234 if (size == 0)
1235 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001236
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237 p = buf = PyUnicode_AS_UNICODE(v);
1238 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001239
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 while (s < end) {
1241 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001242 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001243 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244
1245 /* Non-escape characters are interpreted as Unicode ordinals */
1246 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001247 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 continue;
1249 }
1250
1251 /* \ - Escapes */
1252 s++;
1253 switch (*s++) {
1254
1255 /* \x escapes */
1256 case '\n': break;
1257 case '\\': *p++ = '\\'; break;
1258 case '\'': *p++ = '\''; break;
1259 case '\"': *p++ = '\"'; break;
1260 case 'b': *p++ = '\b'; break;
1261 case 'f': *p++ = '\014'; break; /* FF */
1262 case 't': *p++ = '\t'; break;
1263 case 'n': *p++ = '\n'; break;
1264 case 'r': *p++ = '\r'; break;
1265 case 'v': *p++ = '\013'; break; /* VT */
1266 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1267
1268 /* \OOO (octal) escapes */
1269 case '0': case '1': case '2': case '3':
1270 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001271 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001273 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001275 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001277 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278 break;
1279
Fredrik Lundhccc74732001-02-18 22:13:49 +00001280 /* hex escapes */
1281 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001283 digits = 2;
1284 message = "truncated \\xXX escape";
1285 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286
Fredrik Lundhccc74732001-02-18 22:13:49 +00001287 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001289 digits = 4;
1290 message = "truncated \\uXXXX escape";
1291 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292
Fredrik Lundhccc74732001-02-18 22:13:49 +00001293 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001294 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001295 digits = 8;
1296 message = "truncated \\UXXXXXXXX escape";
1297 hexescape:
1298 chr = 0;
1299 for (i = 0; i < digits; i++) {
1300 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001301 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001302 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001303 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001304 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001305 i++;
1306 break;
1307 }
1308 chr = (chr<<4) & ~0xF;
1309 if (c >= '0' && c <= '9')
1310 chr += c - '0';
1311 else if (c >= 'a' && c <= 'f')
1312 chr += 10 + c - 'a';
1313 else
1314 chr += 10 + c - 'A';
1315 }
1316 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001317 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001318 /* when we get here, chr is a 32-bit unicode character */
1319 if (chr <= 0xffff)
1320 /* UCS-2 character */
1321 *p++ = (Py_UNICODE) chr;
1322 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001323 /* UCS-4 character. Either store directly, or as
1324 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001325#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001326 *p++ = chr;
1327#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001328 chr -= 0x10000L;
1329 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001330 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001331#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001332 } else {
1333 if (unicodeescape_decoding_error(
1334 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001335 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001336 )
1337 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001338 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001339 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001340 break;
1341
1342 /* \N{name} */
1343 case 'N':
1344 message = "malformed \\N character escape";
1345 if (ucnhash_CAPI == NULL) {
1346 /* load the unicode data module */
1347 PyObject *m, *v;
1348 m = PyImport_ImportModule("unicodedata");
1349 if (m == NULL)
1350 goto ucnhashError;
1351 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1352 Py_DECREF(m);
1353 if (v == NULL)
1354 goto ucnhashError;
1355 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1356 Py_DECREF(v);
1357 if (ucnhash_CAPI == NULL)
1358 goto ucnhashError;
1359 }
1360 if (*s == '{') {
1361 const char *start = s+1;
1362 /* look for the closing brace */
1363 while (*s != '}' && s < end)
1364 s++;
1365 if (s > start && s < end && *s == '}') {
1366 /* found a name. look it up in the unicode database */
1367 message = "unknown Unicode character name";
1368 s++;
1369 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1370 goto store;
1371 }
1372 }
1373 if (unicodeescape_decoding_error(&s, &x, errors, message))
1374 goto onError;
1375 *p++ = x;
1376 break;
1377
1378 default:
1379 *p++ = '\\';
1380 *p++ = (unsigned char)s[-1];
1381 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 }
1383 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001384 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 return (PyObject *)v;
1387
Fredrik Lundhccc74732001-02-18 22:13:49 +00001388ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001389 PyErr_SetString(
1390 PyExc_UnicodeError,
1391 "\\N escapes not supported (can't load unicodedata module)"
1392 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001393 return NULL;
1394
Fredrik Lundhccc74732001-02-18 22:13:49 +00001395onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 Py_XDECREF(v);
1397 return NULL;
1398}
1399
1400/* Return a Unicode-Escape string version of the Unicode object.
1401
1402 If quotes is true, the string is enclosed in u"" or u'' quotes as
1403 appropriate.
1404
1405*/
1406
Barry Warsaw51ac5802000-03-20 16:36:48 +00001407static const Py_UNICODE *findchar(const Py_UNICODE *s,
1408 int size,
1409 Py_UNICODE ch);
1410
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411static
1412PyObject *unicodeescape_string(const Py_UNICODE *s,
1413 int size,
1414 int quotes)
1415{
1416 PyObject *repr;
1417 char *p;
1418 char *q;
1419
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001420 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421
1422 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1423 if (repr == NULL)
1424 return NULL;
1425
1426 p = q = PyString_AS_STRING(repr);
1427
1428 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429 *p++ = 'u';
1430 *p++ = (findchar(s, size, '\'') &&
1431 !findchar(s, size, '"')) ? '"' : '\'';
1432 }
1433 while (size-- > 0) {
1434 Py_UNICODE ch = *s++;
1435 /* Escape quotes */
Fredrik Lundh30831632001-06-26 15:11:00 +00001436 if (quotes && (ch == (Py_UNICODE) q[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437 *p++ = '\\';
1438 *p++ = (char) ch;
1439 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001440#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001441 /* Map 21-bit characters to '\U00xxxxxx' */
1442 else if (ch >= 0x10000) {
1443 *p++ = '\\';
1444 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001445 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1446 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1447 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1448 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1449 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1450 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1451 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001452 *p++ = hexdigit[ch & 15];
1453 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001454#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001455 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1456 else if (ch >= 0xD800 && ch < 0xDC00) {
1457 Py_UNICODE ch2;
1458 Py_UCS4 ucs;
1459
1460 ch2 = *s++;
1461 size--;
1462 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1463 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1464 *p++ = '\\';
1465 *p++ = 'U';
1466 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1467 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1468 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1469 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1470 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1471 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1472 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1473 *p++ = hexdigit[ucs & 0x0000000F];
1474 continue;
1475 }
1476 /* Fall through: isolated surrogates are copied as-is */
1477 s--;
1478 size++;
1479 }
1480
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001482 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483 *p++ = '\\';
1484 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001485 *p++ = hexdigit[(ch >> 12) & 0x000F];
1486 *p++ = hexdigit[(ch >> 8) & 0x000F];
1487 *p++ = hexdigit[(ch >> 4) & 0x000F];
1488 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001490 /* Map special whitespace to '\t', \n', '\r' */
1491 else if (ch == '\t') {
1492 *p++ = '\\';
1493 *p++ = 't';
1494 }
1495 else if (ch == '\n') {
1496 *p++ = '\\';
1497 *p++ = 'n';
1498 }
1499 else if (ch == '\r') {
1500 *p++ = '\\';
1501 *p++ = 'r';
1502 }
1503 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 else if (ch < ' ' || ch >= 128) {
1505 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001506 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001507 *p++ = hexdigit[(ch >> 4) & 0x000F];
1508 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 }
1510 /* Copy everything else as-is */
1511 else
1512 *p++ = (char) ch;
1513 }
1514 if (quotes)
1515 *p++ = q[1];
1516
1517 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001518 if (_PyString_Resize(&repr, p - q))
1519 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
1521 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001522
1523 onError:
1524 Py_DECREF(repr);
1525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526}
1527
1528PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1529 int size)
1530{
1531 return unicodeescape_string(s, size, 0);
1532}
1533
1534PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1535{
1536 if (!PyUnicode_Check(unicode)) {
1537 PyErr_BadArgument();
1538 return NULL;
1539 }
1540 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1541 PyUnicode_GET_SIZE(unicode));
1542}
1543
1544/* --- Raw Unicode Escape Codec ------------------------------------------- */
1545
1546PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1547 int size,
1548 const char *errors)
1549{
1550 PyUnicodeObject *v;
1551 Py_UNICODE *p, *buf;
1552 const char *end;
1553 const char *bs;
1554
1555 /* Escaped strings will always be longer than the resulting
1556 Unicode string, so we start with size here and then reduce the
1557 length after conversion to the true value. */
1558 v = _PyUnicode_New(size);
1559 if (v == NULL)
1560 goto onError;
1561 if (size == 0)
1562 return (PyObject *)v;
1563 p = buf = PyUnicode_AS_UNICODE(v);
1564 end = s + size;
1565 while (s < end) {
1566 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001567 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568 int i;
1569
1570 /* Non-escape characters are interpreted as Unicode ordinals */
1571 if (*s != '\\') {
1572 *p++ = (unsigned char)*s++;
1573 continue;
1574 }
1575
1576 /* \u-escapes are only interpreted iff the number of leading
1577 backslashes if odd */
1578 bs = s;
1579 for (;s < end;) {
1580 if (*s != '\\')
1581 break;
1582 *p++ = (unsigned char)*s++;
1583 }
1584 if (((s - bs) & 1) == 0 ||
1585 s >= end ||
1586 *s != 'u') {
1587 continue;
1588 }
1589 p--;
1590 s++;
1591
1592 /* \uXXXX with 4 hex digits */
1593 for (x = 0, i = 0; i < 4; i++) {
1594 c = (unsigned char)s[i];
1595 if (!isxdigit(c)) {
1596 if (unicodeescape_decoding_error(&s, &x, errors,
1597 "truncated \\uXXXX"))
1598 goto onError;
1599 i++;
1600 break;
1601 }
1602 x = (x<<4) & ~0xF;
1603 if (c >= '0' && c <= '9')
1604 x += c - '0';
1605 else if (c >= 'a' && c <= 'f')
1606 x += 10 + c - 'a';
1607 else
1608 x += 10 + c - 'A';
1609 }
1610 s += i;
1611 *p++ = x;
1612 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001613 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615 return (PyObject *)v;
1616
1617 onError:
1618 Py_XDECREF(v);
1619 return NULL;
1620}
1621
1622PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1623 int size)
1624{
1625 PyObject *repr;
1626 char *p;
1627 char *q;
1628
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001629 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630
1631 repr = PyString_FromStringAndSize(NULL, 6 * size);
1632 if (repr == NULL)
1633 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001634 if (size == 0)
1635 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636
1637 p = q = PyString_AS_STRING(repr);
1638 while (size-- > 0) {
1639 Py_UNICODE ch = *s++;
1640 /* Map 16-bit characters to '\uxxxx' */
1641 if (ch >= 256) {
1642 *p++ = '\\';
1643 *p++ = 'u';
1644 *p++ = hexdigit[(ch >> 12) & 0xf];
1645 *p++ = hexdigit[(ch >> 8) & 0xf];
1646 *p++ = hexdigit[(ch >> 4) & 0xf];
1647 *p++ = hexdigit[ch & 15];
1648 }
1649 /* Copy everything else as-is */
1650 else
1651 *p++ = (char) ch;
1652 }
1653 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001654 if (_PyString_Resize(&repr, p - q))
1655 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656
1657 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001658
1659 onError:
1660 Py_DECREF(repr);
1661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001662}
1663
1664PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1665{
1666 if (!PyUnicode_Check(unicode)) {
1667 PyErr_BadArgument();
1668 return NULL;
1669 }
1670 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1671 PyUnicode_GET_SIZE(unicode));
1672}
1673
1674/* --- Latin-1 Codec ------------------------------------------------------ */
1675
1676PyObject *PyUnicode_DecodeLatin1(const char *s,
1677 int size,
1678 const char *errors)
1679{
1680 PyUnicodeObject *v;
1681 Py_UNICODE *p;
1682
1683 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001684 if (size == 1 && *(unsigned char*)s < 256) {
1685 Py_UNICODE r = *(unsigned char*)s;
1686 return PyUnicode_FromUnicode(&r, 1);
1687 }
1688
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689 v = _PyUnicode_New(size);
1690 if (v == NULL)
1691 goto onError;
1692 if (size == 0)
1693 return (PyObject *)v;
1694 p = PyUnicode_AS_UNICODE(v);
1695 while (size-- > 0)
1696 *p++ = (unsigned char)*s++;
1697 return (PyObject *)v;
1698
1699 onError:
1700 Py_XDECREF(v);
1701 return NULL;
1702}
1703
1704static
1705int latin1_encoding_error(const Py_UNICODE **source,
1706 char **dest,
1707 const char *errors,
1708 const char *details)
1709{
1710 if ((errors == NULL) ||
1711 (strcmp(errors,"strict") == 0)) {
1712 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001713 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 details);
1715 return -1;
1716 }
1717 else if (strcmp(errors,"ignore") == 0) {
1718 return 0;
1719 }
1720 else if (strcmp(errors,"replace") == 0) {
1721 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001722 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 return 0;
1724 }
1725 else {
1726 PyErr_Format(PyExc_ValueError,
1727 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001728 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 errors);
1730 return -1;
1731 }
1732}
1733
1734PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1735 int size,
1736 const char *errors)
1737{
1738 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001739 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001740
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 repr = PyString_FromStringAndSize(NULL, size);
1742 if (repr == NULL)
1743 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001744 if (size == 0)
1745 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746
1747 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001748 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 while (size-- > 0) {
1750 Py_UNICODE ch = *p++;
1751 if (ch >= 256) {
1752 if (latin1_encoding_error(&p, &s, errors,
1753 "ordinal not in range(256)"))
1754 goto onError;
1755 }
1756 else
1757 *s++ = (char)ch;
1758 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001759 /* Resize if error handling skipped some characters */
1760 if (s - start < PyString_GET_SIZE(repr))
1761 if (_PyString_Resize(&repr, s - start))
1762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 return repr;
1764
1765 onError:
1766 Py_DECREF(repr);
1767 return NULL;
1768}
1769
1770PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1771{
1772 if (!PyUnicode_Check(unicode)) {
1773 PyErr_BadArgument();
1774 return NULL;
1775 }
1776 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1777 PyUnicode_GET_SIZE(unicode),
1778 NULL);
1779}
1780
1781/* --- 7-bit ASCII Codec -------------------------------------------------- */
1782
1783static
1784int ascii_decoding_error(const char **source,
1785 Py_UNICODE **dest,
1786 const char *errors,
1787 const char *details)
1788{
1789 if ((errors == NULL) ||
1790 (strcmp(errors,"strict") == 0)) {
1791 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001792 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 details);
1794 return -1;
1795 }
1796 else if (strcmp(errors,"ignore") == 0) {
1797 return 0;
1798 }
1799 else if (strcmp(errors,"replace") == 0) {
1800 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1801 (*dest)++;
1802 return 0;
1803 }
1804 else {
1805 PyErr_Format(PyExc_ValueError,
1806 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001807 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 errors);
1809 return -1;
1810 }
1811}
1812
1813PyObject *PyUnicode_DecodeASCII(const char *s,
1814 int size,
1815 const char *errors)
1816{
1817 PyUnicodeObject *v;
1818 Py_UNICODE *p;
1819
1820 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001821 if (size == 1 && *(unsigned char*)s < 128) {
1822 Py_UNICODE r = *(unsigned char*)s;
1823 return PyUnicode_FromUnicode(&r, 1);
1824 }
1825
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 v = _PyUnicode_New(size);
1827 if (v == NULL)
1828 goto onError;
1829 if (size == 0)
1830 return (PyObject *)v;
1831 p = PyUnicode_AS_UNICODE(v);
1832 while (size-- > 0) {
1833 register unsigned char c;
1834
1835 c = (unsigned char)*s++;
1836 if (c < 128)
1837 *p++ = c;
1838 else if (ascii_decoding_error(&s, &p, errors,
1839 "ordinal not in range(128)"))
1840 goto onError;
1841 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001842 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001843 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001844 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 return (PyObject *)v;
1846
1847 onError:
1848 Py_XDECREF(v);
1849 return NULL;
1850}
1851
1852static
1853int ascii_encoding_error(const Py_UNICODE **source,
1854 char **dest,
1855 const char *errors,
1856 const char *details)
1857{
1858 if ((errors == NULL) ||
1859 (strcmp(errors,"strict") == 0)) {
1860 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001861 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 details);
1863 return -1;
1864 }
1865 else if (strcmp(errors,"ignore") == 0) {
1866 return 0;
1867 }
1868 else if (strcmp(errors,"replace") == 0) {
1869 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001870 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 return 0;
1872 }
1873 else {
1874 PyErr_Format(PyExc_ValueError,
1875 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001876 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877 errors);
1878 return -1;
1879 }
1880}
1881
1882PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1883 int size,
1884 const char *errors)
1885{
1886 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001887 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001888
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 repr = PyString_FromStringAndSize(NULL, size);
1890 if (repr == NULL)
1891 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001892 if (size == 0)
1893 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894
1895 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001896 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 while (size-- > 0) {
1898 Py_UNICODE ch = *p++;
1899 if (ch >= 128) {
1900 if (ascii_encoding_error(&p, &s, errors,
1901 "ordinal not in range(128)"))
1902 goto onError;
1903 }
1904 else
1905 *s++ = (char)ch;
1906 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001907 /* Resize if error handling skipped some characters */
1908 if (s - start < PyString_GET_SIZE(repr))
1909 if (_PyString_Resize(&repr, s - start))
1910 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 return repr;
1912
1913 onError:
1914 Py_DECREF(repr);
1915 return NULL;
1916}
1917
1918PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1919{
1920 if (!PyUnicode_Check(unicode)) {
1921 PyErr_BadArgument();
1922 return NULL;
1923 }
1924 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1925 PyUnicode_GET_SIZE(unicode),
1926 NULL);
1927}
1928
Fredrik Lundh30831632001-06-26 15:11:00 +00001929#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001930
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001931/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001932
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001933PyObject *PyUnicode_DecodeMBCS(const char *s,
1934 int size,
1935 const char *errors)
1936{
1937 PyUnicodeObject *v;
1938 Py_UNICODE *p;
1939
1940 /* First get the size of the result */
1941 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001942 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001943 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1944
1945 v = _PyUnicode_New(usize);
1946 if (v == NULL)
1947 return NULL;
1948 if (usize == 0)
1949 return (PyObject *)v;
1950 p = PyUnicode_AS_UNICODE(v);
1951 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1952 Py_DECREF(v);
1953 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1954 }
1955
1956 return (PyObject *)v;
1957}
1958
1959PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1960 int size,
1961 const char *errors)
1962{
1963 PyObject *repr;
1964 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001965 DWORD mbcssize;
1966
1967 /* If there are no characters, bail now! */
1968 if (size==0)
1969 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001970
1971 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001972 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001973 if (mbcssize==0)
1974 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1975
1976 repr = PyString_FromStringAndSize(NULL, mbcssize);
1977 if (repr == NULL)
1978 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001979 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001980 return repr;
1981
1982 /* Do the conversion */
1983 s = PyString_AS_STRING(repr);
1984 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1985 Py_DECREF(repr);
1986 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1987 }
1988 return repr;
1989}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001990
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001991#endif /* MS_WIN32 */
1992
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993/* --- Character Mapping Codec -------------------------------------------- */
1994
1995static
1996int charmap_decoding_error(const char **source,
1997 Py_UNICODE **dest,
1998 const char *errors,
1999 const char *details)
2000{
2001 if ((errors == NULL) ||
2002 (strcmp(errors,"strict") == 0)) {
2003 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002004 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 details);
2006 return -1;
2007 }
2008 else if (strcmp(errors,"ignore") == 0) {
2009 return 0;
2010 }
2011 else if (strcmp(errors,"replace") == 0) {
2012 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2013 (*dest)++;
2014 return 0;
2015 }
2016 else {
2017 PyErr_Format(PyExc_ValueError,
2018 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002019 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020 errors);
2021 return -1;
2022 }
2023}
2024
2025PyObject *PyUnicode_DecodeCharmap(const char *s,
2026 int size,
2027 PyObject *mapping,
2028 const char *errors)
2029{
2030 PyUnicodeObject *v;
2031 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002032 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
2034 /* Default to Latin-1 */
2035 if (mapping == NULL)
2036 return PyUnicode_DecodeLatin1(s, size, errors);
2037
2038 v = _PyUnicode_New(size);
2039 if (v == NULL)
2040 goto onError;
2041 if (size == 0)
2042 return (PyObject *)v;
2043 p = PyUnicode_AS_UNICODE(v);
2044 while (size-- > 0) {
2045 unsigned char ch = *s++;
2046 PyObject *w, *x;
2047
2048 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2049 w = PyInt_FromLong((long)ch);
2050 if (w == NULL)
2051 goto onError;
2052 x = PyObject_GetItem(mapping, w);
2053 Py_DECREF(w);
2054 if (x == NULL) {
2055 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002056 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002058 x = Py_None;
2059 Py_INCREF(x);
2060 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002061 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 }
2063
2064 /* Apply mapping */
2065 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002066 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 if (value < 0 || value > 65535) {
2068 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002069 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 Py_DECREF(x);
2071 goto onError;
2072 }
2073 *p++ = (Py_UNICODE)value;
2074 }
2075 else if (x == Py_None) {
2076 /* undefined mapping */
2077 if (charmap_decoding_error(&s, &p, errors,
2078 "character maps to <undefined>")) {
2079 Py_DECREF(x);
2080 goto onError;
2081 }
2082 }
2083 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002084 int targetsize = PyUnicode_GET_SIZE(x);
2085
2086 if (targetsize == 1)
2087 /* 1-1 mapping */
2088 *p++ = *PyUnicode_AS_UNICODE(x);
2089
2090 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002092 if (targetsize > extrachars) {
2093 /* resize first */
2094 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2095 int needed = (targetsize - extrachars) + \
2096 (targetsize << 2);
2097 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002098 if (_PyUnicode_Resize(&v,
2099 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002100 Py_DECREF(x);
2101 goto onError;
2102 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002103 p = PyUnicode_AS_UNICODE(v) + oldpos;
2104 }
2105 Py_UNICODE_COPY(p,
2106 PyUnicode_AS_UNICODE(x),
2107 targetsize);
2108 p += targetsize;
2109 extrachars -= targetsize;
2110 }
2111 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 }
2113 else {
2114 /* wrong return value */
2115 PyErr_SetString(PyExc_TypeError,
2116 "character mapping must return integer, None or unicode");
2117 Py_DECREF(x);
2118 goto onError;
2119 }
2120 Py_DECREF(x);
2121 }
2122 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002123 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 goto onError;
2125 return (PyObject *)v;
2126
2127 onError:
2128 Py_XDECREF(v);
2129 return NULL;
2130}
2131
2132static
2133int charmap_encoding_error(const Py_UNICODE **source,
2134 char **dest,
2135 const char *errors,
2136 const char *details)
2137{
2138 if ((errors == NULL) ||
2139 (strcmp(errors,"strict") == 0)) {
2140 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002141 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 details);
2143 return -1;
2144 }
2145 else if (strcmp(errors,"ignore") == 0) {
2146 return 0;
2147 }
2148 else if (strcmp(errors,"replace") == 0) {
2149 **dest = '?';
2150 (*dest)++;
2151 return 0;
2152 }
2153 else {
2154 PyErr_Format(PyExc_ValueError,
2155 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002156 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 errors);
2158 return -1;
2159 }
2160}
2161
2162PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2163 int size,
2164 PyObject *mapping,
2165 const char *errors)
2166{
2167 PyObject *v;
2168 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002169 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170
2171 /* Default to Latin-1 */
2172 if (mapping == NULL)
2173 return PyUnicode_EncodeLatin1(p, size, errors);
2174
2175 v = PyString_FromStringAndSize(NULL, size);
2176 if (v == NULL)
2177 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002178 if (size == 0)
2179 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 s = PyString_AS_STRING(v);
2181 while (size-- > 0) {
2182 Py_UNICODE ch = *p++;
2183 PyObject *w, *x;
2184
2185 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2186 w = PyInt_FromLong((long)ch);
2187 if (w == NULL)
2188 goto onError;
2189 x = PyObject_GetItem(mapping, w);
2190 Py_DECREF(w);
2191 if (x == NULL) {
2192 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002193 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002195 x = Py_None;
2196 Py_INCREF(x);
2197 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002198 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 }
2200
2201 /* Apply mapping */
2202 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002203 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 if (value < 0 || value > 255) {
2205 PyErr_SetString(PyExc_TypeError,
2206 "character mapping must be in range(256)");
2207 Py_DECREF(x);
2208 goto onError;
2209 }
2210 *s++ = (char)value;
2211 }
2212 else if (x == Py_None) {
2213 /* undefined mapping */
2214 if (charmap_encoding_error(&p, &s, errors,
2215 "character maps to <undefined>")) {
2216 Py_DECREF(x);
2217 goto onError;
2218 }
2219 }
2220 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002221 int targetsize = PyString_GET_SIZE(x);
2222
2223 if (targetsize == 1)
2224 /* 1-1 mapping */
2225 *s++ = *PyString_AS_STRING(x);
2226
2227 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002229 if (targetsize > extrachars) {
2230 /* resize first */
2231 int oldpos = (int)(s - PyString_AS_STRING(v));
2232 int needed = (targetsize - extrachars) + \
2233 (targetsize << 2);
2234 extrachars += needed;
2235 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002236 Py_DECREF(x);
2237 goto onError;
2238 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002239 s = PyString_AS_STRING(v) + oldpos;
2240 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002241 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002242 s += targetsize;
2243 extrachars -= targetsize;
2244 }
2245 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246 }
2247 else {
2248 /* wrong return value */
2249 PyErr_SetString(PyExc_TypeError,
2250 "character mapping must return integer, None or unicode");
2251 Py_DECREF(x);
2252 goto onError;
2253 }
2254 Py_DECREF(x);
2255 }
2256 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2257 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2258 goto onError;
2259 return v;
2260
2261 onError:
2262 Py_DECREF(v);
2263 return NULL;
2264}
2265
2266PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2267 PyObject *mapping)
2268{
2269 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2270 PyErr_BadArgument();
2271 return NULL;
2272 }
2273 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2274 PyUnicode_GET_SIZE(unicode),
2275 mapping,
2276 NULL);
2277}
2278
2279static
2280int translate_error(const Py_UNICODE **source,
2281 Py_UNICODE **dest,
2282 const char *errors,
2283 const char *details)
2284{
2285 if ((errors == NULL) ||
2286 (strcmp(errors,"strict") == 0)) {
2287 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002288 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 details);
2290 return -1;
2291 }
2292 else if (strcmp(errors,"ignore") == 0) {
2293 return 0;
2294 }
2295 else if (strcmp(errors,"replace") == 0) {
2296 **dest = '?';
2297 (*dest)++;
2298 return 0;
2299 }
2300 else {
2301 PyErr_Format(PyExc_ValueError,
2302 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002303 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 errors);
2305 return -1;
2306 }
2307}
2308
2309PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2310 int size,
2311 PyObject *mapping,
2312 const char *errors)
2313{
2314 PyUnicodeObject *v;
2315 Py_UNICODE *p;
2316
2317 if (mapping == NULL) {
2318 PyErr_BadArgument();
2319 return NULL;
2320 }
2321
2322 /* Output will never be longer than input */
2323 v = _PyUnicode_New(size);
2324 if (v == NULL)
2325 goto onError;
2326 if (size == 0)
2327 goto done;
2328 p = PyUnicode_AS_UNICODE(v);
2329 while (size-- > 0) {
2330 Py_UNICODE ch = *s++;
2331 PyObject *w, *x;
2332
2333 /* Get mapping */
2334 w = PyInt_FromLong(ch);
2335 if (w == NULL)
2336 goto onError;
2337 x = PyObject_GetItem(mapping, w);
2338 Py_DECREF(w);
2339 if (x == NULL) {
2340 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2341 /* No mapping found: default to 1-1 mapping */
2342 PyErr_Clear();
2343 *p++ = ch;
2344 continue;
2345 }
2346 goto onError;
2347 }
2348
2349 /* Apply mapping */
2350 if (PyInt_Check(x))
2351 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2352 else if (x == Py_None) {
2353 /* undefined mapping */
2354 if (translate_error(&s, &p, errors,
2355 "character maps to <undefined>")) {
2356 Py_DECREF(x);
2357 goto onError;
2358 }
2359 }
2360 else if (PyUnicode_Check(x)) {
2361 if (PyUnicode_GET_SIZE(x) != 1) {
2362 /* 1-n mapping */
2363 PyErr_SetString(PyExc_NotImplementedError,
2364 "1-n mappings are currently not implemented");
2365 Py_DECREF(x);
2366 goto onError;
2367 }
2368 *p++ = *PyUnicode_AS_UNICODE(x);
2369 }
2370 else {
2371 /* wrong return value */
2372 PyErr_SetString(PyExc_TypeError,
2373 "translate mapping must return integer, None or unicode");
2374 Py_DECREF(x);
2375 goto onError;
2376 }
2377 Py_DECREF(x);
2378 }
2379 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002380 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002382
2383 done:
2384 return (PyObject *)v;
2385
2386 onError:
2387 Py_XDECREF(v);
2388 return NULL;
2389}
2390
2391PyObject *PyUnicode_Translate(PyObject *str,
2392 PyObject *mapping,
2393 const char *errors)
2394{
2395 PyObject *result;
2396
2397 str = PyUnicode_FromObject(str);
2398 if (str == NULL)
2399 goto onError;
2400 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2401 PyUnicode_GET_SIZE(str),
2402 mapping,
2403 errors);
2404 Py_DECREF(str);
2405 return result;
2406
2407 onError:
2408 Py_XDECREF(str);
2409 return NULL;
2410}
2411
Guido van Rossum9e896b32000-04-05 20:11:21 +00002412/* --- Decimal Encoder ---------------------------------------------------- */
2413
2414int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2415 int length,
2416 char *output,
2417 const char *errors)
2418{
2419 Py_UNICODE *p, *end;
2420
2421 if (output == NULL) {
2422 PyErr_BadArgument();
2423 return -1;
2424 }
2425
2426 p = s;
2427 end = s + length;
2428 while (p < end) {
2429 register Py_UNICODE ch = *p++;
2430 int decimal;
2431
2432 if (Py_UNICODE_ISSPACE(ch)) {
2433 *output++ = ' ';
2434 continue;
2435 }
2436 decimal = Py_UNICODE_TODECIMAL(ch);
2437 if (decimal >= 0) {
2438 *output++ = '0' + decimal;
2439 continue;
2440 }
Guido van Rossumba477042000-04-06 18:18:10 +00002441 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002442 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002443 continue;
2444 }
2445 /* All other characters are considered invalid */
2446 if (errors == NULL || strcmp(errors, "strict") == 0) {
2447 PyErr_SetString(PyExc_ValueError,
2448 "invalid decimal Unicode string");
2449 goto onError;
2450 }
2451 else if (strcmp(errors, "ignore") == 0)
2452 continue;
2453 else if (strcmp(errors, "replace") == 0) {
2454 *output++ = '?';
2455 continue;
2456 }
2457 }
2458 /* 0-terminate the output string */
2459 *output++ = '\0';
2460 return 0;
2461
2462 onError:
2463 return -1;
2464}
2465
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466/* --- Helpers ------------------------------------------------------------ */
2467
2468static
2469int count(PyUnicodeObject *self,
2470 int start,
2471 int end,
2472 PyUnicodeObject *substring)
2473{
2474 int count = 0;
2475
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002476 if (start < 0)
2477 start += self->length;
2478 if (start < 0)
2479 start = 0;
2480 if (end > self->length)
2481 end = self->length;
2482 if (end < 0)
2483 end += self->length;
2484 if (end < 0)
2485 end = 0;
2486
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002487 if (substring->length == 0)
2488 return (end - start + 1);
2489
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 end -= substring->length;
2491
2492 while (start <= end)
2493 if (Py_UNICODE_MATCH(self, start, substring)) {
2494 count++;
2495 start += substring->length;
2496 } else
2497 start++;
2498
2499 return count;
2500}
2501
2502int PyUnicode_Count(PyObject *str,
2503 PyObject *substr,
2504 int start,
2505 int end)
2506{
2507 int result;
2508
2509 str = PyUnicode_FromObject(str);
2510 if (str == NULL)
2511 return -1;
2512 substr = PyUnicode_FromObject(substr);
2513 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002514 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 return -1;
2516 }
2517
2518 result = count((PyUnicodeObject *)str,
2519 start, end,
2520 (PyUnicodeObject *)substr);
2521
2522 Py_DECREF(str);
2523 Py_DECREF(substr);
2524 return result;
2525}
2526
2527static
2528int findstring(PyUnicodeObject *self,
2529 PyUnicodeObject *substring,
2530 int start,
2531 int end,
2532 int direction)
2533{
2534 if (start < 0)
2535 start += self->length;
2536 if (start < 0)
2537 start = 0;
2538
2539 if (substring->length == 0)
2540 return start;
2541
2542 if (end > self->length)
2543 end = self->length;
2544 if (end < 0)
2545 end += self->length;
2546 if (end < 0)
2547 end = 0;
2548
2549 end -= substring->length;
2550
2551 if (direction < 0) {
2552 for (; end >= start; end--)
2553 if (Py_UNICODE_MATCH(self, end, substring))
2554 return end;
2555 } else {
2556 for (; start <= end; start++)
2557 if (Py_UNICODE_MATCH(self, start, substring))
2558 return start;
2559 }
2560
2561 return -1;
2562}
2563
2564int PyUnicode_Find(PyObject *str,
2565 PyObject *substr,
2566 int start,
2567 int end,
2568 int direction)
2569{
2570 int result;
2571
2572 str = PyUnicode_FromObject(str);
2573 if (str == NULL)
2574 return -1;
2575 substr = PyUnicode_FromObject(substr);
2576 if (substr == NULL) {
2577 Py_DECREF(substr);
2578 return -1;
2579 }
2580
2581 result = findstring((PyUnicodeObject *)str,
2582 (PyUnicodeObject *)substr,
2583 start, end, direction);
2584 Py_DECREF(str);
2585 Py_DECREF(substr);
2586 return result;
2587}
2588
2589static
2590int tailmatch(PyUnicodeObject *self,
2591 PyUnicodeObject *substring,
2592 int start,
2593 int end,
2594 int direction)
2595{
2596 if (start < 0)
2597 start += self->length;
2598 if (start < 0)
2599 start = 0;
2600
2601 if (substring->length == 0)
2602 return 1;
2603
2604 if (end > self->length)
2605 end = self->length;
2606 if (end < 0)
2607 end += self->length;
2608 if (end < 0)
2609 end = 0;
2610
2611 end -= substring->length;
2612 if (end < start)
2613 return 0;
2614
2615 if (direction > 0) {
2616 if (Py_UNICODE_MATCH(self, end, substring))
2617 return 1;
2618 } else {
2619 if (Py_UNICODE_MATCH(self, start, substring))
2620 return 1;
2621 }
2622
2623 return 0;
2624}
2625
2626int PyUnicode_Tailmatch(PyObject *str,
2627 PyObject *substr,
2628 int start,
2629 int end,
2630 int direction)
2631{
2632 int result;
2633
2634 str = PyUnicode_FromObject(str);
2635 if (str == NULL)
2636 return -1;
2637 substr = PyUnicode_FromObject(substr);
2638 if (substr == NULL) {
2639 Py_DECREF(substr);
2640 return -1;
2641 }
2642
2643 result = tailmatch((PyUnicodeObject *)str,
2644 (PyUnicodeObject *)substr,
2645 start, end, direction);
2646 Py_DECREF(str);
2647 Py_DECREF(substr);
2648 return result;
2649}
2650
2651static
2652const Py_UNICODE *findchar(const Py_UNICODE *s,
2653 int size,
2654 Py_UNICODE ch)
2655{
2656 /* like wcschr, but doesn't stop at NULL characters */
2657
2658 while (size-- > 0) {
2659 if (*s == ch)
2660 return s;
2661 s++;
2662 }
2663
2664 return NULL;
2665}
2666
2667/* Apply fixfct filter to the Unicode object self and return a
2668 reference to the modified object */
2669
2670static
2671PyObject *fixup(PyUnicodeObject *self,
2672 int (*fixfct)(PyUnicodeObject *s))
2673{
2674
2675 PyUnicodeObject *u;
2676
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002677 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 if (u == NULL)
2679 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002680
2681 Py_UNICODE_COPY(u->str, self->str, self->length);
2682
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 if (!fixfct(u)) {
2684 /* fixfct should return TRUE if it modified the buffer. If
2685 FALSE, return a reference to the original buffer instead
2686 (to save space, not time) */
2687 Py_INCREF(self);
2688 Py_DECREF(u);
2689 return (PyObject*) self;
2690 }
2691 return (PyObject*) u;
2692}
2693
2694static
2695int fixupper(PyUnicodeObject *self)
2696{
2697 int len = self->length;
2698 Py_UNICODE *s = self->str;
2699 int status = 0;
2700
2701 while (len-- > 0) {
2702 register Py_UNICODE ch;
2703
2704 ch = Py_UNICODE_TOUPPER(*s);
2705 if (ch != *s) {
2706 status = 1;
2707 *s = ch;
2708 }
2709 s++;
2710 }
2711
2712 return status;
2713}
2714
2715static
2716int fixlower(PyUnicodeObject *self)
2717{
2718 int len = self->length;
2719 Py_UNICODE *s = self->str;
2720 int status = 0;
2721
2722 while (len-- > 0) {
2723 register Py_UNICODE ch;
2724
2725 ch = Py_UNICODE_TOLOWER(*s);
2726 if (ch != *s) {
2727 status = 1;
2728 *s = ch;
2729 }
2730 s++;
2731 }
2732
2733 return status;
2734}
2735
2736static
2737int fixswapcase(PyUnicodeObject *self)
2738{
2739 int len = self->length;
2740 Py_UNICODE *s = self->str;
2741 int status = 0;
2742
2743 while (len-- > 0) {
2744 if (Py_UNICODE_ISUPPER(*s)) {
2745 *s = Py_UNICODE_TOLOWER(*s);
2746 status = 1;
2747 } else if (Py_UNICODE_ISLOWER(*s)) {
2748 *s = Py_UNICODE_TOUPPER(*s);
2749 status = 1;
2750 }
2751 s++;
2752 }
2753
2754 return status;
2755}
2756
2757static
2758int fixcapitalize(PyUnicodeObject *self)
2759{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002760 int len = self->length;
2761 Py_UNICODE *s = self->str;
2762 int status = 0;
2763
2764 if (len == 0)
2765 return 0;
2766 if (Py_UNICODE_ISLOWER(*s)) {
2767 *s = Py_UNICODE_TOUPPER(*s);
2768 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002770 s++;
2771 while (--len > 0) {
2772 if (Py_UNICODE_ISUPPER(*s)) {
2773 *s = Py_UNICODE_TOLOWER(*s);
2774 status = 1;
2775 }
2776 s++;
2777 }
2778 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779}
2780
2781static
2782int fixtitle(PyUnicodeObject *self)
2783{
2784 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2785 register Py_UNICODE *e;
2786 int previous_is_cased;
2787
2788 /* Shortcut for single character strings */
2789 if (PyUnicode_GET_SIZE(self) == 1) {
2790 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2791 if (*p != ch) {
2792 *p = ch;
2793 return 1;
2794 }
2795 else
2796 return 0;
2797 }
2798
2799 e = p + PyUnicode_GET_SIZE(self);
2800 previous_is_cased = 0;
2801 for (; p < e; p++) {
2802 register const Py_UNICODE ch = *p;
2803
2804 if (previous_is_cased)
2805 *p = Py_UNICODE_TOLOWER(ch);
2806 else
2807 *p = Py_UNICODE_TOTITLE(ch);
2808
2809 if (Py_UNICODE_ISLOWER(ch) ||
2810 Py_UNICODE_ISUPPER(ch) ||
2811 Py_UNICODE_ISTITLE(ch))
2812 previous_is_cased = 1;
2813 else
2814 previous_is_cased = 0;
2815 }
2816 return 1;
2817}
2818
2819PyObject *PyUnicode_Join(PyObject *separator,
2820 PyObject *seq)
2821{
2822 Py_UNICODE *sep;
2823 int seplen;
2824 PyUnicodeObject *res = NULL;
2825 int reslen = 0;
2826 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 int sz = 100;
2828 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002829 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830
Tim Peters2cfe3682001-05-05 05:36:48 +00002831 it = PyObject_GetIter(seq);
2832 if (it == NULL)
2833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
2835 if (separator == NULL) {
2836 Py_UNICODE blank = ' ';
2837 sep = &blank;
2838 seplen = 1;
2839 }
2840 else {
2841 separator = PyUnicode_FromObject(separator);
2842 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002843 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 sep = PyUnicode_AS_UNICODE(separator);
2845 seplen = PyUnicode_GET_SIZE(separator);
2846 }
2847
2848 res = _PyUnicode_New(sz);
2849 if (res == NULL)
2850 goto onError;
2851 p = PyUnicode_AS_UNICODE(res);
2852 reslen = 0;
2853
Tim Peters2cfe3682001-05-05 05:36:48 +00002854 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002856 PyObject *item = PyIter_Next(it);
2857 if (item == NULL) {
2858 if (PyErr_Occurred())
2859 goto onError;
2860 break;
2861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 if (!PyUnicode_Check(item)) {
2863 PyObject *v;
2864 v = PyUnicode_FromObject(item);
2865 Py_DECREF(item);
2866 item = v;
2867 if (item == NULL)
2868 goto onError;
2869 }
2870 itemlen = PyUnicode_GET_SIZE(item);
2871 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002872 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 goto onError;
2874 sz *= 2;
2875 p = PyUnicode_AS_UNICODE(res) + reslen;
2876 }
2877 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002878 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 p += seplen;
2880 reslen += seplen;
2881 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002882 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 p += itemlen;
2884 reslen += itemlen;
2885 Py_DECREF(item);
2886 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002887 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 goto onError;
2889
2890 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002891 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 return (PyObject *)res;
2893
2894 onError:
2895 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002896 Py_XDECREF(res);
2897 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 return NULL;
2899}
2900
2901static
2902PyUnicodeObject *pad(PyUnicodeObject *self,
2903 int left,
2904 int right,
2905 Py_UNICODE fill)
2906{
2907 PyUnicodeObject *u;
2908
2909 if (left < 0)
2910 left = 0;
2911 if (right < 0)
2912 right = 0;
2913
2914 if (left == 0 && right == 0) {
2915 Py_INCREF(self);
2916 return self;
2917 }
2918
2919 u = _PyUnicode_New(left + self->length + right);
2920 if (u) {
2921 if (left)
2922 Py_UNICODE_FILL(u->str, fill, left);
2923 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2924 if (right)
2925 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2926 }
2927
2928 return u;
2929}
2930
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion relies on names from the enclosing function: a
   `PyObject *str` scratch variable, the `list` being built, and an
   `onError` label that is jumped to when either the slice cannot be
   created or the append fails.

   The body is wrapped in do { } while (0) and the arguments are
   parenthesized so that the macro behaves like a single statement and
   accepts arbitrary expressions. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2941
2942static
2943PyObject *split_whitespace(PyUnicodeObject *self,
2944 PyObject *list,
2945 int maxcount)
2946{
2947 register int i;
2948 register int j;
2949 int len = self->length;
2950 PyObject *str;
2951
2952 for (i = j = 0; i < len; ) {
2953 /* find a token */
2954 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2955 i++;
2956 j = i;
2957 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2958 i++;
2959 if (j < i) {
2960 if (maxcount-- <= 0)
2961 break;
2962 SPLIT_APPEND(self->str, j, i);
2963 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2964 i++;
2965 j = i;
2966 }
2967 }
2968 if (j < len) {
2969 SPLIT_APPEND(self->str, j, len);
2970 }
2971 return list;
2972
2973 onError:
2974 Py_DECREF(list);
2975 return NULL;
2976}
2977
2978PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002979 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980{
2981 register int i;
2982 register int j;
2983 int len;
2984 PyObject *list;
2985 PyObject *str;
2986 Py_UNICODE *data;
2987
2988 string = PyUnicode_FromObject(string);
2989 if (string == NULL)
2990 return NULL;
2991 data = PyUnicode_AS_UNICODE(string);
2992 len = PyUnicode_GET_SIZE(string);
2993
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 list = PyList_New(0);
2995 if (!list)
2996 goto onError;
2997
2998 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002999 int eol;
3000
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 /* Find a line and append it */
3002 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3003 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004
3005 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003006 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 if (i < len) {
3008 if (data[i] == '\r' && i + 1 < len &&
3009 data[i+1] == '\n')
3010 i += 2;
3011 else
3012 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003013 if (keepends)
3014 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 }
Guido van Rossum86662912000-04-11 15:38:46 +00003016 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 j = i;
3018 }
3019 if (j < len) {
3020 SPLIT_APPEND(data, j, len);
3021 }
3022
3023 Py_DECREF(string);
3024 return list;
3025
3026 onError:
3027 Py_DECREF(list);
3028 Py_DECREF(string);
3029 return NULL;
3030}
3031
3032static
3033PyObject *split_char(PyUnicodeObject *self,
3034 PyObject *list,
3035 Py_UNICODE ch,
3036 int maxcount)
3037{
3038 register int i;
3039 register int j;
3040 int len = self->length;
3041 PyObject *str;
3042
3043 for (i = j = 0; i < len; ) {
3044 if (self->str[i] == ch) {
3045 if (maxcount-- <= 0)
3046 break;
3047 SPLIT_APPEND(self->str, j, i);
3048 i = j = i + 1;
3049 } else
3050 i++;
3051 }
3052 if (j <= len) {
3053 SPLIT_APPEND(self->str, j, len);
3054 }
3055 return list;
3056
3057 onError:
3058 Py_DECREF(list);
3059 return NULL;
3060}
3061
3062static
3063PyObject *split_substring(PyUnicodeObject *self,
3064 PyObject *list,
3065 PyUnicodeObject *substring,
3066 int maxcount)
3067{
3068 register int i;
3069 register int j;
3070 int len = self->length;
3071 int sublen = substring->length;
3072 PyObject *str;
3073
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003074 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 if (Py_UNICODE_MATCH(self, i, substring)) {
3076 if (maxcount-- <= 0)
3077 break;
3078 SPLIT_APPEND(self->str, j, i);
3079 i = j = i + sublen;
3080 } else
3081 i++;
3082 }
3083 if (j <= len) {
3084 SPLIT_APPEND(self->str, j, len);
3085 }
3086 return list;
3087
3088 onError:
3089 Py_DECREF(list);
3090 return NULL;
3091}
3092
3093#undef SPLIT_APPEND
3094
3095static
3096PyObject *split(PyUnicodeObject *self,
3097 PyUnicodeObject *substring,
3098 int maxcount)
3099{
3100 PyObject *list;
3101
3102 if (maxcount < 0)
3103 maxcount = INT_MAX;
3104
3105 list = PyList_New(0);
3106 if (!list)
3107 return NULL;
3108
3109 if (substring == NULL)
3110 return split_whitespace(self,list,maxcount);
3111
3112 else if (substring->length == 1)
3113 return split_char(self,list,substring->str[0],maxcount);
3114
3115 else if (substring->length == 0) {
3116 Py_DECREF(list);
3117 PyErr_SetString(PyExc_ValueError, "empty separator");
3118 return NULL;
3119 }
3120 else
3121 return split_substring(self,list,substring,maxcount);
3122}
3123
3124static
3125PyObject *strip(PyUnicodeObject *self,
3126 int left,
3127 int right)
3128{
3129 Py_UNICODE *p = self->str;
3130 int start = 0;
3131 int end = self->length;
3132
3133 if (left)
3134 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3135 start++;
3136
3137 if (right)
3138 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3139 end--;
3140
3141 if (start == 0 && end == self->length) {
3142 /* couldn't strip anything off, return original string */
3143 Py_INCREF(self);
3144 return (PyObject*) self;
3145 }
3146
3147 return (PyObject*) PyUnicode_FromUnicode(
3148 self->str + start,
3149 end - start
3150 );
3151}
3152
3153static
3154PyObject *replace(PyUnicodeObject *self,
3155 PyUnicodeObject *str1,
3156 PyUnicodeObject *str2,
3157 int maxcount)
3158{
3159 PyUnicodeObject *u;
3160
3161 if (maxcount < 0)
3162 maxcount = INT_MAX;
3163
3164 if (str1->length == 1 && str2->length == 1) {
3165 int i;
3166
3167 /* replace characters */
3168 if (!findchar(self->str, self->length, str1->str[0])) {
3169 /* nothing to replace, return original string */
3170 Py_INCREF(self);
3171 u = self;
3172 } else {
3173 Py_UNICODE u1 = str1->str[0];
3174 Py_UNICODE u2 = str2->str[0];
3175
3176 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003177 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 self->length
3179 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003180 if (u != NULL) {
3181 Py_UNICODE_COPY(u->str, self->str,
3182 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 for (i = 0; i < u->length; i++)
3184 if (u->str[i] == u1) {
3185 if (--maxcount < 0)
3186 break;
3187 u->str[i] = u2;
3188 }
3189 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191
3192 } else {
3193 int n, i;
3194 Py_UNICODE *p;
3195
3196 /* replace strings */
3197 n = count(self, 0, self->length, str1);
3198 if (n > maxcount)
3199 n = maxcount;
3200 if (n == 0) {
3201 /* nothing to replace, return original string */
3202 Py_INCREF(self);
3203 u = self;
3204 } else {
3205 u = _PyUnicode_New(
3206 self->length + n * (str2->length - str1->length));
3207 if (u) {
3208 i = 0;
3209 p = u->str;
3210 while (i <= self->length - str1->length)
3211 if (Py_UNICODE_MATCH(self, i, str1)) {
3212 /* replace string segment */
3213 Py_UNICODE_COPY(p, str2->str, str2->length);
3214 p += str2->length;
3215 i += str1->length;
3216 if (--n <= 0) {
3217 /* copy remaining part */
3218 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3219 break;
3220 }
3221 } else
3222 *p++ = self->str[i++];
3223 }
3224 }
3225 }
3226
3227 return (PyObject *) u;
3228}
3229
3230/* --- Unicode Object Methods --------------------------------------------- */
3231
3232static char title__doc__[] =
3233"S.title() -> unicode\n\
3234\n\
3235Return a titlecased version of S, i.e. words start with title case\n\
3236characters, all remaining cased characters have lower case.";
3237
3238static PyObject*
3239unicode_title(PyUnicodeObject *self, PyObject *args)
3240{
3241 if (!PyArg_NoArgs(args))
3242 return NULL;
3243 return fixup(self, fixtitle);
3244}
3245
3246static char capitalize__doc__[] =
3247"S.capitalize() -> unicode\n\
3248\n\
3249Return a capitalized version of S, i.e. make the first character\n\
3250have upper case.";
3251
3252static PyObject*
3253unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3254{
3255 if (!PyArg_NoArgs(args))
3256 return NULL;
3257 return fixup(self, fixcapitalize);
3258}
3259
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split on whitespace,
   capitalize every word in place in the list, then join the words
   again via PyUnicode_Join(NULL, list). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *word;
    int n;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, n),
                     fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, n));
        PyList_SET_ITEM(words, n, word);
    }

    /* Join the words to form a new string */
    word = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)word;
}
#endif
3300
3301static char center__doc__[] =
3302"S.center(width) -> unicode\n\
3303\n\
3304Return S centered in a Unicode string of length width. Padding is done\n\
3305using spaces.";
3306
3307static PyObject *
3308unicode_center(PyUnicodeObject *self, PyObject *args)
3309{
3310 int marg, left;
3311 int width;
3312
3313 if (!PyArg_ParseTuple(args, "i:center", &width))
3314 return NULL;
3315
3316 if (self->length >= width) {
3317 Py_INCREF(self);
3318 return (PyObject*) self;
3319 }
3320
3321 marg = width - self->length;
3322 left = marg / 2 + (marg & width & 1);
3323
3324 return (PyObject*) pad(self, left, marg - left, ' ');
3325}
3326
Marc-André Lemburge5034372000-08-08 08:04:29 +00003327#if 0
3328
3329/* This code should go into some future Unicode collation support
3330 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003331 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003332
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003333/* speedy UTF-16 code point order comparison */
3334/* gleaned from: */
3335/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3336
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003337static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003338{
3339 0, 0, 0, 0, 0, 0, 0, 0,
3340 0, 0, 0, 0, 0, 0, 0, 0,
3341 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003342 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003343};
3344
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345static int
3346unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3347{
3348 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003349
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 Py_UNICODE *s1 = str1->str;
3351 Py_UNICODE *s2 = str2->str;
3352
3353 len1 = str1->length;
3354 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003355
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003357 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003358
3359 c1 = *s1++;
3360 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003361
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003362 if (c1 > (1<<11) * 26)
3363 c1 += utf16Fixup[c1>>11];
3364 if (c2 > (1<<11) * 26)
3365 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003366 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003367
3368 if (c1 != c2)
3369 return (c1 < c2) ? -1 : 1;
3370
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003371 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 }
3373
3374 return (len1 < len2) ? -1 : (len1 != len2);
3375}
3376
Marc-André Lemburge5034372000-08-08 08:04:29 +00003377#else
3378
3379static int
3380unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3381{
3382 register int len1, len2;
3383
3384 Py_UNICODE *s1 = str1->str;
3385 Py_UNICODE *s2 = str2->str;
3386
3387 len1 = str1->length;
3388 len2 = str2->length;
3389
3390 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003391 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003392
Fredrik Lundh45714e92001-06-26 16:39:36 +00003393 c1 = *s1++;
3394 c2 = *s2++;
3395
3396 if (c1 != c2)
3397 return (c1 < c2) ? -1 : 1;
3398
Marc-André Lemburge5034372000-08-08 08:04:29 +00003399 len1--; len2--;
3400 }
3401
3402 return (len1 < len2) ? -1 : (len1 != len2);
3403}
3404
3405#endif
3406
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407int PyUnicode_Compare(PyObject *left,
3408 PyObject *right)
3409{
3410 PyUnicodeObject *u = NULL, *v = NULL;
3411 int result;
3412
3413 /* Coerce the two arguments */
3414 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3415 if (u == NULL)
3416 goto onError;
3417 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3418 if (v == NULL)
3419 goto onError;
3420
Thomas Wouters7e474022000-07-16 12:04:32 +00003421 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 if (v == u) {
3423 Py_DECREF(u);
3424 Py_DECREF(v);
3425 return 0;
3426 }
3427
3428 result = unicode_compare(u, v);
3429
3430 Py_DECREF(u);
3431 Py_DECREF(v);
3432 return result;
3433
3434onError:
3435 Py_XDECREF(u);
3436 Py_XDECREF(v);
3437 return -1;
3438}
3439
Guido van Rossum403d68b2000-03-13 15:55:09 +00003440int PyUnicode_Contains(PyObject *container,
3441 PyObject *element)
3442{
3443 PyUnicodeObject *u = NULL, *v = NULL;
3444 int result;
3445 register const Py_UNICODE *p, *e;
3446 register Py_UNICODE ch;
3447
3448 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003449 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003450 if (v == NULL) {
3451 PyErr_SetString(PyExc_TypeError,
3452 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003453 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003454 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003455 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3456 if (u == NULL) {
3457 Py_DECREF(v);
3458 goto onError;
3459 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003460
3461 /* Check v in u */
3462 if (PyUnicode_GET_SIZE(v) != 1) {
3463 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003464 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003465 goto onError;
3466 }
3467 ch = *PyUnicode_AS_UNICODE(v);
3468 p = PyUnicode_AS_UNICODE(u);
3469 e = p + PyUnicode_GET_SIZE(u);
3470 result = 0;
3471 while (p < e) {
3472 if (*p++ == ch) {
3473 result = 1;
3474 break;
3475 }
3476 }
3477
3478 Py_DECREF(u);
3479 Py_DECREF(v);
3480 return result;
3481
3482onError:
3483 Py_XDECREF(u);
3484 Py_XDECREF(v);
3485 return -1;
3486}
3487
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488/* Concat to string or Unicode object giving a new Unicode object. */
3489
3490PyObject *PyUnicode_Concat(PyObject *left,
3491 PyObject *right)
3492{
3493 PyUnicodeObject *u = NULL, *v = NULL, *w;
3494
3495 /* Coerce the two arguments */
3496 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3497 if (u == NULL)
3498 goto onError;
3499 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3500 if (v == NULL)
3501 goto onError;
3502
3503 /* Shortcuts */
3504 if (v == unicode_empty) {
3505 Py_DECREF(v);
3506 return (PyObject *)u;
3507 }
3508 if (u == unicode_empty) {
3509 Py_DECREF(u);
3510 return (PyObject *)v;
3511 }
3512
3513 /* Concat the two Unicode strings */
3514 w = _PyUnicode_New(u->length + v->length);
3515 if (w == NULL)
3516 goto onError;
3517 Py_UNICODE_COPY(w->str, u->str, u->length);
3518 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3519
3520 Py_DECREF(u);
3521 Py_DECREF(v);
3522 return (PyObject *)w;
3523
3524onError:
3525 Py_XDECREF(u);
3526 Py_XDECREF(v);
3527 return NULL;
3528}
3529
3530static char count__doc__[] =
3531"S.count(sub[, start[, end]]) -> int\n\
3532\n\
3533Return the number of occurrences of substring sub in Unicode string\n\
3534S[start:end]. Optional arguments start and end are\n\
3535interpreted as in slice notation.";
3536
3537static PyObject *
3538unicode_count(PyUnicodeObject *self, PyObject *args)
3539{
3540 PyUnicodeObject *substring;
3541 int start = 0;
3542 int end = INT_MAX;
3543 PyObject *result;
3544
Guido van Rossumb8872e62000-05-09 14:14:27 +00003545 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3546 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 return NULL;
3548
3549 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3550 (PyObject *)substring);
3551 if (substring == NULL)
3552 return NULL;
3553
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 if (start < 0)
3555 start += self->length;
3556 if (start < 0)
3557 start = 0;
3558 if (end > self->length)
3559 end = self->length;
3560 if (end < 0)
3561 end += self->length;
3562 if (end < 0)
3563 end = 0;
3564
3565 result = PyInt_FromLong((long) count(self, start, end, substring));
3566
3567 Py_DECREF(substring);
3568 return result;
3569}
3570
3571static char encode__doc__[] =
3572"S.encode([encoding[,errors]]) -> string\n\
3573\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003574Return an encoded string version of S. Default encoding is the current\n\
3575default string encoding. errors may be given to set a different error\n\
3576handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3577a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578
3579static PyObject *
3580unicode_encode(PyUnicodeObject *self, PyObject *args)
3581{
3582 char *encoding = NULL;
3583 char *errors = NULL;
3584 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3585 return NULL;
3586 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3587}
3588
3589static char expandtabs__doc__[] =
3590"S.expandtabs([tabsize]) -> unicode\n\
3591\n\
3592Return a copy of S where all tab characters are expanded using spaces.\n\
3593If tabsize is not given, a tab size of 8 characters is assumed.";
3594
3595static PyObject*
3596unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3597{
3598 Py_UNICODE *e;
3599 Py_UNICODE *p;
3600 Py_UNICODE *q;
3601 int i, j;
3602 PyUnicodeObject *u;
3603 int tabsize = 8;
3604
3605 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3606 return NULL;
3607
Thomas Wouters7e474022000-07-16 12:04:32 +00003608 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 i = j = 0;
3610 e = self->str + self->length;
3611 for (p = self->str; p < e; p++)
3612 if (*p == '\t') {
3613 if (tabsize > 0)
3614 j += tabsize - (j % tabsize);
3615 }
3616 else {
3617 j++;
3618 if (*p == '\n' || *p == '\r') {
3619 i += j;
3620 j = 0;
3621 }
3622 }
3623
3624 /* Second pass: create output string and fill it */
3625 u = _PyUnicode_New(i + j);
3626 if (!u)
3627 return NULL;
3628
3629 j = 0;
3630 q = u->str;
3631
3632 for (p = self->str; p < e; p++)
3633 if (*p == '\t') {
3634 if (tabsize > 0) {
3635 i = tabsize - (j % tabsize);
3636 j += i;
3637 while (i--)
3638 *q++ = ' ';
3639 }
3640 }
3641 else {
3642 j++;
3643 *q++ = *p;
3644 if (*p == '\n' || *p == '\r')
3645 j = 0;
3646 }
3647
3648 return (PyObject*) u;
3649}
3650
3651static char find__doc__[] =
3652"S.find(sub [,start [,end]]) -> int\n\
3653\n\
3654Return the lowest index in S where substring sub is found,\n\
3655such that sub is contained within s[start,end]. Optional\n\
3656arguments start and end are interpreted as in slice notation.\n\
3657\n\
3658Return -1 on failure.";
3659
3660static PyObject *
3661unicode_find(PyUnicodeObject *self, PyObject *args)
3662{
3663 PyUnicodeObject *substring;
3664 int start = 0;
3665 int end = INT_MAX;
3666 PyObject *result;
3667
Guido van Rossumb8872e62000-05-09 14:14:27 +00003668 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3669 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 return NULL;
3671 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3672 (PyObject *)substring);
3673 if (substring == NULL)
3674 return NULL;
3675
3676 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3677
3678 Py_DECREF(substring);
3679 return result;
3680}
3681
3682static PyObject *
3683unicode_getitem(PyUnicodeObject *self, int index)
3684{
3685 if (index < 0 || index >= self->length) {
3686 PyErr_SetString(PyExc_IndexError, "string index out of range");
3687 return NULL;
3688 }
3689
3690 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3691}
3692
3693static long
3694unicode_hash(PyUnicodeObject *self)
3695{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003696 /* Since Unicode objects compare equal to their ASCII string
3697 counterparts, they should use the individual character values
3698 as basis for their hash value. This is needed to assure that
3699 strings and Unicode objects behave in the same way as
3700 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701
Fredrik Lundhdde61642000-07-10 18:27:47 +00003702 register int len;
3703 register Py_UNICODE *p;
3704 register long x;
3705
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 if (self->hash != -1)
3707 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003708 len = PyUnicode_GET_SIZE(self);
3709 p = PyUnicode_AS_UNICODE(self);
3710 x = *p << 7;
3711 while (--len >= 0)
3712 x = (1000003*x) ^ *p++;
3713 x ^= PyUnicode_GET_SIZE(self);
3714 if (x == -1)
3715 x = -2;
3716 self->hash = x;
3717 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718}
3719
3720static char index__doc__[] =
3721"S.index(sub [,start [,end]]) -> int\n\
3722\n\
3723Like S.find() but raise ValueError when the substring is not found.";
3724
3725static PyObject *
3726unicode_index(PyUnicodeObject *self, PyObject *args)
3727{
3728 int result;
3729 PyUnicodeObject *substring;
3730 int start = 0;
3731 int end = INT_MAX;
3732
Guido van Rossumb8872e62000-05-09 14:14:27 +00003733 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3734 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 return NULL;
3736
3737 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3738 (PyObject *)substring);
3739 if (substring == NULL)
3740 return NULL;
3741
3742 result = findstring(self, substring, start, end, 1);
3743
3744 Py_DECREF(substring);
3745 if (result < 0) {
3746 PyErr_SetString(PyExc_ValueError, "substring not found");
3747 return NULL;
3748 }
3749 return PyInt_FromLong(result);
3750}
3751
3752static char islower__doc__[] =
3753"S.islower() -> int\n\
3754\n\
3755Return 1 if all cased characters in S are lowercase and there is\n\
3756at least one cased character in S, 0 otherwise.";
3757
3758static PyObject*
3759unicode_islower(PyUnicodeObject *self, PyObject *args)
3760{
3761 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3762 register const Py_UNICODE *e;
3763 int cased;
3764
3765 if (!PyArg_NoArgs(args))
3766 return NULL;
3767
3768 /* Shortcut for single character strings */
3769 if (PyUnicode_GET_SIZE(self) == 1)
3770 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3771
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003772 /* Special case for empty strings */
3773 if (PyString_GET_SIZE(self) == 0)
3774 return PyInt_FromLong(0);
3775
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 e = p + PyUnicode_GET_SIZE(self);
3777 cased = 0;
3778 for (; p < e; p++) {
3779 register const Py_UNICODE ch = *p;
3780
3781 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3782 return PyInt_FromLong(0);
3783 else if (!cased && Py_UNICODE_ISLOWER(ch))
3784 cased = 1;
3785 }
3786 return PyInt_FromLong(cased);
3787}
3788
3789static char isupper__doc__[] =
3790"S.isupper() -> int\n\
3791\n\
3792Return 1 if all cased characters in S are uppercase and there is\n\
3793at least one cased character in S, 0 otherwise.";
3794
3795static PyObject*
3796unicode_isupper(PyUnicodeObject *self, PyObject *args)
3797{
3798 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3799 register const Py_UNICODE *e;
3800 int cased;
3801
3802 if (!PyArg_NoArgs(args))
3803 return NULL;
3804
3805 /* Shortcut for single character strings */
3806 if (PyUnicode_GET_SIZE(self) == 1)
3807 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3808
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003809 /* Special case for empty strings */
3810 if (PyString_GET_SIZE(self) == 0)
3811 return PyInt_FromLong(0);
3812
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 e = p + PyUnicode_GET_SIZE(self);
3814 cased = 0;
3815 for (; p < e; p++) {
3816 register const Py_UNICODE ch = *p;
3817
3818 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3819 return PyInt_FromLong(0);
3820 else if (!cased && Py_UNICODE_ISUPPER(ch))
3821 cased = 1;
3822 }
3823 return PyInt_FromLong(cased);
3824}
3825
3826static char istitle__doc__[] =
3827"S.istitle() -> int\n\
3828\n\
3829Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3830may only follow uncased characters and lowercase characters only cased\n\
3831ones. Return 0 otherwise.";
3832
3833static PyObject*
3834unicode_istitle(PyUnicodeObject *self, PyObject *args)
3835{
3836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3837 register const Py_UNICODE *e;
3838 int cased, previous_is_cased;
3839
3840 if (!PyArg_NoArgs(args))
3841 return NULL;
3842
3843 /* Shortcut for single character strings */
3844 if (PyUnicode_GET_SIZE(self) == 1)
3845 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3846 (Py_UNICODE_ISUPPER(*p) != 0));
3847
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003848 /* Special case for empty strings */
3849 if (PyString_GET_SIZE(self) == 0)
3850 return PyInt_FromLong(0);
3851
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 e = p + PyUnicode_GET_SIZE(self);
3853 cased = 0;
3854 previous_is_cased = 0;
3855 for (; p < e; p++) {
3856 register const Py_UNICODE ch = *p;
3857
3858 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3859 if (previous_is_cased)
3860 return PyInt_FromLong(0);
3861 previous_is_cased = 1;
3862 cased = 1;
3863 }
3864 else if (Py_UNICODE_ISLOWER(ch)) {
3865 if (!previous_is_cased)
3866 return PyInt_FromLong(0);
3867 previous_is_cased = 1;
3868 cased = 1;
3869 }
3870 else
3871 previous_is_cased = 0;
3872 }
3873 return PyInt_FromLong(cased);
3874}
3875
3876static char isspace__doc__[] =
3877"S.isspace() -> int\n\
3878\n\
3879Return 1 if there are only whitespace characters in S,\n\
38800 otherwise.";
3881
3882static PyObject*
3883unicode_isspace(PyUnicodeObject *self, PyObject *args)
3884{
3885 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3886 register const Py_UNICODE *e;
3887
3888 if (!PyArg_NoArgs(args))
3889 return NULL;
3890
3891 /* Shortcut for single character strings */
3892 if (PyUnicode_GET_SIZE(self) == 1 &&
3893 Py_UNICODE_ISSPACE(*p))
3894 return PyInt_FromLong(1);
3895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003896 /* Special case for empty strings */
3897 if (PyString_GET_SIZE(self) == 0)
3898 return PyInt_FromLong(0);
3899
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 e = p + PyUnicode_GET_SIZE(self);
3901 for (; p < e; p++) {
3902 if (!Py_UNICODE_ISSPACE(*p))
3903 return PyInt_FromLong(0);
3904 }
3905 return PyInt_FromLong(1);
3906}
3907
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003908static char isalpha__doc__[] =
3909"S.isalpha() -> int\n\
3910\n\
3911Return 1 if all characters in S are alphabetic\n\
3912and there is at least one character in S, 0 otherwise.";
3913
3914static PyObject*
3915unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3916{
3917 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3918 register const Py_UNICODE *e;
3919
3920 if (!PyArg_NoArgs(args))
3921 return NULL;
3922
3923 /* Shortcut for single character strings */
3924 if (PyUnicode_GET_SIZE(self) == 1 &&
3925 Py_UNICODE_ISALPHA(*p))
3926 return PyInt_FromLong(1);
3927
3928 /* Special case for empty strings */
3929 if (PyString_GET_SIZE(self) == 0)
3930 return PyInt_FromLong(0);
3931
3932 e = p + PyUnicode_GET_SIZE(self);
3933 for (; p < e; p++) {
3934 if (!Py_UNICODE_ISALPHA(*p))
3935 return PyInt_FromLong(0);
3936 }
3937 return PyInt_FromLong(1);
3938}
3939
3940static char isalnum__doc__[] =
3941"S.isalnum() -> int\n\
3942\n\
3943Return 1 if all characters in S are alphanumeric\n\
3944and there is at least one character in S, 0 otherwise.";
3945
3946static PyObject*
3947unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3948{
3949 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3950 register const Py_UNICODE *e;
3951
3952 if (!PyArg_NoArgs(args))
3953 return NULL;
3954
3955 /* Shortcut for single character strings */
3956 if (PyUnicode_GET_SIZE(self) == 1 &&
3957 Py_UNICODE_ISALNUM(*p))
3958 return PyInt_FromLong(1);
3959
3960 /* Special case for empty strings */
3961 if (PyString_GET_SIZE(self) == 0)
3962 return PyInt_FromLong(0);
3963
3964 e = p + PyUnicode_GET_SIZE(self);
3965 for (; p < e; p++) {
3966 if (!Py_UNICODE_ISALNUM(*p))
3967 return PyInt_FromLong(0);
3968 }
3969 return PyInt_FromLong(1);
3970}
3971
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972static char isdecimal__doc__[] =
3973"S.isdecimal() -> int\n\
3974\n\
3975Return 1 if there are only decimal characters in S,\n\
39760 otherwise.";
3977
3978static PyObject*
3979unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3980{
3981 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3982 register const Py_UNICODE *e;
3983
3984 if (!PyArg_NoArgs(args))
3985 return NULL;
3986
3987 /* Shortcut for single character strings */
3988 if (PyUnicode_GET_SIZE(self) == 1 &&
3989 Py_UNICODE_ISDECIMAL(*p))
3990 return PyInt_FromLong(1);
3991
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003992 /* Special case for empty strings */
3993 if (PyString_GET_SIZE(self) == 0)
3994 return PyInt_FromLong(0);
3995
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 e = p + PyUnicode_GET_SIZE(self);
3997 for (; p < e; p++) {
3998 if (!Py_UNICODE_ISDECIMAL(*p))
3999 return PyInt_FromLong(0);
4000 }
4001 return PyInt_FromLong(1);
4002}
4003
4004static char isdigit__doc__[] =
4005"S.isdigit() -> int\n\
4006\n\
4007Return 1 if there are only digit characters in S,\n\
40080 otherwise.";
4009
4010static PyObject*
4011unicode_isdigit(PyUnicodeObject *self, PyObject *args)
4012{
4013 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4014 register const Py_UNICODE *e;
4015
4016 if (!PyArg_NoArgs(args))
4017 return NULL;
4018
4019 /* Shortcut for single character strings */
4020 if (PyUnicode_GET_SIZE(self) == 1 &&
4021 Py_UNICODE_ISDIGIT(*p))
4022 return PyInt_FromLong(1);
4023
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004024 /* Special case for empty strings */
4025 if (PyString_GET_SIZE(self) == 0)
4026 return PyInt_FromLong(0);
4027
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 e = p + PyUnicode_GET_SIZE(self);
4029 for (; p < e; p++) {
4030 if (!Py_UNICODE_ISDIGIT(*p))
4031 return PyInt_FromLong(0);
4032 }
4033 return PyInt_FromLong(1);
4034}
4035
4036static char isnumeric__doc__[] =
4037"S.isnumeric() -> int\n\
4038\n\
4039Return 1 if there are only numeric characters in S,\n\
40400 otherwise.";
4041
4042static PyObject*
4043unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4044{
4045 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4046 register const Py_UNICODE *e;
4047
4048 if (!PyArg_NoArgs(args))
4049 return NULL;
4050
4051 /* Shortcut for single character strings */
4052 if (PyUnicode_GET_SIZE(self) == 1 &&
4053 Py_UNICODE_ISNUMERIC(*p))
4054 return PyInt_FromLong(1);
4055
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004056 /* Special case for empty strings */
4057 if (PyString_GET_SIZE(self) == 0)
4058 return PyInt_FromLong(0);
4059
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 e = p + PyUnicode_GET_SIZE(self);
4061 for (; p < e; p++) {
4062 if (!Py_UNICODE_ISNUMERIC(*p))
4063 return PyInt_FromLong(0);
4064 }
4065 return PyInt_FromLong(1);
4066}
4067
4068static char join__doc__[] =
4069"S.join(sequence) -> unicode\n\
4070\n\
4071Return a string which is the concatenation of the strings in the\n\
4072sequence. The separator between elements is S.";
4073
4074static PyObject*
4075unicode_join(PyUnicodeObject *self, PyObject *args)
4076{
4077 PyObject *data;
4078 if (!PyArg_ParseTuple(args, "O:join", &data))
4079 return NULL;
4080
4081 return PyUnicode_Join((PyObject *)self, data);
4082}
4083
4084static int
4085unicode_length(PyUnicodeObject *self)
4086{
4087 return self->length;
4088}
4089
4090static char ljust__doc__[] =
4091"S.ljust(width) -> unicode\n\
4092\n\
4093Return S left justified in a Unicode string of length width. Padding is\n\
4094done using spaces.";
4095
4096static PyObject *
4097unicode_ljust(PyUnicodeObject *self, PyObject *args)
4098{
4099 int width;
4100 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4101 return NULL;
4102
4103 if (self->length >= width) {
4104 Py_INCREF(self);
4105 return (PyObject*) self;
4106 }
4107
4108 return (PyObject*) pad(self, 0, width - self->length, ' ');
4109}
4110
4111static char lower__doc__[] =
4112"S.lower() -> unicode\n\
4113\n\
4114Return a copy of the string S converted to lowercase.";
4115
4116static PyObject*
4117unicode_lower(PyUnicodeObject *self, PyObject *args)
4118{
4119 if (!PyArg_NoArgs(args))
4120 return NULL;
4121 return fixup(self, fixlower);
4122}
4123
4124static char lstrip__doc__[] =
4125"S.lstrip() -> unicode\n\
4126\n\
4127Return a copy of the string S with leading whitespace removed.";
4128
4129static PyObject *
4130unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4131{
4132 if (!PyArg_NoArgs(args))
4133 return NULL;
4134 return strip(self, 1, 0);
4135}
4136
4137static PyObject*
4138unicode_repeat(PyUnicodeObject *str, int len)
4139{
4140 PyUnicodeObject *u;
4141 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004142 int nchars;
4143 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144
4145 if (len < 0)
4146 len = 0;
4147
4148 if (len == 1) {
4149 /* no repeat, return original string */
4150 Py_INCREF(str);
4151 return (PyObject*) str;
4152 }
Tim Peters8f422462000-09-09 06:13:41 +00004153
4154 /* ensure # of chars needed doesn't overflow int and # of bytes
4155 * needed doesn't overflow size_t
4156 */
4157 nchars = len * str->length;
4158 if (len && nchars / len != str->length) {
4159 PyErr_SetString(PyExc_OverflowError,
4160 "repeated string is too long");
4161 return NULL;
4162 }
4163 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4164 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4165 PyErr_SetString(PyExc_OverflowError,
4166 "repeated string is too long");
4167 return NULL;
4168 }
4169 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 if (!u)
4171 return NULL;
4172
4173 p = u->str;
4174
4175 while (len-- > 0) {
4176 Py_UNICODE_COPY(p, str->str, str->length);
4177 p += str->length;
4178 }
4179
4180 return (PyObject*) u;
4181}
4182
4183PyObject *PyUnicode_Replace(PyObject *obj,
4184 PyObject *subobj,
4185 PyObject *replobj,
4186 int maxcount)
4187{
4188 PyObject *self;
4189 PyObject *str1;
4190 PyObject *str2;
4191 PyObject *result;
4192
4193 self = PyUnicode_FromObject(obj);
4194 if (self == NULL)
4195 return NULL;
4196 str1 = PyUnicode_FromObject(subobj);
4197 if (str1 == NULL) {
4198 Py_DECREF(self);
4199 return NULL;
4200 }
4201 str2 = PyUnicode_FromObject(replobj);
4202 if (str2 == NULL) {
4203 Py_DECREF(self);
4204 Py_DECREF(str1);
4205 return NULL;
4206 }
4207 result = replace((PyUnicodeObject *)self,
4208 (PyUnicodeObject *)str1,
4209 (PyUnicodeObject *)str2,
4210 maxcount);
4211 Py_DECREF(self);
4212 Py_DECREF(str1);
4213 Py_DECREF(str2);
4214 return result;
4215}
4216
4217static char replace__doc__[] =
4218"S.replace (old, new[, maxsplit]) -> unicode\n\
4219\n\
4220Return a copy of S with all occurrences of substring\n\
4221old replaced by new. If the optional argument maxsplit is\n\
4222given, only the first maxsplit occurrences are replaced.";
4223
4224static PyObject*
4225unicode_replace(PyUnicodeObject *self, PyObject *args)
4226{
4227 PyUnicodeObject *str1;
4228 PyUnicodeObject *str2;
4229 int maxcount = -1;
4230 PyObject *result;
4231
4232 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4233 return NULL;
4234 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4235 if (str1 == NULL)
4236 return NULL;
4237 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4238 if (str2 == NULL)
4239 return NULL;
4240
4241 result = replace(self, str1, str2, maxcount);
4242
4243 Py_DECREF(str1);
4244 Py_DECREF(str2);
4245 return result;
4246}
4247
4248static
4249PyObject *unicode_repr(PyObject *unicode)
4250{
4251 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4252 PyUnicode_GET_SIZE(unicode),
4253 1);
4254}
4255
4256static char rfind__doc__[] =
4257"S.rfind(sub [,start [,end]]) -> int\n\
4258\n\
4259Return the highest index in S where substring sub is found,\n\
4260such that sub is contained within s[start,end]. Optional\n\
4261arguments start and end are interpreted as in slice notation.\n\
4262\n\
4263Return -1 on failure.";
4264
4265static PyObject *
4266unicode_rfind(PyUnicodeObject *self, PyObject *args)
4267{
4268 PyUnicodeObject *substring;
4269 int start = 0;
4270 int end = INT_MAX;
4271 PyObject *result;
4272
Guido van Rossumb8872e62000-05-09 14:14:27 +00004273 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4274 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275 return NULL;
4276 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4277 (PyObject *)substring);
4278 if (substring == NULL)
4279 return NULL;
4280
4281 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4282
4283 Py_DECREF(substring);
4284 return result;
4285}
4286
4287static char rindex__doc__[] =
4288"S.rindex(sub [,start [,end]]) -> int\n\
4289\n\
4290Like S.rfind() but raise ValueError when the substring is not found.";
4291
4292static PyObject *
4293unicode_rindex(PyUnicodeObject *self, PyObject *args)
4294{
4295 int result;
4296 PyUnicodeObject *substring;
4297 int start = 0;
4298 int end = INT_MAX;
4299
Guido van Rossumb8872e62000-05-09 14:14:27 +00004300 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4301 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302 return NULL;
4303 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4304 (PyObject *)substring);
4305 if (substring == NULL)
4306 return NULL;
4307
4308 result = findstring(self, substring, start, end, -1);
4309
4310 Py_DECREF(substring);
4311 if (result < 0) {
4312 PyErr_SetString(PyExc_ValueError, "substring not found");
4313 return NULL;
4314 }
4315 return PyInt_FromLong(result);
4316}
4317
4318static char rjust__doc__[] =
4319"S.rjust(width) -> unicode\n\
4320\n\
4321Return S right justified in a Unicode string of length width. Padding is\n\
4322done using spaces.";
4323
4324static PyObject *
4325unicode_rjust(PyUnicodeObject *self, PyObject *args)
4326{
4327 int width;
4328 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4329 return NULL;
4330
4331 if (self->length >= width) {
4332 Py_INCREF(self);
4333 return (PyObject*) self;
4334 }
4335
4336 return (PyObject*) pad(self, width - self->length, 0, ' ');
4337}
4338
4339static char rstrip__doc__[] =
4340"S.rstrip() -> unicode\n\
4341\n\
4342Return a copy of the string S with trailing whitespace removed.";
4343
4344static PyObject *
4345unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4346{
4347 if (!PyArg_NoArgs(args))
4348 return NULL;
4349 return strip(self, 0, 1);
4350}
4351
4352static PyObject*
4353unicode_slice(PyUnicodeObject *self, int start, int end)
4354{
4355 /* standard clamping */
4356 if (start < 0)
4357 start = 0;
4358 if (end < 0)
4359 end = 0;
4360 if (end > self->length)
4361 end = self->length;
4362 if (start == 0 && end == self->length) {
4363 /* full slice, return original string */
4364 Py_INCREF(self);
4365 return (PyObject*) self;
4366 }
4367 if (start > end)
4368 start = end;
4369 /* copy slice */
4370 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4371 end - start);
4372}
4373
4374PyObject *PyUnicode_Split(PyObject *s,
4375 PyObject *sep,
4376 int maxsplit)
4377{
4378 PyObject *result;
4379
4380 s = PyUnicode_FromObject(s);
4381 if (s == NULL)
4382 return NULL;
4383 if (sep != NULL) {
4384 sep = PyUnicode_FromObject(sep);
4385 if (sep == NULL) {
4386 Py_DECREF(s);
4387 return NULL;
4388 }
4389 }
4390
4391 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4392
4393 Py_DECREF(s);
4394 Py_XDECREF(sep);
4395 return result;
4396}
4397
4398static char split__doc__[] =
4399"S.split([sep [,maxsplit]]) -> list of strings\n\
4400\n\
4401Return a list of the words in S, using sep as the\n\
4402delimiter string. If maxsplit is given, at most maxsplit\n\
4403splits are done. If sep is not specified, any whitespace string\n\
4404is a separator.";
4405
4406static PyObject*
4407unicode_split(PyUnicodeObject *self, PyObject *args)
4408{
4409 PyObject *substring = Py_None;
4410 int maxcount = -1;
4411
4412 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4413 return NULL;
4414
4415 if (substring == Py_None)
4416 return split(self, NULL, maxcount);
4417 else if (PyUnicode_Check(substring))
4418 return split(self, (PyUnicodeObject *)substring, maxcount);
4419 else
4420 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4421}
4422
4423static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004424"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425\n\
4426Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004427Line breaks are not included in the resulting list unless keepends\n\
4428is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429
4430static PyObject*
4431unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4432{
Guido van Rossum86662912000-04-11 15:38:46 +00004433 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434
Guido van Rossum86662912000-04-11 15:38:46 +00004435 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 return NULL;
4437
Guido van Rossum86662912000-04-11 15:38:46 +00004438 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439}
4440
4441static
4442PyObject *unicode_str(PyUnicodeObject *self)
4443{
Fred Drakee4315f52000-05-09 19:53:39 +00004444 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445}
4446
4447static char strip__doc__[] =
4448"S.strip() -> unicode\n\
4449\n\
4450Return a copy of S with leading and trailing whitespace removed.";
4451
4452static PyObject *
4453unicode_strip(PyUnicodeObject *self, PyObject *args)
4454{
4455 if (!PyArg_NoArgs(args))
4456 return NULL;
4457 return strip(self, 1, 1);
4458}
4459
4460static char swapcase__doc__[] =
4461"S.swapcase() -> unicode\n\
4462\n\
4463Return a copy of S with uppercase characters converted to lowercase\n\
4464and vice versa.";
4465
4466static PyObject*
4467unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4468{
4469 if (!PyArg_NoArgs(args))
4470 return NULL;
4471 return fixup(self, fixswapcase);
4472}
4473
4474static char translate__doc__[] =
4475"S.translate(table) -> unicode\n\
4476\n\
4477Return a copy of the string S, where all characters have been mapped\n\
4478through the given translation table, which must be a mapping of\n\
4479Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4480are left untouched. Characters mapped to None are deleted.";
4481
4482static PyObject*
4483unicode_translate(PyUnicodeObject *self, PyObject *args)
4484{
4485 PyObject *table;
4486
4487 if (!PyArg_ParseTuple(args, "O:translate", &table))
4488 return NULL;
4489 return PyUnicode_TranslateCharmap(self->str,
4490 self->length,
4491 table,
4492 "ignore");
4493}
4494
4495static char upper__doc__[] =
4496"S.upper() -> unicode\n\
4497\n\
4498Return a copy of S converted to uppercase.";
4499
4500static PyObject*
4501unicode_upper(PyUnicodeObject *self, PyObject *args)
4502{
4503 if (!PyArg_NoArgs(args))
4504 return NULL;
4505 return fixup(self, fixupper);
4506}
4507
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method handler for S.zfill(width) (currently compiled out).

   When S already has at least width characters, S itself is returned
   with a new reference.  Otherwise S is left-padded with '0' via pad();
   if the original first character was '+' or '-', it is moved in front
   of the padding.  Returns NULL when parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() can fail; do not touch u->str in that case */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is where the original first character ended up */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4543
#if 0
/* Debugging helper (compiled out): takes no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4553
4554static char startswith__doc__[] =
4555"S.startswith(prefix[, start[, end]]) -> int\n\
4556\n\
4557Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4558optional start, test S beginning at that position. With optional end, stop\n\
4559comparing S at that position.";
4560
4561static PyObject *
4562unicode_startswith(PyUnicodeObject *self,
4563 PyObject *args)
4564{
4565 PyUnicodeObject *substring;
4566 int start = 0;
4567 int end = INT_MAX;
4568 PyObject *result;
4569
Guido van Rossumb8872e62000-05-09 14:14:27 +00004570 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4571 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 return NULL;
4573 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4574 (PyObject *)substring);
4575 if (substring == NULL)
4576 return NULL;
4577
4578 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4579
4580 Py_DECREF(substring);
4581 return result;
4582}
4583
4584
4585static char endswith__doc__[] =
4586"S.endswith(suffix[, start[, end]]) -> int\n\
4587\n\
4588Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4589optional start, test S beginning at that position. With optional end, stop\n\
4590comparing S at that position.";
4591
4592static PyObject *
4593unicode_endswith(PyUnicodeObject *self,
4594 PyObject *args)
4595{
4596 PyUnicodeObject *substring;
4597 int start = 0;
4598 int end = INT_MAX;
4599 PyObject *result;
4600
Guido van Rossumb8872e62000-05-09 14:14:27 +00004601 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4602 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 return NULL;
4604 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4605 (PyObject *)substring);
4606 if (substring == NULL)
4607 return NULL;
4608
4609 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4610
4611 Py_DECREF(substring);
4612 return result;
4613}
4614
4615
/* Method table for Unicode objects; unicode_getattr() passes it to
   Py_FindMethod().

   Each entry is {name, handler, flags, docstring}.  For the handlers
   defined near this table, flags is 1 where the handler parses its
   arguments with PyArg_ParseTuple() and 0 where it uses PyArg_NoArgs()
   (NOTE(review): presumably the new-style vs. old-style calling
   convention flag -- confirm against the PyMethodDef definition).
   The {NULL, NULL} entry terminates the table. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
    /* Disabled entries; unicode_zfill itself is also compiled out. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4668
4669static PyObject *
4670unicode_getattr(PyUnicodeObject *self, char *name)
4671{
4672 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4673}
4674
/* Sequence protocol slots for Unicode objects.  The two assignment
   slots are 0: no item or slice assignment handler is installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4685
4686static int
4687unicode_buffer_getreadbuf(PyUnicodeObject *self,
4688 int index,
4689 const void **ptr)
4690{
4691 if (index != 0) {
4692 PyErr_SetString(PyExc_SystemError,
4693 "accessing non-existent unicode segment");
4694 return -1;
4695 }
4696 *ptr = (void *) self->str;
4697 return PyUnicode_GET_DATA_SIZE(self);
4698}
4699
4700static int
4701unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4702 const void **ptr)
4703{
4704 PyErr_SetString(PyExc_TypeError,
4705 "cannot use unicode as modifyable buffer");
4706 return -1;
4707}
4708
4709static int
4710unicode_buffer_getsegcount(PyUnicodeObject *self,
4711 int *lenp)
4712{
4713 if (lenp)
4714 *lenp = PyUnicode_GET_DATA_SIZE(self);
4715 return 1;
4716}
4717
4718static int
4719unicode_buffer_getcharbuf(PyUnicodeObject *self,
4720 int index,
4721 const void **ptr)
4722{
4723 PyObject *str;
4724
4725 if (index != 0) {
4726 PyErr_SetString(PyExc_SystemError,
4727 "accessing non-existent unicode segment");
4728 return -1;
4729 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004730 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 if (str == NULL)
4732 return -1;
4733 *ptr = (void *) PyString_AS_STRING(str);
4734 return PyString_GET_SIZE(str);
4735}
4736
4737/* Helpers for PyUnicode_Format() */
4738
4739static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004740getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741{
4742 int argidx = *p_argidx;
4743 if (argidx < arglen) {
4744 (*p_argidx)++;
4745 if (arglen < 0)
4746 return args;
4747 else
4748 return PyTuple_GetItem(args, argidx);
4749 }
4750 PyErr_SetString(PyExc_TypeError,
4751 "not enough arguments for format string");
4752 return NULL;
4753}
4754
/* Format-specifier flag bits, one per flag character recognised by
   PyUnicode_Format(); they are OR-ed together into `flags`. */
#define F_LJUST (1<<0)  /* '-' */
#define F_SIGN (1<<1)   /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT (1<<3)    /* '#' */
#define F_ZERO (1<<4)   /* '0' */
4760
/* sprintf() variant that produces Py_UNICODE output.

   The text is first written by vsprintf() as plain chars into the start
   of `buffer`, then each char is widened into its Py_UNICODE slot.
   Returns whatever vsprintf() returned.

   No length is passed in, so nothing here bounds the output; callers
   must guarantee that `buffer` can hold the formatted text.  Only the
   `len` formatted characters are widened -- the loop does not write a
   Py_UNICODE terminator. */
static
int usprintf(register Py_UNICODE *buffer, char *format, ...)
{
    register int i;
    int len;
    va_list va;
    char *charbuffer;
    va_start(va, format);

    /* First, format the string as char array, then expand to Py_UNICODE
       array. */
    charbuffer = (char *)buffer;
    len = vsprintf(charbuffer, format, va);
    /* Widen from the last character down to the first: both views share
       the same memory, and slot i of the wide array lies at or beyond
       byte i, so going backwards never overwrites a char that has not
       been read yet.  A negative len skips the loop entirely. */
    for (i = len - 1; i >= 0; i--)
        buffer[i] = (Py_UNICODE) charbuffer[i];

    va_end(va);
    return len;
}
4780
4781static int
4782formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004783 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 int flags,
4785 int prec,
4786 int type,
4787 PyObject *v)
4788{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004789 /* fmt = '%#.' + `prec` + `type`
4790 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 char fmt[20];
4792 double x;
4793
4794 x = PyFloat_AsDouble(v);
4795 if (x == -1.0 && PyErr_Occurred())
4796 return -1;
4797 if (prec < 0)
4798 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4800 type = 'g';
4801 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004802 /* worst case length calc to ensure no buffer overrun:
4803 fmt = %#.<prec>g
4804 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4805 for any double rep.)
4806 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4807 If prec=0 the effective precision is 1 (the leading digit is
4808 always given), therefore increase by one to 10+prec. */
4809 if (buflen <= (size_t)10 + (size_t)prec) {
4810 PyErr_SetString(PyExc_OverflowError,
4811 "formatted float is too long (precision too long?)");
4812 return -1;
4813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 return usprintf(buf, fmt, x);
4815}
4816
Tim Peters38fd5b62000-09-21 05:43:11 +00004817static PyObject*
4818formatlong(PyObject *val, int flags, int prec, int type)
4819{
4820 char *buf;
4821 int i, len;
4822 PyObject *str; /* temporary string object. */
4823 PyUnicodeObject *result;
4824
4825 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4826 if (!str)
4827 return NULL;
4828 result = _PyUnicode_New(len);
4829 for (i = 0; i < len; i++)
4830 result->str[i] = buf[i];
4831 result->str[len] = 0;
4832 Py_DECREF(str);
4833 return (PyObject*)result;
4834}
4835
/* Format the integer value of v into buf according to flags/prec/type.

   buf     - output buffer of buflen Py_UNICODE elements
   flags   - F_* bits; only F_ALT is consulted here
   prec    - precision, a negative value selects the default of 1
   type    - conversion character placed into the C format string
   v       - object converted with PyInt_AsLong()

   Returns the number of characters written (usprintf()'s result), or -1
   with an exception set when the conversion of v fails or the buffer
   size check below rejects the request. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
       worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
       + 1 + 1 = 24*/
    char fmt[64]; /* plenty big enough! */
    long x;
    /* cleared when the format string below supplies its own 0x/0X */
    int use_native_c_format = 1;

    x = PyInt_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    if (prec < 0)
        prec = 1;
    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
       worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
    if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
            "formatted integer is too long (precision too long?)");
        return -1;
    }
    /* When converting 0 under %#x or %#X, C leaves off the base marker,
     * but we want it (for consistency with other %#x conversions, and
     * for consistency with Python's hex() function).
     * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
     * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
     * So add it only if the platform doesn't already.
     */
    if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
        /* Only way to know what the platform does is to try it. */
        /* fmt doubles as scratch space for the probe; it is overwritten
           with the real format string on every path below. */
        sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
        if (fmt[1] != (char)type) {
            /* Supply our own leading 0x/0X -- needed under std C */
            use_native_c_format = 0;
            sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
        }
    }
    if (use_native_c_format)
        sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
    return usprintf(buf, fmt, x);
}
4883
4884static int
4885formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004886 size_t buflen,
4887 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004889 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004890 if (PyUnicode_Check(v)) {
4891 if (PyUnicode_GET_SIZE(v) != 1)
4892 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004896 else if (PyString_Check(v)) {
4897 if (PyString_GET_SIZE(v) != 1)
4898 goto onError;
4899 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901
4902 else {
4903 /* Integer input truncated to a character */
4904 long x;
4905 x = PyInt_AsLong(v);
4906 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004907 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 buf[0] = (char) x;
4909 }
4910 buf[1] = '\0';
4911 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004912
4913 onError:
4914 PyErr_SetString(PyExc_TypeError,
4915 "%c requires int or char");
4916 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917}
4918
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004919/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4920
4921 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4922 chars are formatted. XXX This is a magic number. Each formatting
4923 routine does bounds checking to ensure no overflow, but a better
4924 solution may be to malloc a buffer of appropriate size for each
4925 format. For now, the current solution is sufficient.
4926*/
4927#define FORMATBUFLEN (size_t)120
4928
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929PyObject *PyUnicode_Format(PyObject *format,
4930 PyObject *args)
4931{
4932 Py_UNICODE *fmt, *res;
4933 int fmtcnt, rescnt, reslen, arglen, argidx;
4934 int args_owned = 0;
4935 PyUnicodeObject *result = NULL;
4936 PyObject *dict = NULL;
4937 PyObject *uformat;
4938
4939 if (format == NULL || args == NULL) {
4940 PyErr_BadInternalCall();
4941 return NULL;
4942 }
4943 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004944 if (uformat == NULL)
4945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 fmt = PyUnicode_AS_UNICODE(uformat);
4947 fmtcnt = PyUnicode_GET_SIZE(uformat);
4948
4949 reslen = rescnt = fmtcnt + 100;
4950 result = _PyUnicode_New(reslen);
4951 if (result == NULL)
4952 goto onError;
4953 res = PyUnicode_AS_UNICODE(result);
4954
4955 if (PyTuple_Check(args)) {
4956 arglen = PyTuple_Size(args);
4957 argidx = 0;
4958 }
4959 else {
4960 arglen = -1;
4961 argidx = -2;
4962 }
4963 if (args->ob_type->tp_as_mapping)
4964 dict = args;
4965
4966 while (--fmtcnt >= 0) {
4967 if (*fmt != '%') {
4968 if (--rescnt < 0) {
4969 rescnt = fmtcnt + 100;
4970 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004971 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 return NULL;
4973 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4974 --rescnt;
4975 }
4976 *res++ = *fmt++;
4977 }
4978 else {
4979 /* Got a format specifier */
4980 int flags = 0;
4981 int width = -1;
4982 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 Py_UNICODE c = '\0';
4984 Py_UNICODE fill;
4985 PyObject *v = NULL;
4986 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004987 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 Py_UNICODE sign;
4989 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004990 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991
4992 fmt++;
4993 if (*fmt == '(') {
4994 Py_UNICODE *keystart;
4995 int keylen;
4996 PyObject *key;
4997 int pcount = 1;
4998
4999 if (dict == NULL) {
5000 PyErr_SetString(PyExc_TypeError,
5001 "format requires a mapping");
5002 goto onError;
5003 }
5004 ++fmt;
5005 --fmtcnt;
5006 keystart = fmt;
5007 /* Skip over balanced parentheses */
5008 while (pcount > 0 && --fmtcnt >= 0) {
5009 if (*fmt == ')')
5010 --pcount;
5011 else if (*fmt == '(')
5012 ++pcount;
5013 fmt++;
5014 }
5015 keylen = fmt - keystart - 1;
5016 if (fmtcnt < 0 || pcount > 0) {
5017 PyErr_SetString(PyExc_ValueError,
5018 "incomplete format key");
5019 goto onError;
5020 }
Fred Drakee4315f52000-05-09 19:53:39 +00005021 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 then looked up since Python uses strings to hold
5023 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005024 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 key = PyUnicode_EncodeUTF8(keystart,
5026 keylen,
5027 NULL);
5028 if (key == NULL)
5029 goto onError;
5030 if (args_owned) {
5031 Py_DECREF(args);
5032 args_owned = 0;
5033 }
5034 args = PyObject_GetItem(dict, key);
5035 Py_DECREF(key);
5036 if (args == NULL) {
5037 goto onError;
5038 }
5039 args_owned = 1;
5040 arglen = -1;
5041 argidx = -2;
5042 }
5043 while (--fmtcnt >= 0) {
5044 switch (c = *fmt++) {
5045 case '-': flags |= F_LJUST; continue;
5046 case '+': flags |= F_SIGN; continue;
5047 case ' ': flags |= F_BLANK; continue;
5048 case '#': flags |= F_ALT; continue;
5049 case '0': flags |= F_ZERO; continue;
5050 }
5051 break;
5052 }
5053 if (c == '*') {
5054 v = getnextarg(args, arglen, &argidx);
5055 if (v == NULL)
5056 goto onError;
5057 if (!PyInt_Check(v)) {
5058 PyErr_SetString(PyExc_TypeError,
5059 "* wants int");
5060 goto onError;
5061 }
5062 width = PyInt_AsLong(v);
5063 if (width < 0) {
5064 flags |= F_LJUST;
5065 width = -width;
5066 }
5067 if (--fmtcnt >= 0)
5068 c = *fmt++;
5069 }
5070 else if (c >= '0' && c <= '9') {
5071 width = c - '0';
5072 while (--fmtcnt >= 0) {
5073 c = *fmt++;
5074 if (c < '0' || c > '9')
5075 break;
5076 if ((width*10) / 10 != width) {
5077 PyErr_SetString(PyExc_ValueError,
5078 "width too big");
5079 goto onError;
5080 }
5081 width = width*10 + (c - '0');
5082 }
5083 }
5084 if (c == '.') {
5085 prec = 0;
5086 if (--fmtcnt >= 0)
5087 c = *fmt++;
5088 if (c == '*') {
5089 v = getnextarg(args, arglen, &argidx);
5090 if (v == NULL)
5091 goto onError;
5092 if (!PyInt_Check(v)) {
5093 PyErr_SetString(PyExc_TypeError,
5094 "* wants int");
5095 goto onError;
5096 }
5097 prec = PyInt_AsLong(v);
5098 if (prec < 0)
5099 prec = 0;
5100 if (--fmtcnt >= 0)
5101 c = *fmt++;
5102 }
5103 else if (c >= '0' && c <= '9') {
5104 prec = c - '0';
5105 while (--fmtcnt >= 0) {
5106 c = Py_CHARMASK(*fmt++);
5107 if (c < '0' || c > '9')
5108 break;
5109 if ((prec*10) / 10 != prec) {
5110 PyErr_SetString(PyExc_ValueError,
5111 "prec too big");
5112 goto onError;
5113 }
5114 prec = prec*10 + (c - '0');
5115 }
5116 }
5117 } /* prec */
5118 if (fmtcnt >= 0) {
5119 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 if (--fmtcnt >= 0)
5121 c = *fmt++;
5122 }
5123 }
5124 if (fmtcnt < 0) {
5125 PyErr_SetString(PyExc_ValueError,
5126 "incomplete format");
5127 goto onError;
5128 }
5129 if (c != '%') {
5130 v = getnextarg(args, arglen, &argidx);
5131 if (v == NULL)
5132 goto onError;
5133 }
5134 sign = 0;
5135 fill = ' ';
5136 switch (c) {
5137
5138 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005139 pbuf = formatbuf;
5140 /* presume that buffer length is at least 1 */
5141 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 len = 1;
5143 break;
5144
5145 case 's':
5146 case 'r':
5147 if (PyUnicode_Check(v) && c == 's') {
5148 temp = v;
5149 Py_INCREF(temp);
5150 }
5151 else {
5152 PyObject *unicode;
5153 if (c == 's')
5154 temp = PyObject_Str(v);
5155 else
5156 temp = PyObject_Repr(v);
5157 if (temp == NULL)
5158 goto onError;
5159 if (!PyString_Check(temp)) {
5160 /* XXX Note: this should never happen, since
5161 PyObject_Repr() and PyObject_Str() assure
5162 this */
5163 Py_DECREF(temp);
5164 PyErr_SetString(PyExc_TypeError,
5165 "%s argument has non-string str()");
5166 goto onError;
5167 }
Fred Drakee4315f52000-05-09 19:53:39 +00005168 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005170 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 "strict");
5172 Py_DECREF(temp);
5173 temp = unicode;
5174 if (temp == NULL)
5175 goto onError;
5176 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005177 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 len = PyUnicode_GET_SIZE(temp);
5179 if (prec >= 0 && len > prec)
5180 len = prec;
5181 break;
5182
5183 case 'i':
5184 case 'd':
5185 case 'u':
5186 case 'o':
5187 case 'x':
5188 case 'X':
5189 if (c == 'i')
5190 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005191 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005192 temp = formatlong(v, flags, prec, c);
5193 if (!temp)
5194 goto onError;
5195 pbuf = PyUnicode_AS_UNICODE(temp);
5196 len = PyUnicode_GET_SIZE(temp);
5197 /* unbounded ints can always produce
5198 a sign character! */
5199 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005201 else {
5202 pbuf = formatbuf;
5203 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5204 flags, prec, c, v);
5205 if (len < 0)
5206 goto onError;
5207 /* only d conversion is signed */
5208 sign = c == 'd';
5209 }
5210 if (flags & F_ZERO)
5211 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 break;
5213
5214 case 'e':
5215 case 'E':
5216 case 'f':
5217 case 'g':
5218 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005219 pbuf = formatbuf;
5220 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5221 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 if (len < 0)
5223 goto onError;
5224 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005225 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 fill = '0';
5227 break;
5228
5229 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005230 pbuf = formatbuf;
5231 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 if (len < 0)
5233 goto onError;
5234 break;
5235
5236 default:
5237 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005238 "unsupported format character '%c' (0x%x) "
5239 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005240 (31<=c && c<=126) ? c : '?',
5241 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 goto onError;
5243 }
5244 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005245 if (*pbuf == '-' || *pbuf == '+') {
5246 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 len--;
5248 }
5249 else if (flags & F_SIGN)
5250 sign = '+';
5251 else if (flags & F_BLANK)
5252 sign = ' ';
5253 else
5254 sign = 0;
5255 }
5256 if (width < len)
5257 width = len;
5258 if (rescnt < width + (sign != 0)) {
5259 reslen -= rescnt;
5260 rescnt = width + fmtcnt + 100;
5261 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005262 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 return NULL;
5264 res = PyUnicode_AS_UNICODE(result)
5265 + reslen - rescnt;
5266 }
5267 if (sign) {
5268 if (fill != ' ')
5269 *res++ = sign;
5270 rescnt--;
5271 if (width > len)
5272 width--;
5273 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005274 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5275 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005276 assert(pbuf[1] == c);
5277 if (fill != ' ') {
5278 *res++ = *pbuf++;
5279 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005280 }
Tim Petersfff53252001-04-12 18:38:48 +00005281 rescnt -= 2;
5282 width -= 2;
5283 if (width < 0)
5284 width = 0;
5285 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 if (width > len && !(flags & F_LJUST)) {
5288 do {
5289 --rescnt;
5290 *res++ = fill;
5291 } while (--width > len);
5292 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005293 if (fill == ' ') {
5294 if (sign)
5295 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005296 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005297 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005298 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005299 *res++ = *pbuf++;
5300 *res++ = *pbuf++;
5301 }
5302 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005303 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 res += len;
5305 rescnt -= len;
5306 while (--width >= len) {
5307 --rescnt;
5308 *res++ = ' ';
5309 }
5310 if (dict && (argidx < arglen) && c != '%') {
5311 PyErr_SetString(PyExc_TypeError,
5312 "not all arguments converted");
5313 goto onError;
5314 }
5315 Py_XDECREF(temp);
5316 } /* '%' */
5317 } /* until end */
5318 if (argidx < arglen && !dict) {
5319 PyErr_SetString(PyExc_TypeError,
5320 "not all arguments converted");
5321 goto onError;
5322 }
5323
5324 if (args_owned) {
5325 Py_DECREF(args);
5326 }
5327 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005328 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005329 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 return (PyObject *)result;
5331
5332 onError:
5333 Py_XDECREF(result);
5334 Py_DECREF(uformat);
5335 if (args_owned) {
5336 Py_DECREF(args);
5337 }
5338 return NULL;
5339}
5340
5341static PyBufferProcs unicode_as_buffer = {
5342 (getreadbufferproc) unicode_buffer_getreadbuf,
5343 (getwritebufferproc) unicode_buffer_getwritebuf,
5344 (getsegcountproc) unicode_buffer_getsegcount,
5345 (getcharbufferproc) unicode_buffer_getcharbuf,
5346};
5347
5348PyTypeObject PyUnicode_Type = {
5349 PyObject_HEAD_INIT(&PyType_Type)
5350 0, /* ob_size */
5351 "unicode", /* tp_name */
5352 sizeof(PyUnicodeObject), /* tp_size */
5353 0, /* tp_itemsize */
5354 /* Slots */
5355 (destructor)_PyUnicode_Free, /* tp_dealloc */
5356 0, /* tp_print */
5357 (getattrfunc)unicode_getattr, /* tp_getattr */
5358 0, /* tp_setattr */
5359 (cmpfunc) unicode_compare, /* tp_compare */
5360 (reprfunc) unicode_repr, /* tp_repr */
5361 0, /* tp_as_number */
5362 &unicode_as_sequence, /* tp_as_sequence */
5363 0, /* tp_as_mapping */
5364 (hashfunc) unicode_hash, /* tp_hash*/
5365 0, /* tp_call*/
5366 (reprfunc) unicode_str, /* tp_str */
5367 (getattrofunc) NULL, /* tp_getattro */
5368 (setattrofunc) NULL, /* tp_setattro */
5369 &unicode_as_buffer, /* tp_as_buffer */
5370 Py_TPFLAGS_DEFAULT, /* tp_flags */
5371};
5372
5373/* Initialize the Unicode implementation */
5374
Thomas Wouters78890102000-07-22 19:25:51 +00005375void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005377 int i;
5378
Fred Drakee4315f52000-05-09 19:53:39 +00005379 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005380 unicode_freelist = NULL;
5381 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005383 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005384 for (i = 0; i < 256; i++)
5385 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386}
5387
5388/* Finalize the Unicode implementation */
5389
5390void
Thomas Wouters78890102000-07-22 19:25:51 +00005391_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005393 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005394 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005396 Py_XDECREF(unicode_empty);
5397 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005398
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005399 for (i = 0; i < 256; i++) {
5400 if (unicode_latin1[i]) {
5401 Py_DECREF(unicode_latin1[i]);
5402 unicode_latin1[i] = NULL;
5403 }
5404 }
5405
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005406 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 PyUnicodeObject *v = u;
5408 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005409 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005410 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005411 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005412 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005414 unicode_freelist = NULL;
5415 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416}