blob: 1319c7c52ad3ab13dc6659de5ee605e34f1ff36a [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list keep-alive optimization.

   Objects placed on the free list keep their character buffer
   allocated as long as their length is below this limit; longer
   buffers are released right away.  This reduces malloc() overhead
   for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless the build
   defines WORDS_BIGENDIAN */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
287/* Internal API for use in unicodeobject.c only ! */
288#define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 const char *s;
402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
419 }
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
425 }
426 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000428 v = obj;
429 if (encoding) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoding Unicode is not supported");
432 return NULL;
433 }
434 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else if (PyString_Check(obj)) {
437 s = PyString_AS_STRING(obj);
438 len = PyString_GET_SIZE(obj);
439 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000440 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000444 PyErr_Format(PyExc_TypeError,
445 "coercing to Unicode: need string or buffer, "
446 "%.80s found",
447 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000448 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450
451 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 if (len == 0) {
453 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000454 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 else
457 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000458
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000460 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463 return v;
464
465 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000466 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000468 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470}
471
472PyObject *PyUnicode_Decode(const char *s,
473 int size,
474 const char *encoding,
475 const char *errors)
476{
477 PyObject *buffer = NULL, *unicode;
478
Fred Drakee4315f52000-05-09 19:53:39 +0000479 if (encoding == NULL)
480 encoding = PyUnicode_GetDefaultEncoding();
481
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000485 else if (strcmp(encoding, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s, size, errors);
487 else if (strcmp(encoding, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489
490 /* Decode via the codec registry */
491 buffer = PyBuffer_FromMemory((void *)s, size);
492 if (buffer == NULL)
493 goto onError;
494 unicode = PyCodec_Decode(buffer, encoding, errors);
495 if (unicode == NULL)
496 goto onError;
497 if (!PyUnicode_Check(unicode)) {
498 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000499 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 unicode->ob_type->tp_name);
501 Py_DECREF(unicode);
502 goto onError;
503 }
504 Py_DECREF(buffer);
505 return unicode;
506
507 onError:
508 Py_XDECREF(buffer);
509 return NULL;
510}
511
512PyObject *PyUnicode_Encode(const Py_UNICODE *s,
513 int size,
514 const char *encoding,
515 const char *errors)
516{
517 PyObject *v, *unicode;
518
519 unicode = PyUnicode_FromUnicode(s, size);
520 if (unicode == NULL)
521 return NULL;
522 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
523 Py_DECREF(unicode);
524 return v;
525}
526
527PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v;
532
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
536 }
Fred Drakee4315f52000-05-09 19:53:39 +0000537
538 if (encoding == NULL)
539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (errors == NULL) {
543 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000544 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000545 else if (strcmp(encoding, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode);
547 else if (strcmp(encoding, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode);
549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550
551 /* Encode via the codec registry */
552 v = PyCodec_Encode(unicode, encoding, errors);
553 if (v == NULL)
554 goto onError;
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v)) {
557 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000558 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 v->ob_type->tp_name);
560 Py_DECREF(v);
561 goto onError;
562 }
563 return v;
564
565 onError:
566 return NULL;
567}
568
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000569PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
570 const char *errors)
571{
572 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
573
574 if (v)
575 return v;
576 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
577 if (v && errors == NULL)
578 ((PyUnicodeObject *)unicode)->defenc = v;
579 return v;
580}
581
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
583{
584 if (!PyUnicode_Check(unicode)) {
585 PyErr_BadArgument();
586 goto onError;
587 }
588 return PyUnicode_AS_UNICODE(unicode);
589
590 onError:
591 return NULL;
592}
593
594int PyUnicode_GetSize(PyObject *unicode)
595{
596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
599 }
600 return PyUnicode_GET_SIZE(unicode);
601
602 onError:
603 return -1;
604}
605
Thomas Wouters78890102000-07-22 19:25:51 +0000606const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000607{
608 return unicode_default_encoding;
609}
610
611int PyUnicode_SetDefaultEncoding(const char *encoding)
612{
613 PyObject *v;
614
615 /* Make sure the encoding is valid. As side effect, this also
616 loads the encoding into the codec registry cache. */
617 v = _PyCodec_Lookup(encoding);
618 if (v == NULL)
619 goto onError;
620 Py_DECREF(v);
621 strncpy(unicode_default_encoding,
622 encoding,
623 sizeof(unicode_default_encoding));
624 return 0;
625
626 onError:
627 return -1;
628}
629
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630/* --- UTF-8 Codec -------------------------------------------------------- */
631
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts; zero means illegal prefix. See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six byte
       sequences; 0xFE and 0xFF are not valid as prefix */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
653
654static
655int utf8_decoding_error(const char **source,
656 Py_UNICODE **dest,
657 const char *errors,
658 const char *details)
659{
660 if ((errors == NULL) ||
661 (strcmp(errors,"strict") == 0)) {
662 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 details);
665 return -1;
666 }
667 else if (strcmp(errors,"ignore") == 0) {
668 (*source)++;
669 return 0;
670 }
671 else if (strcmp(errors,"replace") == 0) {
672 (*source)++;
673 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
674 (*dest)++;
675 return 0;
676 }
677 else {
678 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000679 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000680 errors);
681 return -1;
682 }
683}
684
Guido van Rossumd57fd912000-03-10 22:53:23 +0000685PyObject *PyUnicode_DecodeUTF8(const char *s,
686 int size,
687 const char *errors)
688{
689 int n;
690 const char *e;
691 PyUnicodeObject *unicode;
692 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000693 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694
695 /* Note: size will always be longer than the resulting Unicode
696 character count */
697 unicode = _PyUnicode_New(size);
698 if (!unicode)
699 return NULL;
700 if (size == 0)
701 return (PyObject *)unicode;
702
703 /* Unpack UTF-8 encoded data */
704 p = unicode->str;
705 e = s + size;
706
707 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000708 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000711 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712 s++;
713 continue;
714 }
715
716 n = utf8_code_length[ch];
717
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 if (s + n > e) {
719 errmsg = "unexpected end of data";
720 goto utf8Error;
721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722
723 switch (n) {
724
725 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000726 errmsg = "unexpected code byte";
727 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728
729 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000730 errmsg = "internal error";
731 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732
733 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000734 if ((s[1] & 0xc0) != 0x80) {
735 errmsg = "invalid data";
736 goto utf8Error;
737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000738 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000739 if (ch < 0x80) {
740 errmsg = "illegal encoding";
741 goto utf8Error;
742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000743 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 break;
746
747 case 3:
748 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000749 (s[2] & 0xc0) != 0x80) {
750 errmsg = "invalid data";
751 goto utf8Error;
752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000754 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
755 errmsg = "illegal encoding";
756 goto utf8Error;
757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000759 *p++ = (Py_UNICODE)ch;
760 break;
761
762 case 4:
763 if ((s[1] & 0xc0) != 0x80 ||
764 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000765 (s[3] & 0xc0) != 0x80) {
766 errmsg = "invalid data";
767 goto utf8Error;
768 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000769 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
770 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
771 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000772 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000773 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000774 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000775 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000776 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000777 errmsg = "illegal encoding";
778 goto utf8Error;
779 }
Fredrik Lundh8f455852001-06-27 18:59:43 +0000780#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000781 *p++ = (Py_UNICODE)ch;
782#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000783 /* compute and append the two surrogates: */
784
785 /* translate from 10000..10FFFF to 0..FFFF */
786 ch -= 0x10000;
787
788 /* high surrogate = top 10 bits added to D800 */
789 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
790
791 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000792 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000793#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794 break;
795
796 default:
797 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000798 errmsg = "unsupported Unicode code range";
799 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000800 }
801 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000802 continue;
803
804 utf8Error:
805 if (utf8_decoding_error(&s, &p, errors, errmsg))
806 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807 }
808
809 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000810 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 goto onError;
812
813 return (PyObject *)unicode;
814
815onError:
816 Py_DECREF(unicode);
817 return NULL;
818}
819
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.

   Apply the error handling scheme `errors` to a UTF-8 encoding
   problem described by `details`: NULL or "strict" raise
   UnicodeError, "ignore" does nothing, "replace" appends a '?' to the
   output and unknown schemes raise ValueError.  Returns 0 if encoding
   may continue and -1 if an exception was set. */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000853
854PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
855 int size,
856 const char *errors)
857{
858 PyObject *v;
859 char *p;
860 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000861 Py_UCS4 ch2;
862 unsigned int cbAllocated = 3 * size;
863 unsigned int cbWritten = 0;
864 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000865
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000866 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867 if (v == NULL)
868 return NULL;
869 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000870 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871
872 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000873 while (i < size) {
874 Py_UCS4 ch = s[i++];
875 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000876 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000877 cbWritten++;
878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879 else if (ch < 0x0800) {
880 *p++ = 0xc0 | (ch >> 6);
881 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000882 cbWritten += 2;
883 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000884 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000885 /* Check for high surrogate */
886 if (0xD800 <= ch && ch <= 0xDBFF) {
887 if (i != size) {
888 ch2 = s[i];
889 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
890
891 if (cbWritten >= (cbAllocated - 4)) {
892 /* Provide enough room for some more
893 surrogates */
894 cbAllocated += 4*10;
895 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000896 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000897 }
898
899 /* combine the two values */
900 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
901
902 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000903 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000904 i++;
905 cbWritten += 4;
906 }
907 }
908 }
909 else {
910 *p++ = (char)(0xe0 | (ch >> 12));
911 cbWritten += 3;
912 }
913 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
914 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000915 } else {
916 *p++ = 0xf0 | (ch>>18);
917 *p++ = 0x80 | ((ch>>12) & 0x3f);
918 *p++ = 0x80 | ((ch>>6) & 0x3f);
919 *p++ = 0x80 | (ch & 0x3f);
920 cbWritten += 4;
921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000922 }
923 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000924 if (_PyString_Resize(&v, p - q))
925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926 return v;
927
928 onError:
929 Py_DECREF(v);
930 return NULL;
931}
932
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
934{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 if (!PyUnicode_Check(unicode)) {
936 PyErr_BadArgument();
937 return NULL;
938 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000939 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
940 PyUnicode_GET_SIZE(unicode),
941 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000942}
943
944/* --- UTF-16 Codec ------------------------------------------------------- */
945
946static
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000947int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948 Py_UNICODE **dest,
949 const char *errors,
950 const char *details)
951{
952 if ((errors == NULL) ||
953 (strcmp(errors,"strict") == 0)) {
954 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000955 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000956 details);
957 return -1;
958 }
959 else if (strcmp(errors,"ignore") == 0) {
960 return 0;
961 }
962 else if (strcmp(errors,"replace") == 0) {
963 if (dest) {
964 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
965 (*dest)++;
966 }
967 return 0;
968 }
969 else {
970 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000971 "UTF-16 decoding error; "
972 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000973 errors);
974 return -1;
975 }
976}
977
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978PyObject *PyUnicode_DecodeUTF16(const char *s,
979 int size,
980 const char *errors,
981 int *byteorder)
982{
983 PyUnicodeObject *unicode;
984 Py_UNICODE *p;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000985 const Py_UCS2 *q, *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000987 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000988
989 /* size should be an even number */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000990 if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
992 return NULL;
993 /* The remaining input chars are ignored if we fall through
994 here... */
995 }
996
997 /* Note: size will always be longer than the resulting Unicode
998 character count */
999 unicode = _PyUnicode_New(size);
1000 if (!unicode)
1001 return NULL;
1002 if (size == 0)
1003 return (PyObject *)unicode;
1004
1005 /* Unpack UTF-16 encoded data */
1006 p = unicode->str;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001007 q = (Py_UCS2 *)s;
1008 e = q + (size / sizeof(Py_UCS2));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009
1010 if (byteorder)
1011 bo = *byteorder;
1012
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001013 /* Check for BOM marks (U+FEFF) in the input and adjust current
1014 byte order setting accordingly. In native mode, the leading BOM
1015 mark is skipped, in all other modes, it is copied to the output
1016 stream as-is (giving a ZWNBSP character). */
1017 if (bo == 0) {
1018#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1019 if (*q == 0xFEFF) {
1020 q++;
1021 bo = -1;
1022 } else if (*q == 0xFFFE) {
1023 q++;
1024 bo = 1;
1025 }
1026#else
1027 if (*q == 0xFEFF) {
1028 q++;
1029 bo = 1;
1030 } else if (*q == 0xFFFE) {
1031 q++;
1032 bo = -1;
1033 }
1034#endif
1035 }
1036
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 while (q < e) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001038 register Py_UCS2 ch = *q++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001040 /* Swap input bytes if needed. (This assumes
1041 sizeof(Py_UNICODE) == 2 !) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (bo == 1)
1044 ch = (ch >> 8) | (ch << 8);
1045#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 if (bo == -1)
1047 ch = (ch >> 8) | (ch << 8);
1048#endif
1049 if (ch < 0xD800 || ch > 0xDFFF) {
1050 *p++ = ch;
1051 continue;
1052 }
1053
1054 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 if (q >= e) {
1056 errmsg = "unexpected end of data";
1057 goto utf16Error;
1058 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001059 if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001060 Py_UCS2 ch2 = *q++;
1061#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1062 if (bo == 1)
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001063 ch2 = (ch2 >> 8) | (ch2 << 8);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001064#else
1065 if (bo == -1)
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001066 ch2 = (ch2 >> 8) | (ch2 << 8);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001067#endif
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001068 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001069#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001070 *p++ = ch;
1071 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001072#else
1073 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001074#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001075 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001076 }
1077 else {
1078 errmsg = "illegal UTF-16 surrogate";
1079 goto utf16Error;
1080 }
1081
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001083 errmsg = "illegal encoding";
1084 /* Fall through to report the error */
1085
1086 utf16Error:
1087 if (utf16_decoding_error(&q, &p, errors, errmsg))
1088 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 }
1090
1091 if (byteorder)
1092 *byteorder = bo;
1093
1094 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001095 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 goto onError;
1097
1098 return (PyObject *)unicode;
1099
1100onError:
1101 Py_DECREF(unicode);
1102 return NULL;
1103}
1104
1105#undef UTF16_ERROR
1106
1107PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1108 int size,
1109 const char *errors,
1110 int byteorder)
1111{
1112 PyObject *v;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001113 Py_UCS2 *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 char *q;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001115 int i, pairs, doswap = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001117 for (i = pairs = 0; i < size; i++)
1118 if (s[i] >= 0x10000)
1119 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 v = PyString_FromStringAndSize(NULL,
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001121 sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 if (v == NULL)
1123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124
1125 q = PyString_AS_STRING(v);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001126 p = (Py_UCS2 *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 if (byteorder == 0)
1128 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001129 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001130 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131 if (byteorder == 0 ||
1132#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1133 byteorder == -1
1134#else
1135 byteorder == 1
1136#endif
1137 )
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001138 doswap = 0;
1139 while (size-- > 0) {
1140 Py_UNICODE ch = *s++;
1141 Py_UNICODE ch2 = 0;
1142 if (ch >= 0x10000) {
1143 ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
1144 ch = 0xD800|((ch-0x10000)>>10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001146 if (doswap){
1147 *p++ = (ch >> 8) | (ch << 8);
1148 if (ch2)
1149 *p++ = (ch2 >> 8) | (ch2 << 8);
1150 }else{
1151 *p++ = ch;
1152 if(ch2)
1153 *p++ = ch2;
1154 }
1155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 return v;
1157}
1158
1159PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1160{
1161 if (!PyUnicode_Check(unicode)) {
1162 PyErr_BadArgument();
1163 return NULL;
1164 }
1165 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1166 PyUnicode_GET_SIZE(unicode),
1167 NULL,
1168 0);
1169}
1170
1171/* --- Unicode Escape Codec ----------------------------------------------- */
1172
1173static
1174int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001175 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 const char *errors,
1177 const char *details)
1178{
1179 if ((errors == NULL) ||
1180 (strcmp(errors,"strict") == 0)) {
1181 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001182 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 details);
1184 return -1;
1185 }
1186 else if (strcmp(errors,"ignore") == 0) {
1187 return 0;
1188 }
1189 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001190 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 return 0;
1192 }
1193 else {
1194 PyErr_Format(PyExc_ValueError,
1195 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001196 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 errors);
1198 return -1;
1199 }
1200}
1201
Fredrik Lundh06d12682001-01-24 07:59:11 +00001202static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001203
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1205 int size,
1206 const char *errors)
1207{
1208 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001209 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001211 char* message;
1212 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214 /* Escaped strings will always be longer than the resulting
1215 Unicode string, so we start with size here and then reduce the
1216 length after conversion to the true value. */
1217 v = _PyUnicode_New(size);
1218 if (v == NULL)
1219 goto onError;
1220 if (size == 0)
1221 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 p = buf = PyUnicode_AS_UNICODE(v);
1224 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 while (s < end) {
1227 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001228 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001229 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230
1231 /* Non-escape characters are interpreted as Unicode ordinals */
1232 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001233 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 continue;
1235 }
1236
1237 /* \ - Escapes */
1238 s++;
1239 switch (*s++) {
1240
1241 /* \x escapes */
1242 case '\n': break;
1243 case '\\': *p++ = '\\'; break;
1244 case '\'': *p++ = '\''; break;
1245 case '\"': *p++ = '\"'; break;
1246 case 'b': *p++ = '\b'; break;
1247 case 'f': *p++ = '\014'; break; /* FF */
1248 case 't': *p++ = '\t'; break;
1249 case 'n': *p++ = '\n'; break;
1250 case 'r': *p++ = '\r'; break;
1251 case 'v': *p++ = '\013'; break; /* VT */
1252 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1253
1254 /* \OOO (octal) escapes */
1255 case '0': case '1': case '2': case '3':
1256 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001257 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001259 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001261 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001263 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 break;
1265
Fredrik Lundhccc74732001-02-18 22:13:49 +00001266 /* hex escapes */
1267 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001269 digits = 2;
1270 message = "truncated \\xXX escape";
1271 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272
Fredrik Lundhccc74732001-02-18 22:13:49 +00001273 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001275 digits = 4;
1276 message = "truncated \\uXXXX escape";
1277 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278
Fredrik Lundhccc74732001-02-18 22:13:49 +00001279 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001280 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001281 digits = 8;
1282 message = "truncated \\UXXXXXXXX escape";
1283 hexescape:
1284 chr = 0;
1285 for (i = 0; i < digits; i++) {
1286 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001287 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001288 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001289 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001290 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001291 i++;
1292 break;
1293 }
1294 chr = (chr<<4) & ~0xF;
1295 if (c >= '0' && c <= '9')
1296 chr += c - '0';
1297 else if (c >= 'a' && c <= 'f')
1298 chr += 10 + c - 'a';
1299 else
1300 chr += 10 + c - 'A';
1301 }
1302 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001303 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001304 /* when we get here, chr is a 32-bit unicode character */
1305 if (chr <= 0xffff)
1306 /* UCS-2 character */
1307 *p++ = (Py_UNICODE) chr;
1308 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001309 /* UCS-4 character. Either store directly, or as
1310 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001311#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001312 *p++ = chr;
1313#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001314 chr -= 0x10000L;
1315 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001316 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001317#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001318 } else {
1319 if (unicodeescape_decoding_error(
1320 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001321 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001322 )
1323 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001324 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001325 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001326 break;
1327
1328 /* \N{name} */
1329 case 'N':
1330 message = "malformed \\N character escape";
1331 if (ucnhash_CAPI == NULL) {
1332 /* load the unicode data module */
1333 PyObject *m, *v;
1334 m = PyImport_ImportModule("unicodedata");
1335 if (m == NULL)
1336 goto ucnhashError;
1337 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1338 Py_DECREF(m);
1339 if (v == NULL)
1340 goto ucnhashError;
1341 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1342 Py_DECREF(v);
1343 if (ucnhash_CAPI == NULL)
1344 goto ucnhashError;
1345 }
1346 if (*s == '{') {
1347 const char *start = s+1;
1348 /* look for the closing brace */
1349 while (*s != '}' && s < end)
1350 s++;
1351 if (s > start && s < end && *s == '}') {
1352 /* found a name. look it up in the unicode database */
1353 message = "unknown Unicode character name";
1354 s++;
1355 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1356 goto store;
1357 }
1358 }
1359 if (unicodeescape_decoding_error(&s, &x, errors, message))
1360 goto onError;
1361 *p++ = x;
1362 break;
1363
1364 default:
1365 *p++ = '\\';
1366 *p++ = (unsigned char)s[-1];
1367 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001368 }
1369 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001370 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001371 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 return (PyObject *)v;
1373
Fredrik Lundhccc74732001-02-18 22:13:49 +00001374ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001375 PyErr_SetString(
1376 PyExc_UnicodeError,
1377 "\\N escapes not supported (can't load unicodedata module)"
1378 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001379 return NULL;
1380
Fredrik Lundhccc74732001-02-18 22:13:49 +00001381onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 Py_XDECREF(v);
1383 return NULL;
1384}
1385
1386/* Return a Unicode-Escape string version of the Unicode object.
1387
1388 If quotes is true, the string is enclosed in u"" or u'' quotes as
1389 appropriate.
1390
1391*/
1392
Barry Warsaw51ac5802000-03-20 16:36:48 +00001393static const Py_UNICODE *findchar(const Py_UNICODE *s,
1394 int size,
1395 Py_UNICODE ch);
1396
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397static
1398PyObject *unicodeescape_string(const Py_UNICODE *s,
1399 int size,
1400 int quotes)
1401{
1402 PyObject *repr;
1403 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001405 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406
1407 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1408 if (repr == NULL)
1409 return NULL;
1410
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001411 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412
1413 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 *p++ = 'u';
1415 *p++ = (findchar(s, size, '\'') &&
1416 !findchar(s, size, '"')) ? '"' : '\'';
1417 }
1418 while (size-- > 0) {
1419 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001420
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001422 if (quotes &&
1423 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 *p++ = '\\';
1425 *p++ = (char) ch;
1426 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001427
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001428#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001429 /* Map 21-bit characters to '\U00xxxxxx' */
1430 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001431 int offset = p - PyString_AS_STRING(repr);
1432
1433 /* Resize the string if necessary */
1434 if (offset + 12 > PyString_GET_SIZE(repr)) {
1435 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1436 goto onError;
1437 p = PyString_AS_STRING(repr) + offset;
1438 }
1439
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001440 *p++ = '\\';
1441 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001442 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1443 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1444 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1445 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1446 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1447 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1448 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001449 *p++ = hexdigit[ch & 0x0000000F];
1450 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001451 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001452#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001453 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1454 else if (ch >= 0xD800 && ch < 0xDC00) {
1455 Py_UNICODE ch2;
1456 Py_UCS4 ucs;
1457
1458 ch2 = *s++;
1459 size--;
1460 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1461 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1462 *p++ = '\\';
1463 *p++ = 'U';
1464 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1465 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1466 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1467 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1468 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1469 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1470 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1471 *p++ = hexdigit[ucs & 0x0000000F];
1472 continue;
1473 }
1474 /* Fall through: isolated surrogates are copied as-is */
1475 s--;
1476 size++;
1477 }
1478
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001480 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 *p++ = '\\';
1482 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001483 *p++ = hexdigit[(ch >> 12) & 0x000F];
1484 *p++ = hexdigit[(ch >> 8) & 0x000F];
1485 *p++ = hexdigit[(ch >> 4) & 0x000F];
1486 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001488
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001489 /* Map special whitespace to '\t', \n', '\r' */
1490 else if (ch == '\t') {
1491 *p++ = '\\';
1492 *p++ = 't';
1493 }
1494 else if (ch == '\n') {
1495 *p++ = '\\';
1496 *p++ = 'n';
1497 }
1498 else if (ch == '\r') {
1499 *p++ = '\\';
1500 *p++ = 'r';
1501 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001502
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001503 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 else if (ch < ' ' || ch >= 128) {
1505 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001506 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001507 *p++ = hexdigit[(ch >> 4) & 0x000F];
1508 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001510
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 /* Copy everything else as-is */
1512 else
1513 *p++ = (char) ch;
1514 }
1515 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001516 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517
1518 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001519 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001520 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521
1522 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001523
1524 onError:
1525 Py_DECREF(repr);
1526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527}
1528
1529PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1530 int size)
1531{
1532 return unicodeescape_string(s, size, 0);
1533}
1534
1535PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1536{
1537 if (!PyUnicode_Check(unicode)) {
1538 PyErr_BadArgument();
1539 return NULL;
1540 }
1541 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1542 PyUnicode_GET_SIZE(unicode));
1543}
1544
1545/* --- Raw Unicode Escape Codec ------------------------------------------- */
1546
1547PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1548 int size,
1549 const char *errors)
1550{
1551 PyUnicodeObject *v;
1552 Py_UNICODE *p, *buf;
1553 const char *end;
1554 const char *bs;
1555
1556 /* Escaped strings will always be longer than the resulting
1557 Unicode string, so we start with size here and then reduce the
1558 length after conversion to the true value. */
1559 v = _PyUnicode_New(size);
1560 if (v == NULL)
1561 goto onError;
1562 if (size == 0)
1563 return (PyObject *)v;
1564 p = buf = PyUnicode_AS_UNICODE(v);
1565 end = s + size;
1566 while (s < end) {
1567 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001568 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569 int i;
1570
1571 /* Non-escape characters are interpreted as Unicode ordinals */
1572 if (*s != '\\') {
1573 *p++ = (unsigned char)*s++;
1574 continue;
1575 }
1576
1577 /* \u-escapes are only interpreted iff the number of leading
1578 backslashes if odd */
1579 bs = s;
1580 for (;s < end;) {
1581 if (*s != '\\')
1582 break;
1583 *p++ = (unsigned char)*s++;
1584 }
1585 if (((s - bs) & 1) == 0 ||
1586 s >= end ||
1587 *s != 'u') {
1588 continue;
1589 }
1590 p--;
1591 s++;
1592
1593 /* \uXXXX with 4 hex digits */
1594 for (x = 0, i = 0; i < 4; i++) {
1595 c = (unsigned char)s[i];
1596 if (!isxdigit(c)) {
1597 if (unicodeescape_decoding_error(&s, &x, errors,
1598 "truncated \\uXXXX"))
1599 goto onError;
1600 i++;
1601 break;
1602 }
1603 x = (x<<4) & ~0xF;
1604 if (c >= '0' && c <= '9')
1605 x += c - '0';
1606 else if (c >= 'a' && c <= 'f')
1607 x += 10 + c - 'a';
1608 else
1609 x += 10 + c - 'A';
1610 }
1611 s += i;
1612 *p++ = x;
1613 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001614 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001615 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 return (PyObject *)v;
1617
1618 onError:
1619 Py_XDECREF(v);
1620 return NULL;
1621}
1622
1623PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1624 int size)
1625{
1626 PyObject *repr;
1627 char *p;
1628 char *q;
1629
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001630 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631
1632 repr = PyString_FromStringAndSize(NULL, 6 * size);
1633 if (repr == NULL)
1634 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001635 if (size == 0)
1636 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637
1638 p = q = PyString_AS_STRING(repr);
1639 while (size-- > 0) {
1640 Py_UNICODE ch = *s++;
1641 /* Map 16-bit characters to '\uxxxx' */
1642 if (ch >= 256) {
1643 *p++ = '\\';
1644 *p++ = 'u';
1645 *p++ = hexdigit[(ch >> 12) & 0xf];
1646 *p++ = hexdigit[(ch >> 8) & 0xf];
1647 *p++ = hexdigit[(ch >> 4) & 0xf];
1648 *p++ = hexdigit[ch & 15];
1649 }
1650 /* Copy everything else as-is */
1651 else
1652 *p++ = (char) ch;
1653 }
1654 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001655 if (_PyString_Resize(&repr, p - q))
1656 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
1658 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001659
1660 onError:
1661 Py_DECREF(repr);
1662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663}
1664
1665PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1666{
1667 if (!PyUnicode_Check(unicode)) {
1668 PyErr_BadArgument();
1669 return NULL;
1670 }
1671 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1672 PyUnicode_GET_SIZE(unicode));
1673}
1674
1675/* --- Latin-1 Codec ------------------------------------------------------ */
1676
1677PyObject *PyUnicode_DecodeLatin1(const char *s,
1678 int size,
1679 const char *errors)
1680{
1681 PyUnicodeObject *v;
1682 Py_UNICODE *p;
1683
1684 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001685 if (size == 1 && *(unsigned char*)s < 256) {
1686 Py_UNICODE r = *(unsigned char*)s;
1687 return PyUnicode_FromUnicode(&r, 1);
1688 }
1689
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 v = _PyUnicode_New(size);
1691 if (v == NULL)
1692 goto onError;
1693 if (size == 0)
1694 return (PyObject *)v;
1695 p = PyUnicode_AS_UNICODE(v);
1696 while (size-- > 0)
1697 *p++ = (unsigned char)*s++;
1698 return (PyObject *)v;
1699
1700 onError:
1701 Py_XDECREF(v);
1702 return NULL;
1703}
1704
1705static
1706int latin1_encoding_error(const Py_UNICODE **source,
1707 char **dest,
1708 const char *errors,
1709 const char *details)
1710{
1711 if ((errors == NULL) ||
1712 (strcmp(errors,"strict") == 0)) {
1713 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001714 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715 details);
1716 return -1;
1717 }
1718 else if (strcmp(errors,"ignore") == 0) {
1719 return 0;
1720 }
1721 else if (strcmp(errors,"replace") == 0) {
1722 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001723 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724 return 0;
1725 }
1726 else {
1727 PyErr_Format(PyExc_ValueError,
1728 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001729 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 errors);
1731 return -1;
1732 }
1733}
1734
1735PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1736 int size,
1737 const char *errors)
1738{
1739 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001740 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001741
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 repr = PyString_FromStringAndSize(NULL, size);
1743 if (repr == NULL)
1744 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001745 if (size == 0)
1746 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747
1748 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001749 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 while (size-- > 0) {
1751 Py_UNICODE ch = *p++;
1752 if (ch >= 256) {
1753 if (latin1_encoding_error(&p, &s, errors,
1754 "ordinal not in range(256)"))
1755 goto onError;
1756 }
1757 else
1758 *s++ = (char)ch;
1759 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001760 /* Resize if error handling skipped some characters */
1761 if (s - start < PyString_GET_SIZE(repr))
1762 if (_PyString_Resize(&repr, s - start))
1763 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 return repr;
1765
1766 onError:
1767 Py_DECREF(repr);
1768 return NULL;
1769}
1770
1771PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1772{
1773 if (!PyUnicode_Check(unicode)) {
1774 PyErr_BadArgument();
1775 return NULL;
1776 }
1777 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1778 PyUnicode_GET_SIZE(unicode),
1779 NULL);
1780}
1781
1782/* --- 7-bit ASCII Codec -------------------------------------------------- */
1783
1784static
1785int ascii_decoding_error(const char **source,
1786 Py_UNICODE **dest,
1787 const char *errors,
1788 const char *details)
1789{
1790 if ((errors == NULL) ||
1791 (strcmp(errors,"strict") == 0)) {
1792 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001793 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 details);
1795 return -1;
1796 }
1797 else if (strcmp(errors,"ignore") == 0) {
1798 return 0;
1799 }
1800 else if (strcmp(errors,"replace") == 0) {
1801 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1802 (*dest)++;
1803 return 0;
1804 }
1805 else {
1806 PyErr_Format(PyExc_ValueError,
1807 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001808 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 errors);
1810 return -1;
1811 }
1812}
1813
1814PyObject *PyUnicode_DecodeASCII(const char *s,
1815 int size,
1816 const char *errors)
1817{
1818 PyUnicodeObject *v;
1819 Py_UNICODE *p;
1820
1821 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001822 if (size == 1 && *(unsigned char*)s < 128) {
1823 Py_UNICODE r = *(unsigned char*)s;
1824 return PyUnicode_FromUnicode(&r, 1);
1825 }
1826
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 v = _PyUnicode_New(size);
1828 if (v == NULL)
1829 goto onError;
1830 if (size == 0)
1831 return (PyObject *)v;
1832 p = PyUnicode_AS_UNICODE(v);
1833 while (size-- > 0) {
1834 register unsigned char c;
1835
1836 c = (unsigned char)*s++;
1837 if (c < 128)
1838 *p++ = c;
1839 else if (ascii_decoding_error(&s, &p, errors,
1840 "ordinal not in range(128)"))
1841 goto onError;
1842 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001843 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001844 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001845 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 return (PyObject *)v;
1847
1848 onError:
1849 Py_XDECREF(v);
1850 return NULL;
1851}
1852
1853static
1854int ascii_encoding_error(const Py_UNICODE **source,
1855 char **dest,
1856 const char *errors,
1857 const char *details)
1858{
1859 if ((errors == NULL) ||
1860 (strcmp(errors,"strict") == 0)) {
1861 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001862 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 details);
1864 return -1;
1865 }
1866 else if (strcmp(errors,"ignore") == 0) {
1867 return 0;
1868 }
1869 else if (strcmp(errors,"replace") == 0) {
1870 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001871 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872 return 0;
1873 }
1874 else {
1875 PyErr_Format(PyExc_ValueError,
1876 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001877 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 errors);
1879 return -1;
1880 }
1881}
1882
1883PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1884 int size,
1885 const char *errors)
1886{
1887 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001888 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001889
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890 repr = PyString_FromStringAndSize(NULL, size);
1891 if (repr == NULL)
1892 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001893 if (size == 0)
1894 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
1896 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001897 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 while (size-- > 0) {
1899 Py_UNICODE ch = *p++;
1900 if (ch >= 128) {
1901 if (ascii_encoding_error(&p, &s, errors,
1902 "ordinal not in range(128)"))
1903 goto onError;
1904 }
1905 else
1906 *s++ = (char)ch;
1907 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001908 /* Resize if error handling skipped some characters */
1909 if (s - start < PyString_GET_SIZE(repr))
1910 if (_PyString_Resize(&repr, s - start))
1911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 return repr;
1913
1914 onError:
1915 Py_DECREF(repr);
1916 return NULL;
1917}
1918
1919PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1920{
1921 if (!PyUnicode_Check(unicode)) {
1922 PyErr_BadArgument();
1923 return NULL;
1924 }
1925 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1926 PyUnicode_GET_SIZE(unicode),
1927 NULL);
1928}
1929
Fredrik Lundh30831632001-06-26 15:11:00 +00001930#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001931
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001932/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001933
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001934PyObject *PyUnicode_DecodeMBCS(const char *s,
1935 int size,
1936 const char *errors)
1937{
1938 PyUnicodeObject *v;
1939 Py_UNICODE *p;
1940
1941 /* First get the size of the result */
1942 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001943 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001944 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1945
1946 v = _PyUnicode_New(usize);
1947 if (v == NULL)
1948 return NULL;
1949 if (usize == 0)
1950 return (PyObject *)v;
1951 p = PyUnicode_AS_UNICODE(v);
1952 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1953 Py_DECREF(v);
1954 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1955 }
1956
1957 return (PyObject *)v;
1958}
1959
1960PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1961 int size,
1962 const char *errors)
1963{
1964 PyObject *repr;
1965 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001966 DWORD mbcssize;
1967
1968 /* If there are no characters, bail now! */
1969 if (size==0)
1970 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001971
1972 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001973 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001974 if (mbcssize==0)
1975 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1976
1977 repr = PyString_FromStringAndSize(NULL, mbcssize);
1978 if (repr == NULL)
1979 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001980 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001981 return repr;
1982
1983 /* Do the conversion */
1984 s = PyString_AS_STRING(repr);
1985 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1986 Py_DECREF(repr);
1987 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1988 }
1989 return repr;
1990}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001991
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001992#endif /* MS_WIN32 */
1993
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994/* --- Character Mapping Codec -------------------------------------------- */
1995
1996static
1997int charmap_decoding_error(const char **source,
1998 Py_UNICODE **dest,
1999 const char *errors,
2000 const char *details)
2001{
2002 if ((errors == NULL) ||
2003 (strcmp(errors,"strict") == 0)) {
2004 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002005 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 details);
2007 return -1;
2008 }
2009 else if (strcmp(errors,"ignore") == 0) {
2010 return 0;
2011 }
2012 else if (strcmp(errors,"replace") == 0) {
2013 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2014 (*dest)++;
2015 return 0;
2016 }
2017 else {
2018 PyErr_Format(PyExc_ValueError,
2019 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002020 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 errors);
2022 return -1;
2023 }
2024}
2025
2026PyObject *PyUnicode_DecodeCharmap(const char *s,
2027 int size,
2028 PyObject *mapping,
2029 const char *errors)
2030{
2031 PyUnicodeObject *v;
2032 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002033 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
2035 /* Default to Latin-1 */
2036 if (mapping == NULL)
2037 return PyUnicode_DecodeLatin1(s, size, errors);
2038
2039 v = _PyUnicode_New(size);
2040 if (v == NULL)
2041 goto onError;
2042 if (size == 0)
2043 return (PyObject *)v;
2044 p = PyUnicode_AS_UNICODE(v);
2045 while (size-- > 0) {
2046 unsigned char ch = *s++;
2047 PyObject *w, *x;
2048
2049 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2050 w = PyInt_FromLong((long)ch);
2051 if (w == NULL)
2052 goto onError;
2053 x = PyObject_GetItem(mapping, w);
2054 Py_DECREF(w);
2055 if (x == NULL) {
2056 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002057 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002059 x = Py_None;
2060 Py_INCREF(x);
2061 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002062 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 }
2064
2065 /* Apply mapping */
2066 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002067 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 if (value < 0 || value > 65535) {
2069 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002070 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 Py_DECREF(x);
2072 goto onError;
2073 }
2074 *p++ = (Py_UNICODE)value;
2075 }
2076 else if (x == Py_None) {
2077 /* undefined mapping */
2078 if (charmap_decoding_error(&s, &p, errors,
2079 "character maps to <undefined>")) {
2080 Py_DECREF(x);
2081 goto onError;
2082 }
2083 }
2084 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002085 int targetsize = PyUnicode_GET_SIZE(x);
2086
2087 if (targetsize == 1)
2088 /* 1-1 mapping */
2089 *p++ = *PyUnicode_AS_UNICODE(x);
2090
2091 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002093 if (targetsize > extrachars) {
2094 /* resize first */
2095 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2096 int needed = (targetsize - extrachars) + \
2097 (targetsize << 2);
2098 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002099 if (_PyUnicode_Resize(&v,
2100 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002101 Py_DECREF(x);
2102 goto onError;
2103 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002104 p = PyUnicode_AS_UNICODE(v) + oldpos;
2105 }
2106 Py_UNICODE_COPY(p,
2107 PyUnicode_AS_UNICODE(x),
2108 targetsize);
2109 p += targetsize;
2110 extrachars -= targetsize;
2111 }
2112 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 }
2114 else {
2115 /* wrong return value */
2116 PyErr_SetString(PyExc_TypeError,
2117 "character mapping must return integer, None or unicode");
2118 Py_DECREF(x);
2119 goto onError;
2120 }
2121 Py_DECREF(x);
2122 }
2123 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002124 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 goto onError;
2126 return (PyObject *)v;
2127
2128 onError:
2129 Py_XDECREF(v);
2130 return NULL;
2131}
2132
2133static
2134int charmap_encoding_error(const Py_UNICODE **source,
2135 char **dest,
2136 const char *errors,
2137 const char *details)
2138{
2139 if ((errors == NULL) ||
2140 (strcmp(errors,"strict") == 0)) {
2141 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002142 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143 details);
2144 return -1;
2145 }
2146 else if (strcmp(errors,"ignore") == 0) {
2147 return 0;
2148 }
2149 else if (strcmp(errors,"replace") == 0) {
2150 **dest = '?';
2151 (*dest)++;
2152 return 0;
2153 }
2154 else {
2155 PyErr_Format(PyExc_ValueError,
2156 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002157 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 errors);
2159 return -1;
2160 }
2161}
2162
2163PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2164 int size,
2165 PyObject *mapping,
2166 const char *errors)
2167{
2168 PyObject *v;
2169 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002170 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171
2172 /* Default to Latin-1 */
2173 if (mapping == NULL)
2174 return PyUnicode_EncodeLatin1(p, size, errors);
2175
2176 v = PyString_FromStringAndSize(NULL, size);
2177 if (v == NULL)
2178 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002179 if (size == 0)
2180 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 s = PyString_AS_STRING(v);
2182 while (size-- > 0) {
2183 Py_UNICODE ch = *p++;
2184 PyObject *w, *x;
2185
2186 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2187 w = PyInt_FromLong((long)ch);
2188 if (w == NULL)
2189 goto onError;
2190 x = PyObject_GetItem(mapping, w);
2191 Py_DECREF(w);
2192 if (x == NULL) {
2193 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002194 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002196 x = Py_None;
2197 Py_INCREF(x);
2198 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002199 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 }
2201
2202 /* Apply mapping */
2203 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002204 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 if (value < 0 || value > 255) {
2206 PyErr_SetString(PyExc_TypeError,
2207 "character mapping must be in range(256)");
2208 Py_DECREF(x);
2209 goto onError;
2210 }
2211 *s++ = (char)value;
2212 }
2213 else if (x == Py_None) {
2214 /* undefined mapping */
2215 if (charmap_encoding_error(&p, &s, errors,
2216 "character maps to <undefined>")) {
2217 Py_DECREF(x);
2218 goto onError;
2219 }
2220 }
2221 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002222 int targetsize = PyString_GET_SIZE(x);
2223
2224 if (targetsize == 1)
2225 /* 1-1 mapping */
2226 *s++ = *PyString_AS_STRING(x);
2227
2228 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002230 if (targetsize > extrachars) {
2231 /* resize first */
2232 int oldpos = (int)(s - PyString_AS_STRING(v));
2233 int needed = (targetsize - extrachars) + \
2234 (targetsize << 2);
2235 extrachars += needed;
2236 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002237 Py_DECREF(x);
2238 goto onError;
2239 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002240 s = PyString_AS_STRING(v) + oldpos;
2241 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002242 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002243 s += targetsize;
2244 extrachars -= targetsize;
2245 }
2246 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 }
2248 else {
2249 /* wrong return value */
2250 PyErr_SetString(PyExc_TypeError,
2251 "character mapping must return integer, None or unicode");
2252 Py_DECREF(x);
2253 goto onError;
2254 }
2255 Py_DECREF(x);
2256 }
2257 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2258 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2259 goto onError;
2260 return v;
2261
2262 onError:
2263 Py_DECREF(v);
2264 return NULL;
2265}
2266
2267PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2268 PyObject *mapping)
2269{
2270 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2271 PyErr_BadArgument();
2272 return NULL;
2273 }
2274 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2275 PyUnicode_GET_SIZE(unicode),
2276 mapping,
2277 NULL);
2278}
2279
2280static
2281int translate_error(const Py_UNICODE **source,
2282 Py_UNICODE **dest,
2283 const char *errors,
2284 const char *details)
2285{
2286 if ((errors == NULL) ||
2287 (strcmp(errors,"strict") == 0)) {
2288 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002289 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290 details);
2291 return -1;
2292 }
2293 else if (strcmp(errors,"ignore") == 0) {
2294 return 0;
2295 }
2296 else if (strcmp(errors,"replace") == 0) {
2297 **dest = '?';
2298 (*dest)++;
2299 return 0;
2300 }
2301 else {
2302 PyErr_Format(PyExc_ValueError,
2303 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002304 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 errors);
2306 return -1;
2307 }
2308}
2309
2310PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2311 int size,
2312 PyObject *mapping,
2313 const char *errors)
2314{
2315 PyUnicodeObject *v;
2316 Py_UNICODE *p;
2317
2318 if (mapping == NULL) {
2319 PyErr_BadArgument();
2320 return NULL;
2321 }
2322
2323 /* Output will never be longer than input */
2324 v = _PyUnicode_New(size);
2325 if (v == NULL)
2326 goto onError;
2327 if (size == 0)
2328 goto done;
2329 p = PyUnicode_AS_UNICODE(v);
2330 while (size-- > 0) {
2331 Py_UNICODE ch = *s++;
2332 PyObject *w, *x;
2333
2334 /* Get mapping */
2335 w = PyInt_FromLong(ch);
2336 if (w == NULL)
2337 goto onError;
2338 x = PyObject_GetItem(mapping, w);
2339 Py_DECREF(w);
2340 if (x == NULL) {
2341 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2342 /* No mapping found: default to 1-1 mapping */
2343 PyErr_Clear();
2344 *p++ = ch;
2345 continue;
2346 }
2347 goto onError;
2348 }
2349
2350 /* Apply mapping */
2351 if (PyInt_Check(x))
2352 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2353 else if (x == Py_None) {
2354 /* undefined mapping */
2355 if (translate_error(&s, &p, errors,
2356 "character maps to <undefined>")) {
2357 Py_DECREF(x);
2358 goto onError;
2359 }
2360 }
2361 else if (PyUnicode_Check(x)) {
2362 if (PyUnicode_GET_SIZE(x) != 1) {
2363 /* 1-n mapping */
2364 PyErr_SetString(PyExc_NotImplementedError,
2365 "1-n mappings are currently not implemented");
2366 Py_DECREF(x);
2367 goto onError;
2368 }
2369 *p++ = *PyUnicode_AS_UNICODE(x);
2370 }
2371 else {
2372 /* wrong return value */
2373 PyErr_SetString(PyExc_TypeError,
2374 "translate mapping must return integer, None or unicode");
2375 Py_DECREF(x);
2376 goto onError;
2377 }
2378 Py_DECREF(x);
2379 }
2380 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002381 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002382 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383
2384 done:
2385 return (PyObject *)v;
2386
2387 onError:
2388 Py_XDECREF(v);
2389 return NULL;
2390}
2391
2392PyObject *PyUnicode_Translate(PyObject *str,
2393 PyObject *mapping,
2394 const char *errors)
2395{
2396 PyObject *result;
2397
2398 str = PyUnicode_FromObject(str);
2399 if (str == NULL)
2400 goto onError;
2401 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2402 PyUnicode_GET_SIZE(str),
2403 mapping,
2404 errors);
2405 Py_DECREF(str);
2406 return result;
2407
2408 onError:
2409 Py_XDECREF(str);
2410 return NULL;
2411}
2412
Guido van Rossum9e896b32000-04-05 20:11:21 +00002413/* --- Decimal Encoder ---------------------------------------------------- */
2414
2415int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2416 int length,
2417 char *output,
2418 const char *errors)
2419{
2420 Py_UNICODE *p, *end;
2421
2422 if (output == NULL) {
2423 PyErr_BadArgument();
2424 return -1;
2425 }
2426
2427 p = s;
2428 end = s + length;
2429 while (p < end) {
2430 register Py_UNICODE ch = *p++;
2431 int decimal;
2432
2433 if (Py_UNICODE_ISSPACE(ch)) {
2434 *output++ = ' ';
2435 continue;
2436 }
2437 decimal = Py_UNICODE_TODECIMAL(ch);
2438 if (decimal >= 0) {
2439 *output++ = '0' + decimal;
2440 continue;
2441 }
Guido van Rossumba477042000-04-06 18:18:10 +00002442 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002443 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002444 continue;
2445 }
2446 /* All other characters are considered invalid */
2447 if (errors == NULL || strcmp(errors, "strict") == 0) {
2448 PyErr_SetString(PyExc_ValueError,
2449 "invalid decimal Unicode string");
2450 goto onError;
2451 }
2452 else if (strcmp(errors, "ignore") == 0)
2453 continue;
2454 else if (strcmp(errors, "replace") == 0) {
2455 *output++ = '?';
2456 continue;
2457 }
2458 }
2459 /* 0-terminate the output string */
2460 *output++ = '\0';
2461 return 0;
2462
2463 onError:
2464 return -1;
2465}
2466
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467/* --- Helpers ------------------------------------------------------------ */
2468
2469static
2470int count(PyUnicodeObject *self,
2471 int start,
2472 int end,
2473 PyUnicodeObject *substring)
2474{
2475 int count = 0;
2476
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002477 if (start < 0)
2478 start += self->length;
2479 if (start < 0)
2480 start = 0;
2481 if (end > self->length)
2482 end = self->length;
2483 if (end < 0)
2484 end += self->length;
2485 if (end < 0)
2486 end = 0;
2487
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002488 if (substring->length == 0)
2489 return (end - start + 1);
2490
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 end -= substring->length;
2492
2493 while (start <= end)
2494 if (Py_UNICODE_MATCH(self, start, substring)) {
2495 count++;
2496 start += substring->length;
2497 } else
2498 start++;
2499
2500 return count;
2501}
2502
2503int PyUnicode_Count(PyObject *str,
2504 PyObject *substr,
2505 int start,
2506 int end)
2507{
2508 int result;
2509
2510 str = PyUnicode_FromObject(str);
2511 if (str == NULL)
2512 return -1;
2513 substr = PyUnicode_FromObject(substr);
2514 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002515 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 return -1;
2517 }
2518
2519 result = count((PyUnicodeObject *)str,
2520 start, end,
2521 (PyUnicodeObject *)substr);
2522
2523 Py_DECREF(str);
2524 Py_DECREF(substr);
2525 return result;
2526}
2527
2528static
2529int findstring(PyUnicodeObject *self,
2530 PyUnicodeObject *substring,
2531 int start,
2532 int end,
2533 int direction)
2534{
2535 if (start < 0)
2536 start += self->length;
2537 if (start < 0)
2538 start = 0;
2539
2540 if (substring->length == 0)
2541 return start;
2542
2543 if (end > self->length)
2544 end = self->length;
2545 if (end < 0)
2546 end += self->length;
2547 if (end < 0)
2548 end = 0;
2549
2550 end -= substring->length;
2551
2552 if (direction < 0) {
2553 for (; end >= start; end--)
2554 if (Py_UNICODE_MATCH(self, end, substring))
2555 return end;
2556 } else {
2557 for (; start <= end; start++)
2558 if (Py_UNICODE_MATCH(self, start, substring))
2559 return start;
2560 }
2561
2562 return -1;
2563}
2564
2565int PyUnicode_Find(PyObject *str,
2566 PyObject *substr,
2567 int start,
2568 int end,
2569 int direction)
2570{
2571 int result;
2572
2573 str = PyUnicode_FromObject(str);
2574 if (str == NULL)
2575 return -1;
2576 substr = PyUnicode_FromObject(substr);
2577 if (substr == NULL) {
2578 Py_DECREF(substr);
2579 return -1;
2580 }
2581
2582 result = findstring((PyUnicodeObject *)str,
2583 (PyUnicodeObject *)substr,
2584 start, end, direction);
2585 Py_DECREF(str);
2586 Py_DECREF(substr);
2587 return result;
2588}
2589
2590static
2591int tailmatch(PyUnicodeObject *self,
2592 PyUnicodeObject *substring,
2593 int start,
2594 int end,
2595 int direction)
2596{
2597 if (start < 0)
2598 start += self->length;
2599 if (start < 0)
2600 start = 0;
2601
2602 if (substring->length == 0)
2603 return 1;
2604
2605 if (end > self->length)
2606 end = self->length;
2607 if (end < 0)
2608 end += self->length;
2609 if (end < 0)
2610 end = 0;
2611
2612 end -= substring->length;
2613 if (end < start)
2614 return 0;
2615
2616 if (direction > 0) {
2617 if (Py_UNICODE_MATCH(self, end, substring))
2618 return 1;
2619 } else {
2620 if (Py_UNICODE_MATCH(self, start, substring))
2621 return 1;
2622 }
2623
2624 return 0;
2625}
2626
2627int PyUnicode_Tailmatch(PyObject *str,
2628 PyObject *substr,
2629 int start,
2630 int end,
2631 int direction)
2632{
2633 int result;
2634
2635 str = PyUnicode_FromObject(str);
2636 if (str == NULL)
2637 return -1;
2638 substr = PyUnicode_FromObject(substr);
2639 if (substr == NULL) {
2640 Py_DECREF(substr);
2641 return -1;
2642 }
2643
2644 result = tailmatch((PyUnicodeObject *)str,
2645 (PyUnicodeObject *)substr,
2646 start, end, direction);
2647 Py_DECREF(str);
2648 Py_DECREF(substr);
2649 return result;
2650}
2651
2652static
2653const Py_UNICODE *findchar(const Py_UNICODE *s,
2654 int size,
2655 Py_UNICODE ch)
2656{
2657 /* like wcschr, but doesn't stop at NULL characters */
2658
2659 while (size-- > 0) {
2660 if (*s == ch)
2661 return s;
2662 s++;
2663 }
2664
2665 return NULL;
2666}
2667
2668/* Apply fixfct filter to the Unicode object self and return a
2669 reference to the modified object */
2670
2671static
2672PyObject *fixup(PyUnicodeObject *self,
2673 int (*fixfct)(PyUnicodeObject *s))
2674{
2675
2676 PyUnicodeObject *u;
2677
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002678 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 if (u == NULL)
2680 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002681
2682 Py_UNICODE_COPY(u->str, self->str, self->length);
2683
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (!fixfct(u)) {
2685 /* fixfct should return TRUE if it modified the buffer. If
2686 FALSE, return a reference to the original buffer instead
2687 (to save space, not time) */
2688 Py_INCREF(self);
2689 Py_DECREF(u);
2690 return (PyObject*) self;
2691 }
2692 return (PyObject*) u;
2693}
2694
2695static
2696int fixupper(PyUnicodeObject *self)
2697{
2698 int len = self->length;
2699 Py_UNICODE *s = self->str;
2700 int status = 0;
2701
2702 while (len-- > 0) {
2703 register Py_UNICODE ch;
2704
2705 ch = Py_UNICODE_TOUPPER(*s);
2706 if (ch != *s) {
2707 status = 1;
2708 *s = ch;
2709 }
2710 s++;
2711 }
2712
2713 return status;
2714}
2715
2716static
2717int fixlower(PyUnicodeObject *self)
2718{
2719 int len = self->length;
2720 Py_UNICODE *s = self->str;
2721 int status = 0;
2722
2723 while (len-- > 0) {
2724 register Py_UNICODE ch;
2725
2726 ch = Py_UNICODE_TOLOWER(*s);
2727 if (ch != *s) {
2728 status = 1;
2729 *s = ch;
2730 }
2731 s++;
2732 }
2733
2734 return status;
2735}
2736
2737static
2738int fixswapcase(PyUnicodeObject *self)
2739{
2740 int len = self->length;
2741 Py_UNICODE *s = self->str;
2742 int status = 0;
2743
2744 while (len-- > 0) {
2745 if (Py_UNICODE_ISUPPER(*s)) {
2746 *s = Py_UNICODE_TOLOWER(*s);
2747 status = 1;
2748 } else if (Py_UNICODE_ISLOWER(*s)) {
2749 *s = Py_UNICODE_TOUPPER(*s);
2750 status = 1;
2751 }
2752 s++;
2753 }
2754
2755 return status;
2756}
2757
2758static
2759int fixcapitalize(PyUnicodeObject *self)
2760{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002761 int len = self->length;
2762 Py_UNICODE *s = self->str;
2763 int status = 0;
2764
2765 if (len == 0)
2766 return 0;
2767 if (Py_UNICODE_ISLOWER(*s)) {
2768 *s = Py_UNICODE_TOUPPER(*s);
2769 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002771 s++;
2772 while (--len > 0) {
2773 if (Py_UNICODE_ISUPPER(*s)) {
2774 *s = Py_UNICODE_TOLOWER(*s);
2775 status = 1;
2776 }
2777 s++;
2778 }
2779 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780}
2781
2782static
2783int fixtitle(PyUnicodeObject *self)
2784{
2785 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2786 register Py_UNICODE *e;
2787 int previous_is_cased;
2788
2789 /* Shortcut for single character strings */
2790 if (PyUnicode_GET_SIZE(self) == 1) {
2791 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2792 if (*p != ch) {
2793 *p = ch;
2794 return 1;
2795 }
2796 else
2797 return 0;
2798 }
2799
2800 e = p + PyUnicode_GET_SIZE(self);
2801 previous_is_cased = 0;
2802 for (; p < e; p++) {
2803 register const Py_UNICODE ch = *p;
2804
2805 if (previous_is_cased)
2806 *p = Py_UNICODE_TOLOWER(ch);
2807 else
2808 *p = Py_UNICODE_TOTITLE(ch);
2809
2810 if (Py_UNICODE_ISLOWER(ch) ||
2811 Py_UNICODE_ISUPPER(ch) ||
2812 Py_UNICODE_ISTITLE(ch))
2813 previous_is_cased = 1;
2814 else
2815 previous_is_cased = 0;
2816 }
2817 return 1;
2818}
2819
2820PyObject *PyUnicode_Join(PyObject *separator,
2821 PyObject *seq)
2822{
2823 Py_UNICODE *sep;
2824 int seplen;
2825 PyUnicodeObject *res = NULL;
2826 int reslen = 0;
2827 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 int sz = 100;
2829 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002830 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831
Tim Peters2cfe3682001-05-05 05:36:48 +00002832 it = PyObject_GetIter(seq);
2833 if (it == NULL)
2834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835
2836 if (separator == NULL) {
2837 Py_UNICODE blank = ' ';
2838 sep = &blank;
2839 seplen = 1;
2840 }
2841 else {
2842 separator = PyUnicode_FromObject(separator);
2843 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002844 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 sep = PyUnicode_AS_UNICODE(separator);
2846 seplen = PyUnicode_GET_SIZE(separator);
2847 }
2848
2849 res = _PyUnicode_New(sz);
2850 if (res == NULL)
2851 goto onError;
2852 p = PyUnicode_AS_UNICODE(res);
2853 reslen = 0;
2854
Tim Peters2cfe3682001-05-05 05:36:48 +00002855 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002857 PyObject *item = PyIter_Next(it);
2858 if (item == NULL) {
2859 if (PyErr_Occurred())
2860 goto onError;
2861 break;
2862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 if (!PyUnicode_Check(item)) {
2864 PyObject *v;
2865 v = PyUnicode_FromObject(item);
2866 Py_DECREF(item);
2867 item = v;
2868 if (item == NULL)
2869 goto onError;
2870 }
2871 itemlen = PyUnicode_GET_SIZE(item);
2872 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002873 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 goto onError;
2875 sz *= 2;
2876 p = PyUnicode_AS_UNICODE(res) + reslen;
2877 }
2878 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002879 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 p += seplen;
2881 reslen += seplen;
2882 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002883 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 p += itemlen;
2885 reslen += itemlen;
2886 Py_DECREF(item);
2887 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002888 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 goto onError;
2890
2891 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002892 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 return (PyObject *)res;
2894
2895 onError:
2896 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002897 Py_XDECREF(res);
2898 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899 return NULL;
2900}
2901
2902static
2903PyUnicodeObject *pad(PyUnicodeObject *self,
2904 int left,
2905 int right,
2906 Py_UNICODE fill)
2907{
2908 PyUnicodeObject *u;
2909
2910 if (left < 0)
2911 left = 0;
2912 if (right < 0)
2913 right = 0;
2914
2915 if (left == 0 && right == 0) {
2916 Py_INCREF(self);
2917 return self;
2918 }
2919
2920 u = _PyUnicode_New(left + self->length + right);
2921 if (u) {
2922 if (left)
2923 Py_UNICODE_FILL(u->str, fill, left);
2924 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2925 if (right)
2926 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2927 }
2928
2929 return u;
2930}
2931
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the expansion site providing a `PyObject *str` scratch
   variable, a `PyObject *list` and an `onError` label; control jumps to
   `onError` if creating the slice or appending it fails.  The temporary
   reference held in `str` is always released.

   Wrapped in do { } while (0) so the macro behaves as one statement, and
   the arguments are parenthesized so expressions expand safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else {                                                          \
            Py_DECREF(str);                                             \
        }                                                               \
    } while (0)
2942
2943static
2944PyObject *split_whitespace(PyUnicodeObject *self,
2945 PyObject *list,
2946 int maxcount)
2947{
2948 register int i;
2949 register int j;
2950 int len = self->length;
2951 PyObject *str;
2952
2953 for (i = j = 0; i < len; ) {
2954 /* find a token */
2955 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2956 i++;
2957 j = i;
2958 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2959 i++;
2960 if (j < i) {
2961 if (maxcount-- <= 0)
2962 break;
2963 SPLIT_APPEND(self->str, j, i);
2964 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2965 i++;
2966 j = i;
2967 }
2968 }
2969 if (j < len) {
2970 SPLIT_APPEND(self->str, j, len);
2971 }
2972 return list;
2973
2974 onError:
2975 Py_DECREF(list);
2976 return NULL;
2977}
2978
2979PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002980 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981{
2982 register int i;
2983 register int j;
2984 int len;
2985 PyObject *list;
2986 PyObject *str;
2987 Py_UNICODE *data;
2988
2989 string = PyUnicode_FromObject(string);
2990 if (string == NULL)
2991 return NULL;
2992 data = PyUnicode_AS_UNICODE(string);
2993 len = PyUnicode_GET_SIZE(string);
2994
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 list = PyList_New(0);
2996 if (!list)
2997 goto onError;
2998
2999 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003000 int eol;
3001
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 /* Find a line and append it */
3003 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3004 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005
3006 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003007 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 if (i < len) {
3009 if (data[i] == '\r' && i + 1 < len &&
3010 data[i+1] == '\n')
3011 i += 2;
3012 else
3013 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003014 if (keepends)
3015 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 }
Guido van Rossum86662912000-04-11 15:38:46 +00003017 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 j = i;
3019 }
3020 if (j < len) {
3021 SPLIT_APPEND(data, j, len);
3022 }
3023
3024 Py_DECREF(string);
3025 return list;
3026
3027 onError:
3028 Py_DECREF(list);
3029 Py_DECREF(string);
3030 return NULL;
3031}
3032
3033static
3034PyObject *split_char(PyUnicodeObject *self,
3035 PyObject *list,
3036 Py_UNICODE ch,
3037 int maxcount)
3038{
3039 register int i;
3040 register int j;
3041 int len = self->length;
3042 PyObject *str;
3043
3044 for (i = j = 0; i < len; ) {
3045 if (self->str[i] == ch) {
3046 if (maxcount-- <= 0)
3047 break;
3048 SPLIT_APPEND(self->str, j, i);
3049 i = j = i + 1;
3050 } else
3051 i++;
3052 }
3053 if (j <= len) {
3054 SPLIT_APPEND(self->str, j, len);
3055 }
3056 return list;
3057
3058 onError:
3059 Py_DECREF(list);
3060 return NULL;
3061}
3062
3063static
3064PyObject *split_substring(PyUnicodeObject *self,
3065 PyObject *list,
3066 PyUnicodeObject *substring,
3067 int maxcount)
3068{
3069 register int i;
3070 register int j;
3071 int len = self->length;
3072 int sublen = substring->length;
3073 PyObject *str;
3074
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003075 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 if (Py_UNICODE_MATCH(self, i, substring)) {
3077 if (maxcount-- <= 0)
3078 break;
3079 SPLIT_APPEND(self->str, j, i);
3080 i = j = i + sublen;
3081 } else
3082 i++;
3083 }
3084 if (j <= len) {
3085 SPLIT_APPEND(self->str, j, len);
3086 }
3087 return list;
3088
3089 onError:
3090 Py_DECREF(list);
3091 return NULL;
3092}
3093
3094#undef SPLIT_APPEND
3095
3096static
3097PyObject *split(PyUnicodeObject *self,
3098 PyUnicodeObject *substring,
3099 int maxcount)
3100{
3101 PyObject *list;
3102
3103 if (maxcount < 0)
3104 maxcount = INT_MAX;
3105
3106 list = PyList_New(0);
3107 if (!list)
3108 return NULL;
3109
3110 if (substring == NULL)
3111 return split_whitespace(self,list,maxcount);
3112
3113 else if (substring->length == 1)
3114 return split_char(self,list,substring->str[0],maxcount);
3115
3116 else if (substring->length == 0) {
3117 Py_DECREF(list);
3118 PyErr_SetString(PyExc_ValueError, "empty separator");
3119 return NULL;
3120 }
3121 else
3122 return split_substring(self,list,substring,maxcount);
3123}
3124
3125static
3126PyObject *strip(PyUnicodeObject *self,
3127 int left,
3128 int right)
3129{
3130 Py_UNICODE *p = self->str;
3131 int start = 0;
3132 int end = self->length;
3133
3134 if (left)
3135 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3136 start++;
3137
3138 if (right)
3139 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3140 end--;
3141
3142 if (start == 0 && end == self->length) {
3143 /* couldn't strip anything off, return original string */
3144 Py_INCREF(self);
3145 return (PyObject*) self;
3146 }
3147
3148 return (PyObject*) PyUnicode_FromUnicode(
3149 self->str + start,
3150 end - start
3151 );
3152}
3153
3154static
3155PyObject *replace(PyUnicodeObject *self,
3156 PyUnicodeObject *str1,
3157 PyUnicodeObject *str2,
3158 int maxcount)
3159{
3160 PyUnicodeObject *u;
3161
3162 if (maxcount < 0)
3163 maxcount = INT_MAX;
3164
3165 if (str1->length == 1 && str2->length == 1) {
3166 int i;
3167
3168 /* replace characters */
3169 if (!findchar(self->str, self->length, str1->str[0])) {
3170 /* nothing to replace, return original string */
3171 Py_INCREF(self);
3172 u = self;
3173 } else {
3174 Py_UNICODE u1 = str1->str[0];
3175 Py_UNICODE u2 = str2->str[0];
3176
3177 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003178 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 self->length
3180 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003181 if (u != NULL) {
3182 Py_UNICODE_COPY(u->str, self->str,
3183 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184 for (i = 0; i < u->length; i++)
3185 if (u->str[i] == u1) {
3186 if (--maxcount < 0)
3187 break;
3188 u->str[i] = u2;
3189 }
3190 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192
3193 } else {
3194 int n, i;
3195 Py_UNICODE *p;
3196
3197 /* replace strings */
3198 n = count(self, 0, self->length, str1);
3199 if (n > maxcount)
3200 n = maxcount;
3201 if (n == 0) {
3202 /* nothing to replace, return original string */
3203 Py_INCREF(self);
3204 u = self;
3205 } else {
3206 u = _PyUnicode_New(
3207 self->length + n * (str2->length - str1->length));
3208 if (u) {
3209 i = 0;
3210 p = u->str;
3211 while (i <= self->length - str1->length)
3212 if (Py_UNICODE_MATCH(self, i, str1)) {
3213 /* replace string segment */
3214 Py_UNICODE_COPY(p, str2->str, str2->length);
3215 p += str2->length;
3216 i += str1->length;
3217 if (--n <= 0) {
3218 /* copy remaining part */
3219 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3220 break;
3221 }
3222 } else
3223 *p++ = self->str[i++];
3224 }
3225 }
3226 }
3227
3228 return (PyObject *) u;
3229}
3230
3231/* --- Unicode Object Methods --------------------------------------------- */
3232
3233static char title__doc__[] =
3234"S.title() -> unicode\n\
3235\n\
3236Return a titlecased version of S, i.e. words start with title case\n\
3237characters, all remaining cased characters have lower case.";
3238
3239static PyObject*
3240unicode_title(PyUnicodeObject *self, PyObject *args)
3241{
3242 if (!PyArg_NoArgs(args))
3243 return NULL;
3244 return fixup(self, fixtitle);
3245}
3246
3247static char capitalize__doc__[] =
3248"S.capitalize() -> unicode\n\
3249\n\
3250Return a capitalized version of S, i.e. make the first character\n\
3251have upper case.";
3252
3253static PyObject*
3254unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3255{
3256 if (!PyArg_NoArgs(args))
3257 return NULL;
3258 return fixup(self, fixcapitalize);
3259}
3260
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled: split S on whitespace, capitalize every word in place in
   the list, then join the words with the default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        /* swap the capitalized word in for the original one */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3301
3302static char center__doc__[] =
3303"S.center(width) -> unicode\n\
3304\n\
3305Return S centered in a Unicode string of length width. Padding is done\n\
3306using spaces.";
3307
3308static PyObject *
3309unicode_center(PyUnicodeObject *self, PyObject *args)
3310{
3311 int marg, left;
3312 int width;
3313
3314 if (!PyArg_ParseTuple(args, "i:center", &width))
3315 return NULL;
3316
3317 if (self->length >= width) {
3318 Py_INCREF(self);
3319 return (PyObject*) self;
3320 }
3321
3322 marg = width - self->length;
3323 left = marg / 2 + (marg & width & 1);
3324
3325 return (PyObject*) pad(self, left, marg - left, ' ');
3326}
3327
Marc-André Lemburge5034372000-08-08 08:04:29 +00003328#if 0
3329
3330/* This code should go into some future Unicode collation support
3331 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003332 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003333
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003334/* speedy UTF-16 code point order comparison */
3335/* gleaned from: */
3336/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3337
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003338static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003339{
3340 0, 0, 0, 0, 0, 0, 0, 0,
3341 0, 0, 0, 0, 0, 0, 0, 0,
3342 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003343 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003344};
3345
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346static int
3347unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3348{
3349 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003350
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 Py_UNICODE *s1 = str1->str;
3352 Py_UNICODE *s2 = str2->str;
3353
3354 len1 = str1->length;
3355 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003356
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003358 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003359
3360 c1 = *s1++;
3361 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003362
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003363 if (c1 > (1<<11) * 26)
3364 c1 += utf16Fixup[c1>>11];
3365 if (c2 > (1<<11) * 26)
3366 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003367 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003368
3369 if (c1 != c2)
3370 return (c1 < c2) ? -1 : 1;
3371
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003372 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 }
3374
3375 return (len1 < len2) ? -1 : (len1 != len2);
3376}
3377
Marc-André Lemburge5034372000-08-08 08:04:29 +00003378#else
3379
3380static int
3381unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3382{
3383 register int len1, len2;
3384
3385 Py_UNICODE *s1 = str1->str;
3386 Py_UNICODE *s2 = str2->str;
3387
3388 len1 = str1->length;
3389 len2 = str2->length;
3390
3391 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003392 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003393
Fredrik Lundh45714e92001-06-26 16:39:36 +00003394 c1 = *s1++;
3395 c2 = *s2++;
3396
3397 if (c1 != c2)
3398 return (c1 < c2) ? -1 : 1;
3399
Marc-André Lemburge5034372000-08-08 08:04:29 +00003400 len1--; len2--;
3401 }
3402
3403 return (len1 < len2) ? -1 : (len1 != len2);
3404}
3405
3406#endif
3407
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408int PyUnicode_Compare(PyObject *left,
3409 PyObject *right)
3410{
3411 PyUnicodeObject *u = NULL, *v = NULL;
3412 int result;
3413
3414 /* Coerce the two arguments */
3415 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3416 if (u == NULL)
3417 goto onError;
3418 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3419 if (v == NULL)
3420 goto onError;
3421
Thomas Wouters7e474022000-07-16 12:04:32 +00003422 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423 if (v == u) {
3424 Py_DECREF(u);
3425 Py_DECREF(v);
3426 return 0;
3427 }
3428
3429 result = unicode_compare(u, v);
3430
3431 Py_DECREF(u);
3432 Py_DECREF(v);
3433 return result;
3434
3435onError:
3436 Py_XDECREF(u);
3437 Py_XDECREF(v);
3438 return -1;
3439}
3440
Guido van Rossum403d68b2000-03-13 15:55:09 +00003441int PyUnicode_Contains(PyObject *container,
3442 PyObject *element)
3443{
3444 PyUnicodeObject *u = NULL, *v = NULL;
3445 int result;
3446 register const Py_UNICODE *p, *e;
3447 register Py_UNICODE ch;
3448
3449 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003450 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003451 if (v == NULL) {
3452 PyErr_SetString(PyExc_TypeError,
3453 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003454 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003455 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003456 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3457 if (u == NULL) {
3458 Py_DECREF(v);
3459 goto onError;
3460 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003461
3462 /* Check v in u */
3463 if (PyUnicode_GET_SIZE(v) != 1) {
3464 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003465 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003466 goto onError;
3467 }
3468 ch = *PyUnicode_AS_UNICODE(v);
3469 p = PyUnicode_AS_UNICODE(u);
3470 e = p + PyUnicode_GET_SIZE(u);
3471 result = 0;
3472 while (p < e) {
3473 if (*p++ == ch) {
3474 result = 1;
3475 break;
3476 }
3477 }
3478
3479 Py_DECREF(u);
3480 Py_DECREF(v);
3481 return result;
3482
3483onError:
3484 Py_XDECREF(u);
3485 Py_XDECREF(v);
3486 return -1;
3487}
3488
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489/* Concat to string or Unicode object giving a new Unicode object. */
3490
3491PyObject *PyUnicode_Concat(PyObject *left,
3492 PyObject *right)
3493{
3494 PyUnicodeObject *u = NULL, *v = NULL, *w;
3495
3496 /* Coerce the two arguments */
3497 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3498 if (u == NULL)
3499 goto onError;
3500 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3501 if (v == NULL)
3502 goto onError;
3503
3504 /* Shortcuts */
3505 if (v == unicode_empty) {
3506 Py_DECREF(v);
3507 return (PyObject *)u;
3508 }
3509 if (u == unicode_empty) {
3510 Py_DECREF(u);
3511 return (PyObject *)v;
3512 }
3513
3514 /* Concat the two Unicode strings */
3515 w = _PyUnicode_New(u->length + v->length);
3516 if (w == NULL)
3517 goto onError;
3518 Py_UNICODE_COPY(w->str, u->str, u->length);
3519 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3520
3521 Py_DECREF(u);
3522 Py_DECREF(v);
3523 return (PyObject *)w;
3524
3525onError:
3526 Py_XDECREF(u);
3527 Py_XDECREF(v);
3528 return NULL;
3529}
3530
3531static char count__doc__[] =
3532"S.count(sub[, start[, end]]) -> int\n\
3533\n\
3534Return the number of occurrences of substring sub in Unicode string\n\
3535S[start:end]. Optional arguments start and end are\n\
3536interpreted as in slice notation.";
3537
3538static PyObject *
3539unicode_count(PyUnicodeObject *self, PyObject *args)
3540{
3541 PyUnicodeObject *substring;
3542 int start = 0;
3543 int end = INT_MAX;
3544 PyObject *result;
3545
Guido van Rossumb8872e62000-05-09 14:14:27 +00003546 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3547 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 return NULL;
3549
3550 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3551 (PyObject *)substring);
3552 if (substring == NULL)
3553 return NULL;
3554
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 if (start < 0)
3556 start += self->length;
3557 if (start < 0)
3558 start = 0;
3559 if (end > self->length)
3560 end = self->length;
3561 if (end < 0)
3562 end += self->length;
3563 if (end < 0)
3564 end = 0;
3565
3566 result = PyInt_FromLong((long) count(self, start, end, substring));
3567
3568 Py_DECREF(substring);
3569 return result;
3570}
3571
3572static char encode__doc__[] =
3573"S.encode([encoding[,errors]]) -> string\n\
3574\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003575Return an encoded string version of S. Default encoding is the current\n\
3576default string encoding. errors may be given to set a different error\n\
3577handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3578a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579
3580static PyObject *
3581unicode_encode(PyUnicodeObject *self, PyObject *args)
3582{
3583 char *encoding = NULL;
3584 char *errors = NULL;
3585 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3586 return NULL;
3587 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3588}
3589
3590static char expandtabs__doc__[] =
3591"S.expandtabs([tabsize]) -> unicode\n\
3592\n\
3593Return a copy of S where all tab characters are expanded using spaces.\n\
3594If tabsize is not given, a tab size of 8 characters is assumed.";
3595
3596static PyObject*
3597unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3598{
3599 Py_UNICODE *e;
3600 Py_UNICODE *p;
3601 Py_UNICODE *q;
3602 int i, j;
3603 PyUnicodeObject *u;
3604 int tabsize = 8;
3605
3606 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3607 return NULL;
3608
Thomas Wouters7e474022000-07-16 12:04:32 +00003609 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 i = j = 0;
3611 e = self->str + self->length;
3612 for (p = self->str; p < e; p++)
3613 if (*p == '\t') {
3614 if (tabsize > 0)
3615 j += tabsize - (j % tabsize);
3616 }
3617 else {
3618 j++;
3619 if (*p == '\n' || *p == '\r') {
3620 i += j;
3621 j = 0;
3622 }
3623 }
3624
3625 /* Second pass: create output string and fill it */
3626 u = _PyUnicode_New(i + j);
3627 if (!u)
3628 return NULL;
3629
3630 j = 0;
3631 q = u->str;
3632
3633 for (p = self->str; p < e; p++)
3634 if (*p == '\t') {
3635 if (tabsize > 0) {
3636 i = tabsize - (j % tabsize);
3637 j += i;
3638 while (i--)
3639 *q++ = ' ';
3640 }
3641 }
3642 else {
3643 j++;
3644 *q++ = *p;
3645 if (*p == '\n' || *p == '\r')
3646 j = 0;
3647 }
3648
3649 return (PyObject*) u;
3650}
3651
3652static char find__doc__[] =
3653"S.find(sub [,start [,end]]) -> int\n\
3654\n\
3655Return the lowest index in S where substring sub is found,\n\
3656such that sub is contained within s[start,end]. Optional\n\
3657arguments start and end are interpreted as in slice notation.\n\
3658\n\
3659Return -1 on failure.";
3660
3661static PyObject *
3662unicode_find(PyUnicodeObject *self, PyObject *args)
3663{
3664 PyUnicodeObject *substring;
3665 int start = 0;
3666 int end = INT_MAX;
3667 PyObject *result;
3668
Guido van Rossumb8872e62000-05-09 14:14:27 +00003669 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3670 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 return NULL;
3672 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3673 (PyObject *)substring);
3674 if (substring == NULL)
3675 return NULL;
3676
3677 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3678
3679 Py_DECREF(substring);
3680 return result;
3681}
3682
3683static PyObject *
3684unicode_getitem(PyUnicodeObject *self, int index)
3685{
3686 if (index < 0 || index >= self->length) {
3687 PyErr_SetString(PyExc_IndexError, "string index out of range");
3688 return NULL;
3689 }
3690
3691 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3692}
3693
3694static long
3695unicode_hash(PyUnicodeObject *self)
3696{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003697 /* Since Unicode objects compare equal to their ASCII string
3698 counterparts, they should use the individual character values
3699 as basis for their hash value. This is needed to assure that
3700 strings and Unicode objects behave in the same way as
3701 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702
Fredrik Lundhdde61642000-07-10 18:27:47 +00003703 register int len;
3704 register Py_UNICODE *p;
3705 register long x;
3706
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 if (self->hash != -1)
3708 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003709 len = PyUnicode_GET_SIZE(self);
3710 p = PyUnicode_AS_UNICODE(self);
3711 x = *p << 7;
3712 while (--len >= 0)
3713 x = (1000003*x) ^ *p++;
3714 x ^= PyUnicode_GET_SIZE(self);
3715 if (x == -1)
3716 x = -2;
3717 self->hash = x;
3718 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719}
3720
3721static char index__doc__[] =
3722"S.index(sub [,start [,end]]) -> int\n\
3723\n\
3724Like S.find() but raise ValueError when the substring is not found.";
3725
3726static PyObject *
3727unicode_index(PyUnicodeObject *self, PyObject *args)
3728{
3729 int result;
3730 PyUnicodeObject *substring;
3731 int start = 0;
3732 int end = INT_MAX;
3733
Guido van Rossumb8872e62000-05-09 14:14:27 +00003734 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3735 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736 return NULL;
3737
3738 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3739 (PyObject *)substring);
3740 if (substring == NULL)
3741 return NULL;
3742
3743 result = findstring(self, substring, start, end, 1);
3744
3745 Py_DECREF(substring);
3746 if (result < 0) {
3747 PyErr_SetString(PyExc_ValueError, "substring not found");
3748 return NULL;
3749 }
3750 return PyInt_FromLong(result);
3751}
3752
3753static char islower__doc__[] =
3754"S.islower() -> int\n\
3755\n\
3756Return 1 if all cased characters in S are lowercase and there is\n\
3757at least one cased character in S, 0 otherwise.";
3758
3759static PyObject*
3760unicode_islower(PyUnicodeObject *self, PyObject *args)
3761{
3762 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3763 register const Py_UNICODE *e;
3764 int cased;
3765
3766 if (!PyArg_NoArgs(args))
3767 return NULL;
3768
3769 /* Shortcut for single character strings */
3770 if (PyUnicode_GET_SIZE(self) == 1)
3771 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3772
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003773 /* Special case for empty strings */
3774 if (PyString_GET_SIZE(self) == 0)
3775 return PyInt_FromLong(0);
3776
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 e = p + PyUnicode_GET_SIZE(self);
3778 cased = 0;
3779 for (; p < e; p++) {
3780 register const Py_UNICODE ch = *p;
3781
3782 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3783 return PyInt_FromLong(0);
3784 else if (!cased && Py_UNICODE_ISLOWER(ch))
3785 cased = 1;
3786 }
3787 return PyInt_FromLong(cased);
3788}
3789
3790static char isupper__doc__[] =
3791"S.isupper() -> int\n\
3792\n\
3793Return 1 if all cased characters in S are uppercase and there is\n\
3794at least one cased character in S, 0 otherwise.";
3795
3796static PyObject*
3797unicode_isupper(PyUnicodeObject *self, PyObject *args)
3798{
3799 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3800 register const Py_UNICODE *e;
3801 int cased;
3802
3803 if (!PyArg_NoArgs(args))
3804 return NULL;
3805
3806 /* Shortcut for single character strings */
3807 if (PyUnicode_GET_SIZE(self) == 1)
3808 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3809
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003810 /* Special case for empty strings */
3811 if (PyString_GET_SIZE(self) == 0)
3812 return PyInt_FromLong(0);
3813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 e = p + PyUnicode_GET_SIZE(self);
3815 cased = 0;
3816 for (; p < e; p++) {
3817 register const Py_UNICODE ch = *p;
3818
3819 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3820 return PyInt_FromLong(0);
3821 else if (!cased && Py_UNICODE_ISUPPER(ch))
3822 cased = 1;
3823 }
3824 return PyInt_FromLong(cased);
3825}
3826
3827static char istitle__doc__[] =
3828"S.istitle() -> int\n\
3829\n\
3830Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3831may only follow uncased characters and lowercase characters only cased\n\
3832ones. Return 0 otherwise.";
3833
3834static PyObject*
3835unicode_istitle(PyUnicodeObject *self, PyObject *args)
3836{
3837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3838 register const Py_UNICODE *e;
3839 int cased, previous_is_cased;
3840
3841 if (!PyArg_NoArgs(args))
3842 return NULL;
3843
3844 /* Shortcut for single character strings */
3845 if (PyUnicode_GET_SIZE(self) == 1)
3846 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3847 (Py_UNICODE_ISUPPER(*p) != 0));
3848
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003849 /* Special case for empty strings */
3850 if (PyString_GET_SIZE(self) == 0)
3851 return PyInt_FromLong(0);
3852
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 e = p + PyUnicode_GET_SIZE(self);
3854 cased = 0;
3855 previous_is_cased = 0;
3856 for (; p < e; p++) {
3857 register const Py_UNICODE ch = *p;
3858
3859 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3860 if (previous_is_cased)
3861 return PyInt_FromLong(0);
3862 previous_is_cased = 1;
3863 cased = 1;
3864 }
3865 else if (Py_UNICODE_ISLOWER(ch)) {
3866 if (!previous_is_cased)
3867 return PyInt_FromLong(0);
3868 previous_is_cased = 1;
3869 cased = 1;
3870 }
3871 else
3872 previous_is_cased = 0;
3873 }
3874 return PyInt_FromLong(cased);
3875}
3876
3877static char isspace__doc__[] =
3878"S.isspace() -> int\n\
3879\n\
3880Return 1 if there are only whitespace characters in S,\n\
38810 otherwise.";
3882
3883static PyObject*
3884unicode_isspace(PyUnicodeObject *self, PyObject *args)
3885{
3886 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3887 register const Py_UNICODE *e;
3888
3889 if (!PyArg_NoArgs(args))
3890 return NULL;
3891
3892 /* Shortcut for single character strings */
3893 if (PyUnicode_GET_SIZE(self) == 1 &&
3894 Py_UNICODE_ISSPACE(*p))
3895 return PyInt_FromLong(1);
3896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003897 /* Special case for empty strings */
3898 if (PyString_GET_SIZE(self) == 0)
3899 return PyInt_FromLong(0);
3900
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 e = p + PyUnicode_GET_SIZE(self);
3902 for (; p < e; p++) {
3903 if (!Py_UNICODE_ISSPACE(*p))
3904 return PyInt_FromLong(0);
3905 }
3906 return PyInt_FromLong(1);
3907}
3908
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003909static char isalpha__doc__[] =
3910"S.isalpha() -> int\n\
3911\n\
3912Return 1 if all characters in S are alphabetic\n\
3913and there is at least one character in S, 0 otherwise.";
3914
3915static PyObject*
3916unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3917{
3918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3919 register const Py_UNICODE *e;
3920
3921 if (!PyArg_NoArgs(args))
3922 return NULL;
3923
3924 /* Shortcut for single character strings */
3925 if (PyUnicode_GET_SIZE(self) == 1 &&
3926 Py_UNICODE_ISALPHA(*p))
3927 return PyInt_FromLong(1);
3928
3929 /* Special case for empty strings */
3930 if (PyString_GET_SIZE(self) == 0)
3931 return PyInt_FromLong(0);
3932
3933 e = p + PyUnicode_GET_SIZE(self);
3934 for (; p < e; p++) {
3935 if (!Py_UNICODE_ISALPHA(*p))
3936 return PyInt_FromLong(0);
3937 }
3938 return PyInt_FromLong(1);
3939}
3940
3941static char isalnum__doc__[] =
3942"S.isalnum() -> int\n\
3943\n\
3944Return 1 if all characters in S are alphanumeric\n\
3945and there is at least one character in S, 0 otherwise.";
3946
3947static PyObject*
3948unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3949{
3950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3951 register const Py_UNICODE *e;
3952
3953 if (!PyArg_NoArgs(args))
3954 return NULL;
3955
3956 /* Shortcut for single character strings */
3957 if (PyUnicode_GET_SIZE(self) == 1 &&
3958 Py_UNICODE_ISALNUM(*p))
3959 return PyInt_FromLong(1);
3960
3961 /* Special case for empty strings */
3962 if (PyString_GET_SIZE(self) == 0)
3963 return PyInt_FromLong(0);
3964
3965 e = p + PyUnicode_GET_SIZE(self);
3966 for (; p < e; p++) {
3967 if (!Py_UNICODE_ISALNUM(*p))
3968 return PyInt_FromLong(0);
3969 }
3970 return PyInt_FromLong(1);
3971}
3972
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973static char isdecimal__doc__[] =
3974"S.isdecimal() -> int\n\
3975\n\
3976Return 1 if there are only decimal characters in S,\n\
39770 otherwise.";
3978
3979static PyObject*
3980unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3981{
3982 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3983 register const Py_UNICODE *e;
3984
3985 if (!PyArg_NoArgs(args))
3986 return NULL;
3987
3988 /* Shortcut for single character strings */
3989 if (PyUnicode_GET_SIZE(self) == 1 &&
3990 Py_UNICODE_ISDECIMAL(*p))
3991 return PyInt_FromLong(1);
3992
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003993 /* Special case for empty strings */
3994 if (PyString_GET_SIZE(self) == 0)
3995 return PyInt_FromLong(0);
3996
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 e = p + PyUnicode_GET_SIZE(self);
3998 for (; p < e; p++) {
3999 if (!Py_UNICODE_ISDECIMAL(*p))
4000 return PyInt_FromLong(0);
4001 }
4002 return PyInt_FromLong(1);
4003}
4004
4005static char isdigit__doc__[] =
4006"S.isdigit() -> int\n\
4007\n\
4008Return 1 if there are only digit characters in S,\n\
40090 otherwise.";
4010
4011static PyObject*
4012unicode_isdigit(PyUnicodeObject *self, PyObject *args)
4013{
4014 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4015 register const Py_UNICODE *e;
4016
4017 if (!PyArg_NoArgs(args))
4018 return NULL;
4019
4020 /* Shortcut for single character strings */
4021 if (PyUnicode_GET_SIZE(self) == 1 &&
4022 Py_UNICODE_ISDIGIT(*p))
4023 return PyInt_FromLong(1);
4024
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004025 /* Special case for empty strings */
4026 if (PyString_GET_SIZE(self) == 0)
4027 return PyInt_FromLong(0);
4028
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 e = p + PyUnicode_GET_SIZE(self);
4030 for (; p < e; p++) {
4031 if (!Py_UNICODE_ISDIGIT(*p))
4032 return PyInt_FromLong(0);
4033 }
4034 return PyInt_FromLong(1);
4035}
4036
4037static char isnumeric__doc__[] =
4038"S.isnumeric() -> int\n\
4039\n\
4040Return 1 if there are only numeric characters in S,\n\
40410 otherwise.";
4042
4043static PyObject*
4044unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4045{
4046 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4047 register const Py_UNICODE *e;
4048
4049 if (!PyArg_NoArgs(args))
4050 return NULL;
4051
4052 /* Shortcut for single character strings */
4053 if (PyUnicode_GET_SIZE(self) == 1 &&
4054 Py_UNICODE_ISNUMERIC(*p))
4055 return PyInt_FromLong(1);
4056
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004057 /* Special case for empty strings */
4058 if (PyString_GET_SIZE(self) == 0)
4059 return PyInt_FromLong(0);
4060
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 e = p + PyUnicode_GET_SIZE(self);
4062 for (; p < e; p++) {
4063 if (!Py_UNICODE_ISNUMERIC(*p))
4064 return PyInt_FromLong(0);
4065 }
4066 return PyInt_FromLong(1);
4067}
4068
4069static char join__doc__[] =
4070"S.join(sequence) -> unicode\n\
4071\n\
4072Return a string which is the concatenation of the strings in the\n\
4073sequence. The separator between elements is S.";
4074
4075static PyObject*
4076unicode_join(PyUnicodeObject *self, PyObject *args)
4077{
4078 PyObject *data;
4079 if (!PyArg_ParseTuple(args, "O:join", &data))
4080 return NULL;
4081
4082 return PyUnicode_Join((PyObject *)self, data);
4083}
4084
4085static int
4086unicode_length(PyUnicodeObject *self)
4087{
4088 return self->length;
4089}
4090
4091static char ljust__doc__[] =
4092"S.ljust(width) -> unicode\n\
4093\n\
4094Return S left justified in a Unicode string of length width. Padding is\n\
4095done using spaces.";
4096
4097static PyObject *
4098unicode_ljust(PyUnicodeObject *self, PyObject *args)
4099{
4100 int width;
4101 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4102 return NULL;
4103
4104 if (self->length >= width) {
4105 Py_INCREF(self);
4106 return (PyObject*) self;
4107 }
4108
4109 return (PyObject*) pad(self, 0, width - self->length, ' ');
4110}
4111
4112static char lower__doc__[] =
4113"S.lower() -> unicode\n\
4114\n\
4115Return a copy of the string S converted to lowercase.";
4116
4117static PyObject*
4118unicode_lower(PyUnicodeObject *self, PyObject *args)
4119{
4120 if (!PyArg_NoArgs(args))
4121 return NULL;
4122 return fixup(self, fixlower);
4123}
4124
4125static char lstrip__doc__[] =
4126"S.lstrip() -> unicode\n\
4127\n\
4128Return a copy of the string S with leading whitespace removed.";
4129
4130static PyObject *
4131unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4132{
4133 if (!PyArg_NoArgs(args))
4134 return NULL;
4135 return strip(self, 1, 0);
4136}
4137
4138static PyObject*
4139unicode_repeat(PyUnicodeObject *str, int len)
4140{
4141 PyUnicodeObject *u;
4142 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004143 int nchars;
4144 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145
4146 if (len < 0)
4147 len = 0;
4148
4149 if (len == 1) {
4150 /* no repeat, return original string */
4151 Py_INCREF(str);
4152 return (PyObject*) str;
4153 }
Tim Peters8f422462000-09-09 06:13:41 +00004154
4155 /* ensure # of chars needed doesn't overflow int and # of bytes
4156 * needed doesn't overflow size_t
4157 */
4158 nchars = len * str->length;
4159 if (len && nchars / len != str->length) {
4160 PyErr_SetString(PyExc_OverflowError,
4161 "repeated string is too long");
4162 return NULL;
4163 }
4164 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4165 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4166 PyErr_SetString(PyExc_OverflowError,
4167 "repeated string is too long");
4168 return NULL;
4169 }
4170 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 if (!u)
4172 return NULL;
4173
4174 p = u->str;
4175
4176 while (len-- > 0) {
4177 Py_UNICODE_COPY(p, str->str, str->length);
4178 p += str->length;
4179 }
4180
4181 return (PyObject*) u;
4182}
4183
4184PyObject *PyUnicode_Replace(PyObject *obj,
4185 PyObject *subobj,
4186 PyObject *replobj,
4187 int maxcount)
4188{
4189 PyObject *self;
4190 PyObject *str1;
4191 PyObject *str2;
4192 PyObject *result;
4193
4194 self = PyUnicode_FromObject(obj);
4195 if (self == NULL)
4196 return NULL;
4197 str1 = PyUnicode_FromObject(subobj);
4198 if (str1 == NULL) {
4199 Py_DECREF(self);
4200 return NULL;
4201 }
4202 str2 = PyUnicode_FromObject(replobj);
4203 if (str2 == NULL) {
4204 Py_DECREF(self);
4205 Py_DECREF(str1);
4206 return NULL;
4207 }
4208 result = replace((PyUnicodeObject *)self,
4209 (PyUnicodeObject *)str1,
4210 (PyUnicodeObject *)str2,
4211 maxcount);
4212 Py_DECREF(self);
4213 Py_DECREF(str1);
4214 Py_DECREF(str2);
4215 return result;
4216}
4217
4218static char replace__doc__[] =
4219"S.replace (old, new[, maxsplit]) -> unicode\n\
4220\n\
4221Return a copy of S with all occurrences of substring\n\
4222old replaced by new. If the optional argument maxsplit is\n\
4223given, only the first maxsplit occurrences are replaced.";
4224
4225static PyObject*
4226unicode_replace(PyUnicodeObject *self, PyObject *args)
4227{
4228 PyUnicodeObject *str1;
4229 PyUnicodeObject *str2;
4230 int maxcount = -1;
4231 PyObject *result;
4232
4233 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4234 return NULL;
4235 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4236 if (str1 == NULL)
4237 return NULL;
4238 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4239 if (str2 == NULL)
4240 return NULL;
4241
4242 result = replace(self, str1, str2, maxcount);
4243
4244 Py_DECREF(str1);
4245 Py_DECREF(str2);
4246 return result;
4247}
4248
4249static
4250PyObject *unicode_repr(PyObject *unicode)
4251{
4252 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4253 PyUnicode_GET_SIZE(unicode),
4254 1);
4255}
4256
4257static char rfind__doc__[] =
4258"S.rfind(sub [,start [,end]]) -> int\n\
4259\n\
4260Return the highest index in S where substring sub is found,\n\
4261such that sub is contained within s[start,end]. Optional\n\
4262arguments start and end are interpreted as in slice notation.\n\
4263\n\
4264Return -1 on failure.";
4265
4266static PyObject *
4267unicode_rfind(PyUnicodeObject *self, PyObject *args)
4268{
4269 PyUnicodeObject *substring;
4270 int start = 0;
4271 int end = INT_MAX;
4272 PyObject *result;
4273
Guido van Rossumb8872e62000-05-09 14:14:27 +00004274 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4275 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 return NULL;
4277 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4278 (PyObject *)substring);
4279 if (substring == NULL)
4280 return NULL;
4281
4282 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4283
4284 Py_DECREF(substring);
4285 return result;
4286}
4287
4288static char rindex__doc__[] =
4289"S.rindex(sub [,start [,end]]) -> int\n\
4290\n\
4291Like S.rfind() but raise ValueError when the substring is not found.";
4292
4293static PyObject *
4294unicode_rindex(PyUnicodeObject *self, PyObject *args)
4295{
4296 int result;
4297 PyUnicodeObject *substring;
4298 int start = 0;
4299 int end = INT_MAX;
4300
Guido van Rossumb8872e62000-05-09 14:14:27 +00004301 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4302 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 return NULL;
4304 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4305 (PyObject *)substring);
4306 if (substring == NULL)
4307 return NULL;
4308
4309 result = findstring(self, substring, start, end, -1);
4310
4311 Py_DECREF(substring);
4312 if (result < 0) {
4313 PyErr_SetString(PyExc_ValueError, "substring not found");
4314 return NULL;
4315 }
4316 return PyInt_FromLong(result);
4317}
4318
4319static char rjust__doc__[] =
4320"S.rjust(width) -> unicode\n\
4321\n\
4322Return S right justified in a Unicode string of length width. Padding is\n\
4323done using spaces.";
4324
4325static PyObject *
4326unicode_rjust(PyUnicodeObject *self, PyObject *args)
4327{
4328 int width;
4329 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4330 return NULL;
4331
4332 if (self->length >= width) {
4333 Py_INCREF(self);
4334 return (PyObject*) self;
4335 }
4336
4337 return (PyObject*) pad(self, width - self->length, 0, ' ');
4338}
4339
4340static char rstrip__doc__[] =
4341"S.rstrip() -> unicode\n\
4342\n\
4343Return a copy of the string S with trailing whitespace removed.";
4344
4345static PyObject *
4346unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4347{
4348 if (!PyArg_NoArgs(args))
4349 return NULL;
4350 return strip(self, 0, 1);
4351}
4352
4353static PyObject*
4354unicode_slice(PyUnicodeObject *self, int start, int end)
4355{
4356 /* standard clamping */
4357 if (start < 0)
4358 start = 0;
4359 if (end < 0)
4360 end = 0;
4361 if (end > self->length)
4362 end = self->length;
4363 if (start == 0 && end == self->length) {
4364 /* full slice, return original string */
4365 Py_INCREF(self);
4366 return (PyObject*) self;
4367 }
4368 if (start > end)
4369 start = end;
4370 /* copy slice */
4371 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4372 end - start);
4373}
4374
4375PyObject *PyUnicode_Split(PyObject *s,
4376 PyObject *sep,
4377 int maxsplit)
4378{
4379 PyObject *result;
4380
4381 s = PyUnicode_FromObject(s);
4382 if (s == NULL)
4383 return NULL;
4384 if (sep != NULL) {
4385 sep = PyUnicode_FromObject(sep);
4386 if (sep == NULL) {
4387 Py_DECREF(s);
4388 return NULL;
4389 }
4390 }
4391
4392 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4393
4394 Py_DECREF(s);
4395 Py_XDECREF(sep);
4396 return result;
4397}
4398
4399static char split__doc__[] =
4400"S.split([sep [,maxsplit]]) -> list of strings\n\
4401\n\
4402Return a list of the words in S, using sep as the\n\
4403delimiter string. If maxsplit is given, at most maxsplit\n\
4404splits are done. If sep is not specified, any whitespace string\n\
4405is a separator.";
4406
4407static PyObject*
4408unicode_split(PyUnicodeObject *self, PyObject *args)
4409{
4410 PyObject *substring = Py_None;
4411 int maxcount = -1;
4412
4413 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4414 return NULL;
4415
4416 if (substring == Py_None)
4417 return split(self, NULL, maxcount);
4418 else if (PyUnicode_Check(substring))
4419 return split(self, (PyUnicodeObject *)substring, maxcount);
4420 else
4421 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4422}
4423
4424static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004425"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426\n\
4427Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004428Line breaks are not included in the resulting list unless keepends\n\
4429is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430
4431static PyObject*
4432unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4433{
Guido van Rossum86662912000-04-11 15:38:46 +00004434 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435
Guido van Rossum86662912000-04-11 15:38:46 +00004436 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 return NULL;
4438
Guido van Rossum86662912000-04-11 15:38:46 +00004439 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440}
4441
4442static
4443PyObject *unicode_str(PyUnicodeObject *self)
4444{
Fred Drakee4315f52000-05-09 19:53:39 +00004445 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446}
4447
4448static char strip__doc__[] =
4449"S.strip() -> unicode\n\
4450\n\
4451Return a copy of S with leading and trailing whitespace removed.";
4452
4453static PyObject *
4454unicode_strip(PyUnicodeObject *self, PyObject *args)
4455{
4456 if (!PyArg_NoArgs(args))
4457 return NULL;
4458 return strip(self, 1, 1);
4459}
4460
4461static char swapcase__doc__[] =
4462"S.swapcase() -> unicode\n\
4463\n\
4464Return a copy of S with uppercase characters converted to lowercase\n\
4465and vice versa.";
4466
4467static PyObject*
4468unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4469{
4470 if (!PyArg_NoArgs(args))
4471 return NULL;
4472 return fixup(self, fixswapcase);
4473}
4474
4475static char translate__doc__[] =
4476"S.translate(table) -> unicode\n\
4477\n\
4478Return a copy of the string S, where all characters have been mapped\n\
4479through the given translation table, which must be a mapping of\n\
4480Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4481are left untouched. Characters mapped to None are deleted.";
4482
4483static PyObject*
4484unicode_translate(PyUnicodeObject *self, PyObject *args)
4485{
4486 PyObject *table;
4487
4488 if (!PyArg_ParseTuple(args, "O:translate", &table))
4489 return NULL;
4490 return PyUnicode_TranslateCharmap(self->str,
4491 self->length,
4492 table,
4493 "ignore");
4494}
4495
4496static char upper__doc__[] =
4497"S.upper() -> unicode\n\
4498\n\
4499Return a copy of S converted to uppercase.";
4500
4501static PyObject*
4502unicode_upper(PyUnicodeObject *self, PyObject *args)
4503{
4504 if (!PyArg_NoArgs(args))
4505 return NULL;
4506 return fixup(self, fixupper);
4507}
4508
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implements S.zfill(width) (currently compiled out).
 *
 * When width exceeds the current length, self is left-padded with '0' via
 * pad(); a leading '+' or '-' of the original text is then moved in front
 * of the padding.  Otherwise self is returned with an extra reference.
 * Returns NULL when argument parsing fails or pad() returns NULL.
 */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* Do not touch u->str unless pad() actually produced an object. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4544
#if 0
/* Debugging aid (compiled out): reports unicode_freelist_size as a
 * Python int after checking that no arguments were passed.
 */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4554
4555static char startswith__doc__[] =
4556"S.startswith(prefix[, start[, end]]) -> int\n\
4557\n\
4558Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4559optional start, test S beginning at that position. With optional end, stop\n\
4560comparing S at that position.";
4561
4562static PyObject *
4563unicode_startswith(PyUnicodeObject *self,
4564 PyObject *args)
4565{
4566 PyUnicodeObject *substring;
4567 int start = 0;
4568 int end = INT_MAX;
4569 PyObject *result;
4570
Guido van Rossumb8872e62000-05-09 14:14:27 +00004571 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4572 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573 return NULL;
4574 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4575 (PyObject *)substring);
4576 if (substring == NULL)
4577 return NULL;
4578
4579 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4580
4581 Py_DECREF(substring);
4582 return result;
4583}
4584
4585
4586static char endswith__doc__[] =
4587"S.endswith(suffix[, start[, end]]) -> int\n\
4588\n\
4589Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4590optional start, test S beginning at that position. With optional end, stop\n\
4591comparing S at that position.";
4592
4593static PyObject *
4594unicode_endswith(PyUnicodeObject *self,
4595 PyObject *args)
4596{
4597 PyUnicodeObject *substring;
4598 int start = 0;
4599 int end = INT_MAX;
4600 PyObject *result;
4601
Guido van Rossumb8872e62000-05-09 14:14:27 +00004602 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4603 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 return NULL;
4605 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4606 (PyObject *)substring);
4607 if (substring == NULL)
4608 return NULL;
4609
4610 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4611
4612 Py_DECREF(substring);
4613 return result;
4614}
4615
4616
4617static PyMethodDef unicode_methods[] = {
4618
4619 /* Order is according to common usage: often used methods should
4620 appear first, since lookup is done sequentially. */
4621
4622 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4623 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4624 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4625 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4626 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4627 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4628 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4629 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4630 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4631 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4632 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4633 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4634 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4635 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4636/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4637 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4638 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4639 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4640 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4641 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4642 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4643 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4644 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4645 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4646 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4647 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4648 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4649 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4650 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4651 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4652 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4653 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4654 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004655 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4656 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657#if 0
4658 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4659 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4660#endif
4661
4662#if 0
4663 /* This one is just used for debugging the implementation. */
4664 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4665#endif
4666
4667 {NULL, NULL}
4668};
4669
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670static PySequenceMethods unicode_as_sequence = {
4671 (inquiry) unicode_length, /* sq_length */
4672 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4673 (intargfunc) unicode_repeat, /* sq_repeat */
4674 (intargfunc) unicode_getitem, /* sq_item */
4675 (intintargfunc) unicode_slice, /* sq_slice */
4676 0, /* sq_ass_item */
4677 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004678 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679};
4680
4681static int
4682unicode_buffer_getreadbuf(PyUnicodeObject *self,
4683 int index,
4684 const void **ptr)
4685{
4686 if (index != 0) {
4687 PyErr_SetString(PyExc_SystemError,
4688 "accessing non-existent unicode segment");
4689 return -1;
4690 }
4691 *ptr = (void *) self->str;
4692 return PyUnicode_GET_DATA_SIZE(self);
4693}
4694
4695static int
4696unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4697 const void **ptr)
4698{
4699 PyErr_SetString(PyExc_TypeError,
4700 "cannot use unicode as modifyable buffer");
4701 return -1;
4702}
4703
4704static int
4705unicode_buffer_getsegcount(PyUnicodeObject *self,
4706 int *lenp)
4707{
4708 if (lenp)
4709 *lenp = PyUnicode_GET_DATA_SIZE(self);
4710 return 1;
4711}
4712
4713static int
4714unicode_buffer_getcharbuf(PyUnicodeObject *self,
4715 int index,
4716 const void **ptr)
4717{
4718 PyObject *str;
4719
4720 if (index != 0) {
4721 PyErr_SetString(PyExc_SystemError,
4722 "accessing non-existent unicode segment");
4723 return -1;
4724 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004725 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 if (str == NULL)
4727 return -1;
4728 *ptr = (void *) PyString_AS_STRING(str);
4729 return PyString_GET_SIZE(str);
4730}
4731
4732/* Helpers for PyUnicode_Format() */
4733
4734static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004735getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736{
4737 int argidx = *p_argidx;
4738 if (argidx < arglen) {
4739 (*p_argidx)++;
4740 if (arglen < 0)
4741 return args;
4742 else
4743 return PyTuple_GetItem(args, argidx);
4744 }
4745 PyErr_SetString(PyExc_TypeError,
4746 "not enough arguments for format string");
4747 return NULL;
4748}
4749
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST 0x01    /* set by '-' */
#define F_SIGN  0x02    /* set by '+' */
#define F_BLANK 0x04    /* set by ' ' */
#define F_ALT   0x08    /* set by '#' */
#define F_ZERO  0x10    /* presumably set by '0' -- TODO confirm */
4755
4756static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758{
4759 register int i;
4760 int len;
4761 va_list va;
4762 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764
4765 /* First, format the string as char array, then expand to Py_UNICODE
4766 array. */
4767 charbuffer = (char *)buffer;
4768 len = vsprintf(charbuffer, format, va);
4769 for (i = len - 1; i >= 0; i--)
4770 buffer[i] = (Py_UNICODE) charbuffer[i];
4771
4772 va_end(va);
4773 return len;
4774}
4775
4776static int
4777formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004778 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 int flags,
4780 int prec,
4781 int type,
4782 PyObject *v)
4783{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004784 /* fmt = '%#.' + `prec` + `type`
4785 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 char fmt[20];
4787 double x;
4788
4789 x = PyFloat_AsDouble(v);
4790 if (x == -1.0 && PyErr_Occurred())
4791 return -1;
4792 if (prec < 0)
4793 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4795 type = 'g';
4796 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004797 /* worst case length calc to ensure no buffer overrun:
4798 fmt = %#.<prec>g
4799 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4800 for any double rep.)
4801 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4802 If prec=0 the effective precision is 1 (the leading digit is
4803 always given), therefore increase by one to 10+prec. */
4804 if (buflen <= (size_t)10 + (size_t)prec) {
4805 PyErr_SetString(PyExc_OverflowError,
4806 "formatted float is too long (precision too long?)");
4807 return -1;
4808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 return usprintf(buf, fmt, x);
4810}
4811
Tim Peters38fd5b62000-09-21 05:43:11 +00004812static PyObject*
4813formatlong(PyObject *val, int flags, int prec, int type)
4814{
4815 char *buf;
4816 int i, len;
4817 PyObject *str; /* temporary string object. */
4818 PyUnicodeObject *result;
4819
4820 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4821 if (!str)
4822 return NULL;
4823 result = _PyUnicode_New(len);
4824 for (i = 0; i < len; i++)
4825 result->str[i] = buf[i];
4826 result->str[len] = 0;
4827 Py_DECREF(str);
4828 return (PyObject*)result;
4829}
4830
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831static int
4832formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004833 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 int flags,
4835 int prec,
4836 int type,
4837 PyObject *v)
4838{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004839 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004840 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4841 + 1 + 1 = 24*/
4842 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004844 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845
4846 x = PyInt_AsLong(v);
4847 if (x == -1 && PyErr_Occurred())
4848 return -1;
4849 if (prec < 0)
4850 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004851 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4852 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4853 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4854 PyErr_SetString(PyExc_OverflowError,
4855 "formatted integer is too long (precision too long?)");
4856 return -1;
4857 }
Tim Petersfff53252001-04-12 18:38:48 +00004858 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4859 * but we want it (for consistency with other %#x conversions, and
4860 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004861 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4862 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4863 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004864 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004865 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4866 /* Only way to know what the platform does is to try it. */
4867 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4868 if (fmt[1] != (char)type) {
4869 /* Supply our own leading 0x/0X -- needed under std C */
4870 use_native_c_format = 0;
4871 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4872 }
4873 }
4874 if (use_native_c_format)
4875 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 return usprintf(buf, fmt, x);
4877}
4878
4879static int
4880formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004881 size_t buflen,
4882 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004884 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004885 if (PyUnicode_Check(v)) {
4886 if (PyUnicode_GET_SIZE(v) != 1)
4887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004891 else if (PyString_Check(v)) {
4892 if (PyString_GET_SIZE(v) != 1)
4893 goto onError;
4894 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896
4897 else {
4898 /* Integer input truncated to a character */
4899 long x;
4900 x = PyInt_AsLong(v);
4901 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004902 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 buf[0] = (char) x;
4904 }
4905 buf[1] = '\0';
4906 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004907
4908 onError:
4909 PyErr_SetString(PyExc_TypeError,
4910 "%c requires int or char");
4911 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912}
4913
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it stays a single operand
   wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
4923
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924PyObject *PyUnicode_Format(PyObject *format,
4925 PyObject *args)
4926{
4927 Py_UNICODE *fmt, *res;
4928 int fmtcnt, rescnt, reslen, arglen, argidx;
4929 int args_owned = 0;
4930 PyUnicodeObject *result = NULL;
4931 PyObject *dict = NULL;
4932 PyObject *uformat;
4933
4934 if (format == NULL || args == NULL) {
4935 PyErr_BadInternalCall();
4936 return NULL;
4937 }
4938 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004939 if (uformat == NULL)
4940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 fmt = PyUnicode_AS_UNICODE(uformat);
4942 fmtcnt = PyUnicode_GET_SIZE(uformat);
4943
4944 reslen = rescnt = fmtcnt + 100;
4945 result = _PyUnicode_New(reslen);
4946 if (result == NULL)
4947 goto onError;
4948 res = PyUnicode_AS_UNICODE(result);
4949
4950 if (PyTuple_Check(args)) {
4951 arglen = PyTuple_Size(args);
4952 argidx = 0;
4953 }
4954 else {
4955 arglen = -1;
4956 argidx = -2;
4957 }
4958 if (args->ob_type->tp_as_mapping)
4959 dict = args;
4960
4961 while (--fmtcnt >= 0) {
4962 if (*fmt != '%') {
4963 if (--rescnt < 0) {
4964 rescnt = fmtcnt + 100;
4965 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004966 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 return NULL;
4968 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4969 --rescnt;
4970 }
4971 *res++ = *fmt++;
4972 }
4973 else {
4974 /* Got a format specifier */
4975 int flags = 0;
4976 int width = -1;
4977 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 Py_UNICODE c = '\0';
4979 Py_UNICODE fill;
4980 PyObject *v = NULL;
4981 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004982 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 Py_UNICODE sign;
4984 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004985 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986
4987 fmt++;
4988 if (*fmt == '(') {
4989 Py_UNICODE *keystart;
4990 int keylen;
4991 PyObject *key;
4992 int pcount = 1;
4993
4994 if (dict == NULL) {
4995 PyErr_SetString(PyExc_TypeError,
4996 "format requires a mapping");
4997 goto onError;
4998 }
4999 ++fmt;
5000 --fmtcnt;
5001 keystart = fmt;
5002 /* Skip over balanced parentheses */
5003 while (pcount > 0 && --fmtcnt >= 0) {
5004 if (*fmt == ')')
5005 --pcount;
5006 else if (*fmt == '(')
5007 ++pcount;
5008 fmt++;
5009 }
5010 keylen = fmt - keystart - 1;
5011 if (fmtcnt < 0 || pcount > 0) {
5012 PyErr_SetString(PyExc_ValueError,
5013 "incomplete format key");
5014 goto onError;
5015 }
Fred Drakee4315f52000-05-09 19:53:39 +00005016 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 then looked up since Python uses strings to hold
5018 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005019 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 key = PyUnicode_EncodeUTF8(keystart,
5021 keylen,
5022 NULL);
5023 if (key == NULL)
5024 goto onError;
5025 if (args_owned) {
5026 Py_DECREF(args);
5027 args_owned = 0;
5028 }
5029 args = PyObject_GetItem(dict, key);
5030 Py_DECREF(key);
5031 if (args == NULL) {
5032 goto onError;
5033 }
5034 args_owned = 1;
5035 arglen = -1;
5036 argidx = -2;
5037 }
5038 while (--fmtcnt >= 0) {
5039 switch (c = *fmt++) {
5040 case '-': flags |= F_LJUST; continue;
5041 case '+': flags |= F_SIGN; continue;
5042 case ' ': flags |= F_BLANK; continue;
5043 case '#': flags |= F_ALT; continue;
5044 case '0': flags |= F_ZERO; continue;
5045 }
5046 break;
5047 }
5048 if (c == '*') {
5049 v = getnextarg(args, arglen, &argidx);
5050 if (v == NULL)
5051 goto onError;
5052 if (!PyInt_Check(v)) {
5053 PyErr_SetString(PyExc_TypeError,
5054 "* wants int");
5055 goto onError;
5056 }
5057 width = PyInt_AsLong(v);
5058 if (width < 0) {
5059 flags |= F_LJUST;
5060 width = -width;
5061 }
5062 if (--fmtcnt >= 0)
5063 c = *fmt++;
5064 }
5065 else if (c >= '0' && c <= '9') {
5066 width = c - '0';
5067 while (--fmtcnt >= 0) {
5068 c = *fmt++;
5069 if (c < '0' || c > '9')
5070 break;
5071 if ((width*10) / 10 != width) {
5072 PyErr_SetString(PyExc_ValueError,
5073 "width too big");
5074 goto onError;
5075 }
5076 width = width*10 + (c - '0');
5077 }
5078 }
5079 if (c == '.') {
5080 prec = 0;
5081 if (--fmtcnt >= 0)
5082 c = *fmt++;
5083 if (c == '*') {
5084 v = getnextarg(args, arglen, &argidx);
5085 if (v == NULL)
5086 goto onError;
5087 if (!PyInt_Check(v)) {
5088 PyErr_SetString(PyExc_TypeError,
5089 "* wants int");
5090 goto onError;
5091 }
5092 prec = PyInt_AsLong(v);
5093 if (prec < 0)
5094 prec = 0;
5095 if (--fmtcnt >= 0)
5096 c = *fmt++;
5097 }
5098 else if (c >= '0' && c <= '9') {
5099 prec = c - '0';
5100 while (--fmtcnt >= 0) {
5101 c = Py_CHARMASK(*fmt++);
5102 if (c < '0' || c > '9')
5103 break;
5104 if ((prec*10) / 10 != prec) {
5105 PyErr_SetString(PyExc_ValueError,
5106 "prec too big");
5107 goto onError;
5108 }
5109 prec = prec*10 + (c - '0');
5110 }
5111 }
5112 } /* prec */
5113 if (fmtcnt >= 0) {
5114 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 if (--fmtcnt >= 0)
5116 c = *fmt++;
5117 }
5118 }
5119 if (fmtcnt < 0) {
5120 PyErr_SetString(PyExc_ValueError,
5121 "incomplete format");
5122 goto onError;
5123 }
5124 if (c != '%') {
5125 v = getnextarg(args, arglen, &argidx);
5126 if (v == NULL)
5127 goto onError;
5128 }
5129 sign = 0;
5130 fill = ' ';
5131 switch (c) {
5132
5133 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005134 pbuf = formatbuf;
5135 /* presume that buffer length is at least 1 */
5136 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 len = 1;
5138 break;
5139
5140 case 's':
5141 case 'r':
5142 if (PyUnicode_Check(v) && c == 's') {
5143 temp = v;
5144 Py_INCREF(temp);
5145 }
5146 else {
5147 PyObject *unicode;
5148 if (c == 's')
5149 temp = PyObject_Str(v);
5150 else
5151 temp = PyObject_Repr(v);
5152 if (temp == NULL)
5153 goto onError;
5154 if (!PyString_Check(temp)) {
5155 /* XXX Note: this should never happen, since
5156 PyObject_Repr() and PyObject_Str() assure
5157 this */
5158 Py_DECREF(temp);
5159 PyErr_SetString(PyExc_TypeError,
5160 "%s argument has non-string str()");
5161 goto onError;
5162 }
Fred Drakee4315f52000-05-09 19:53:39 +00005163 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005165 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 "strict");
5167 Py_DECREF(temp);
5168 temp = unicode;
5169 if (temp == NULL)
5170 goto onError;
5171 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005172 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 len = PyUnicode_GET_SIZE(temp);
5174 if (prec >= 0 && len > prec)
5175 len = prec;
5176 break;
5177
5178 case 'i':
5179 case 'd':
5180 case 'u':
5181 case 'o':
5182 case 'x':
5183 case 'X':
5184 if (c == 'i')
5185 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005186 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005187 temp = formatlong(v, flags, prec, c);
5188 if (!temp)
5189 goto onError;
5190 pbuf = PyUnicode_AS_UNICODE(temp);
5191 len = PyUnicode_GET_SIZE(temp);
5192 /* unbounded ints can always produce
5193 a sign character! */
5194 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005196 else {
5197 pbuf = formatbuf;
5198 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5199 flags, prec, c, v);
5200 if (len < 0)
5201 goto onError;
5202 /* only d conversion is signed */
5203 sign = c == 'd';
5204 }
5205 if (flags & F_ZERO)
5206 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 break;
5208
5209 case 'e':
5210 case 'E':
5211 case 'f':
5212 case 'g':
5213 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005214 pbuf = formatbuf;
5215 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5216 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 if (len < 0)
5218 goto onError;
5219 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005220 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 fill = '0';
5222 break;
5223
5224 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005225 pbuf = formatbuf;
5226 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 if (len < 0)
5228 goto onError;
5229 break;
5230
5231 default:
5232 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005233 "unsupported format character '%c' (0x%x) "
5234 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005235 (31<=c && c<=126) ? c : '?',
5236 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 goto onError;
5238 }
5239 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005240 if (*pbuf == '-' || *pbuf == '+') {
5241 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 len--;
5243 }
5244 else if (flags & F_SIGN)
5245 sign = '+';
5246 else if (flags & F_BLANK)
5247 sign = ' ';
5248 else
5249 sign = 0;
5250 }
5251 if (width < len)
5252 width = len;
5253 if (rescnt < width + (sign != 0)) {
5254 reslen -= rescnt;
5255 rescnt = width + fmtcnt + 100;
5256 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005257 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 return NULL;
5259 res = PyUnicode_AS_UNICODE(result)
5260 + reslen - rescnt;
5261 }
5262 if (sign) {
5263 if (fill != ' ')
5264 *res++ = sign;
5265 rescnt--;
5266 if (width > len)
5267 width--;
5268 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005269 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5270 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005271 assert(pbuf[1] == c);
5272 if (fill != ' ') {
5273 *res++ = *pbuf++;
5274 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005275 }
Tim Petersfff53252001-04-12 18:38:48 +00005276 rescnt -= 2;
5277 width -= 2;
5278 if (width < 0)
5279 width = 0;
5280 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005281 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 if (width > len && !(flags & F_LJUST)) {
5283 do {
5284 --rescnt;
5285 *res++ = fill;
5286 } while (--width > len);
5287 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005288 if (fill == ' ') {
5289 if (sign)
5290 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005291 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005292 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005293 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005294 *res++ = *pbuf++;
5295 *res++ = *pbuf++;
5296 }
5297 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005298 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 res += len;
5300 rescnt -= len;
5301 while (--width >= len) {
5302 --rescnt;
5303 *res++ = ' ';
5304 }
5305 if (dict && (argidx < arglen) && c != '%') {
5306 PyErr_SetString(PyExc_TypeError,
5307 "not all arguments converted");
5308 goto onError;
5309 }
5310 Py_XDECREF(temp);
5311 } /* '%' */
5312 } /* until end */
5313 if (argidx < arglen && !dict) {
5314 PyErr_SetString(PyExc_TypeError,
5315 "not all arguments converted");
5316 goto onError;
5317 }
5318
5319 if (args_owned) {
5320 Py_DECREF(args);
5321 }
5322 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005323 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005324 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 return (PyObject *)result;
5326
5327 onError:
5328 Py_XDECREF(result);
5329 Py_DECREF(uformat);
5330 if (args_owned) {
5331 Py_DECREF(args);
5332 }
5333 return NULL;
5334}
5335
5336static PyBufferProcs unicode_as_buffer = {
5337 (getreadbufferproc) unicode_buffer_getreadbuf,
5338 (getwritebufferproc) unicode_buffer_getwritebuf,
5339 (getsegcountproc) unicode_buffer_getsegcount,
5340 (getcharbufferproc) unicode_buffer_getcharbuf,
5341};
5342
Tim Peters6d6c1a32001-08-02 04:15:00 +00005343static PyObject *
5344unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5345{
5346 PyObject *x = NULL;
5347 static char *kwlist[] = {"string", "encoding", "errors", 0};
5348 char *encoding = NULL;
5349 char *errors = NULL;
5350
5351 assert(type == &PyUnicode_Type);
5352 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5353 kwlist, &x, &encoding, &errors))
5354 return NULL;
5355 if (x == NULL)
5356 return (PyObject *)_PyUnicode_New(0);
5357 return PyUnicode_FromEncodedObject(x, encoding, errors);
5358}
5359
/* Docstring used for the tp_doc slot of the unicode type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5366
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367PyTypeObject PyUnicode_Type = {
5368 PyObject_HEAD_INIT(&PyType_Type)
5369 0, /* ob_size */
5370 "unicode", /* tp_name */
5371 sizeof(PyUnicodeObject), /* tp_size */
5372 0, /* tp_itemsize */
5373 /* Slots */
5374 (destructor)_PyUnicode_Free, /* tp_dealloc */
5375 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005376 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 0, /* tp_setattr */
5378 (cmpfunc) unicode_compare, /* tp_compare */
5379 (reprfunc) unicode_repr, /* tp_repr */
5380 0, /* tp_as_number */
5381 &unicode_as_sequence, /* tp_as_sequence */
5382 0, /* tp_as_mapping */
5383 (hashfunc) unicode_hash, /* tp_hash*/
5384 0, /* tp_call*/
5385 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005386 PyObject_GenericGetAttr, /* tp_getattro */
5387 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 &unicode_as_buffer, /* tp_as_buffer */
5389 Py_TPFLAGS_DEFAULT, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005390 unicode_doc, /* tp_doc */
5391 0, /* tp_traverse */
5392 0, /* tp_clear */
5393 0, /* tp_richcompare */
5394 0, /* tp_weaklistoffset */
5395 0, /* tp_iter */
5396 0, /* tp_iternext */
5397 unicode_methods, /* tp_methods */
5398 0, /* tp_members */
5399 0, /* tp_getset */
5400 0, /* tp_base */
5401 0, /* tp_dict */
5402 0, /* tp_descr_get */
5403 0, /* tp_descr_set */
5404 0, /* tp_dictoffset */
5405 0, /* tp_init */
5406 0, /* tp_alloc */
5407 unicode_new, /* tp_new */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408};
5409
5410/* Initialize the Unicode implementation */
5411
Thomas Wouters78890102000-07-22 19:25:51 +00005412void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005414 int i;
5415
Fred Drakee4315f52000-05-09 19:53:39 +00005416 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005417 unicode_freelist = NULL;
5418 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005420 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005421 for (i = 0; i < 256; i++)
5422 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423}
5424
5425/* Finalize the Unicode implementation */
5426
5427void
Thomas Wouters78890102000-07-22 19:25:51 +00005428_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005430 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005431 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005433 Py_XDECREF(unicode_empty);
5434 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005435
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005436 for (i = 0; i < 256; i++) {
5437 if (unicode_latin1[i]) {
5438 Py_DECREF(unicode_latin1[i]);
5439 unicode_latin1[i] = NULL;
5440 }
5441 }
5442
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005443 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 PyUnicodeObject *v = u;
5445 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005446 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005447 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005448 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005449 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005451 unicode_freelist = NULL;
5452 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453}