blob: 7b12594f72f4b89df5d7a9b7416b5a0247ec8bf6 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   Objects that go onto the free list keep their character buffer as
   long as their length is below this limit; longer buffers are
   released right away.  This saves malloc() calls for small Unicode
   objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
86/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
89/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000090static PyUnicodeObject *unicode_freelist;
91static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Fred Drakee4315f52000-05-09 19:53:39 +000093/* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
95
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
98
99*/
100
101static char unicode_default_encoding[100];
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* --- Unicode Object ----------------------------------------------------- */
104
105static
106int _PyUnicode_Resize(register PyUnicodeObject *unicode,
107 int length)
108{
109 void *oldstr;
110
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000111 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000112 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000113 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode == unicode_empty) {
117 PyErr_SetString(PyExc_SystemError,
118 "can't resize empty unicode object");
119 return -1;
120 }
121
122 /* We allocate one more byte to make sure the string is
123 Ux0000 terminated -- XXX is this needed ? */
124 oldstr = unicode->str;
125 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
126 if (!unicode->str) {
127 unicode->str = oldstr;
128 PyErr_NoMemory();
129 return -1;
130 }
131 unicode->str[length] = 0;
132 unicode->length = length;
133
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000136 if (unicode->defenc) {
137 Py_DECREF(unicode->defenc);
138 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 }
140 unicode->hash = -1;
141
142 return 0;
143}
144
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145int PyUnicode_Resize(PyObject **unicode,
146 int length)
147{
148 PyUnicodeObject *v;
149
150 if (unicode == NULL) {
151 PyErr_BadInternalCall();
152 return -1;
153 }
154 v = (PyUnicodeObject *)*unicode;
155 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
156 PyErr_BadInternalCall();
157 return -1;
158 }
159 return _PyUnicode_Resize(v, length);
160}
161
Guido van Rossumd57fd912000-03-10 22:53:23 +0000162/* We allocate one more byte to make sure the string is
163 Ux0000 terminated -- XXX is this needed ?
164
165 XXX This allocator could further be enhanced by assuring that the
166 free list never reduces its size below 1.
167
168*/
169
170static
171PyUnicodeObject *_PyUnicode_New(int length)
172{
173 register PyUnicodeObject *unicode;
174
175 /* Optimization for empty strings */
176 if (length == 0 && unicode_empty != NULL) {
177 Py_INCREF(unicode_empty);
178 return unicode_empty;
179 }
180
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist) {
183 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000184 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000191 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 }
194 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000195 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000197 }
198 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 }
200 else {
201 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
202 if (unicode == NULL)
203 return NULL;
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
205 }
206
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000207 if (!unicode->str) {
208 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000209 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode->str[length] = 0;
212 unicode->length = length;
213 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000216
217 onError:
218 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000219 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221}
222
223static
224void _PyUnicode_Free(register PyUnicodeObject *unicode)
225{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 /* Keep-Alive optimization */
228 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->str = NULL;
231 unicode->length = 0;
232 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000233 if (unicode->defenc) {
234 Py_DECREF(unicode->defenc);
235 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000236 }
237 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 *(PyUnicodeObject **)unicode = unicode_freelist;
239 unicode_freelist = unicode;
240 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 }
242 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000243 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000244 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 }
247}
248
249PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
250 int size)
251{
252 PyUnicodeObject *unicode;
253
254 unicode = _PyUnicode_New(size);
255 if (!unicode)
256 return NULL;
257
258 /* Copy the Unicode data into the new object */
259 if (u != NULL)
260 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
261
262 return (PyObject *)unicode;
263}
264
265#ifdef HAVE_WCHAR_H
266
267PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
268 int size)
269{
270 PyUnicodeObject *unicode;
271
272 if (w == NULL) {
273 PyErr_BadInternalCall();
274 return NULL;
275 }
276
277 unicode = _PyUnicode_New(size);
278 if (!unicode)
279 return NULL;
280
281 /* Copy the wchar_t data into the new object */
282#ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode->str, w, size * sizeof(wchar_t));
284#else
285 {
286 register Py_UNICODE *u;
287 register int i;
288 u = PyUnicode_AS_UNICODE(unicode);
289 for (i = size; i >= 0; i--)
290 *u++ = *w++;
291 }
292#endif
293
294 return (PyObject *)unicode;
295}
296
297int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
298 register wchar_t *w,
299 int size)
300{
301 if (unicode == NULL) {
302 PyErr_BadInternalCall();
303 return -1;
304 }
305 if (size > PyUnicode_GET_SIZE(unicode))
306 size = PyUnicode_GET_SIZE(unicode);
307#ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w, unicode->str, size * sizeof(wchar_t));
309#else
310 {
311 register Py_UNICODE *u;
312 register int i;
313 u = PyUnicode_AS_UNICODE(unicode);
314 for (i = size; i >= 0; i--)
315 *w++ = *u++;
316 }
317#endif
318
319 return size;
320}
321
322#endif
323
324PyObject *PyUnicode_FromObject(register PyObject *obj)
325{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000326 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
327}
328
329PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
330 const char *encoding,
331 const char *errors)
332{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 const char *s;
334 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000335 int owned = 0;
336 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337
338 if (obj == NULL) {
339 PyErr_BadInternalCall();
340 return NULL;
341 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000342
343 /* Coerce object */
344 if (PyInstance_Check(obj)) {
345 PyObject *func;
346 func = PyObject_GetAttrString(obj, "__str__");
347 if (func == NULL) {
348 PyErr_SetString(PyExc_TypeError,
349 "coercing to Unicode: instance doesn't define __str__");
350 return NULL;
351 }
352 obj = PyEval_CallObject(func, NULL);
353 Py_DECREF(func);
354 if (obj == NULL)
355 return NULL;
356 owned = 1;
357 }
358 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000360 v = obj;
361 if (encoding) {
362 PyErr_SetString(PyExc_TypeError,
363 "decoding Unicode is not supported");
364 return NULL;
365 }
366 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000376 PyErr_Format(PyExc_TypeError,
377 "coercing to Unicode: need string or buffer, "
378 "%.80s found",
379 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000380 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382
383 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 if (len == 0) {
385 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000388 else
389 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000390
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000392 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000394 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000395 return v;
396
397 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000398 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000399 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000400 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402}
403
404PyObject *PyUnicode_Decode(const char *s,
405 int size,
406 const char *encoding,
407 const char *errors)
408{
409 PyObject *buffer = NULL, *unicode;
410
Fred Drakee4315f52000-05-09 19:53:39 +0000411 if (encoding == NULL)
412 encoding = PyUnicode_GetDefaultEncoding();
413
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000417 else if (strcmp(encoding, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s, size, errors);
419 else if (strcmp(encoding, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 /* Decode via the codec registry */
423 buffer = PyBuffer_FromMemory((void *)s, size);
424 if (buffer == NULL)
425 goto onError;
426 unicode = PyCodec_Decode(buffer, encoding, errors);
427 if (unicode == NULL)
428 goto onError;
429 if (!PyUnicode_Check(unicode)) {
430 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000431 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 unicode->ob_type->tp_name);
433 Py_DECREF(unicode);
434 goto onError;
435 }
436 Py_DECREF(buffer);
437 return unicode;
438
439 onError:
440 Py_XDECREF(buffer);
441 return NULL;
442}
443
444PyObject *PyUnicode_Encode(const Py_UNICODE *s,
445 int size,
446 const char *encoding,
447 const char *errors)
448{
449 PyObject *v, *unicode;
450
451 unicode = PyUnicode_FromUnicode(s, size);
452 if (unicode == NULL)
453 return NULL;
454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
455 Py_DECREF(unicode);
456 return v;
457}
458
459PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
460 const char *encoding,
461 const char *errors)
462{
463 PyObject *v;
464
465 if (!PyUnicode_Check(unicode)) {
466 PyErr_BadArgument();
467 goto onError;
468 }
Fred Drakee4315f52000-05-09 19:53:39 +0000469
470 if (encoding == NULL)
471 encoding = PyUnicode_GetDefaultEncoding();
472
473 /* Shortcuts for common default encodings */
474 if (errors == NULL) {
475 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000477 else if (strcmp(encoding, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode);
479 else if (strcmp(encoding, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode);
481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482
483 /* Encode via the codec registry */
484 v = PyCodec_Encode(unicode, encoding, errors);
485 if (v == NULL)
486 goto onError;
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v)) {
489 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000490 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 v->ob_type->tp_name);
492 Py_DECREF(v);
493 goto onError;
494 }
495 return v;
496
497 onError:
498 return NULL;
499}
500
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000501/* Return a Python string holding the default encoded value of the
502 Unicode object.
503
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
508
509 The refcount of the string is *not* incremented.
510
511 *** Exported for internal use by the interpreter only !!! ***
512
513*/
514
515PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
516 const char *errors)
517{
518 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
519
520 if (v)
521 return v;
522 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
523 if (v && errors == NULL)
524 ((PyUnicodeObject *)unicode)->defenc = v;
525 return v;
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
529{
530 if (!PyUnicode_Check(unicode)) {
531 PyErr_BadArgument();
532 goto onError;
533 }
534 return PyUnicode_AS_UNICODE(unicode);
535
536 onError:
537 return NULL;
538}
539
540int PyUnicode_GetSize(PyObject *unicode)
541{
542 if (!PyUnicode_Check(unicode)) {
543 PyErr_BadArgument();
544 goto onError;
545 }
546 return PyUnicode_GET_SIZE(unicode);
547
548 onError:
549 return -1;
550}
551
Thomas Wouters78890102000-07-22 19:25:51 +0000552const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000553{
554 return unicode_default_encoding;
555}
556
557int PyUnicode_SetDefaultEncoding(const char *encoding)
558{
559 PyObject *v;
560
561 /* Make sure the encoding is valid. As side effect, this also
562 loads the encoding into the codec registry cache. */
563 v = _PyCodec_Lookup(encoding);
564 if (v == NULL)
565 goto onError;
566 Py_DECREF(v);
567 strncpy(unicode_default_encoding,
568 encoding,
569 sizeof(unicode_default_encoding));
570 return 0;
571
572 onError:
573 return -1;
574}
575
Guido van Rossumd57fd912000-03-10 22:53:23 +0000576/* --- UTF-8 Codec -------------------------------------------------------- */
577
/* Lookup table indexed by the first byte of a UTF-8 sequence; the
   entry is the total length of the sequence in bytes.  0 marks a
   byte that is not a legal prefix.  See RFC 2279 for details. */

static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x70 */
    /* 0x80 - 0xBF: not legal as first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xB0 */
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xD0 */
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     /* 0xE0 */
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are not legal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0      /* 0xF0 */
};
599
600static
601int utf8_decoding_error(const char **source,
602 Py_UNICODE **dest,
603 const char *errors,
604 const char *details)
605{
606 if ((errors == NULL) ||
607 (strcmp(errors,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000609 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 details);
611 return -1;
612 }
613 else if (strcmp(errors,"ignore") == 0) {
614 (*source)++;
615 return 0;
616 }
617 else if (strcmp(errors,"replace") == 0) {
618 (*source)++;
619 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
620 (*dest)++;
621 return 0;
622 }
623 else {
624 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 errors);
627 return -1;
628 }
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_DecodeUTF8(const char *s,
632 int size,
633 const char *errors)
634{
635 int n;
636 const char *e;
637 PyUnicodeObject *unicode;
638 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000639 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Note: size will always be longer than the resulting Unicode
642 character count */
643 unicode = _PyUnicode_New(size);
644 if (!unicode)
645 return NULL;
646 if (size == 0)
647 return (PyObject *)unicode;
648
649 /* Unpack UTF-8 encoded data */
650 p = unicode->str;
651 e = s + size;
652
653 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000654 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655
656 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000657 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 s++;
659 continue;
660 }
661
662 n = utf8_code_length[ch];
663
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 if (s + n > e) {
665 errmsg = "unexpected end of data";
666 goto utf8Error;
667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668
669 switch (n) {
670
671 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000672 errmsg = "unexpected code byte";
673 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000674 break;
675
676 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000677 errmsg = "internal error";
678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000679 break;
680
681 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000682 if ((s[1] & 0xc0) != 0x80) {
683 errmsg = "invalid data";
684 goto utf8Error;
685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000687 if (ch < 0x80) {
688 errmsg = "illegal encoding";
689 goto utf8Error;
690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 break;
694
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 (s[2] & 0xc0) != 0x80) {
698 errmsg = "invalid data";
699 goto utf8Error;
700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
703 errmsg = "illegal encoding";
704 goto utf8Error;
705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000707 *p++ = (Py_UNICODE)ch;
708 break;
709
710 case 4:
711 if ((s[1] & 0xc0) != 0x80 ||
712 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 (s[3] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000717 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
718 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
719 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if ((ch < 0x10000) || /* minimum value allowed for 4
721 byte encoding */
722 (ch > 0x10ffff)) { /* maximum value allowed for
723 UTF-16 */
724 errmsg = "illegal encoding";
725 goto utf8Error;
726 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000727 /* compute and append the two surrogates: */
728
729 /* translate from 10000..10FFFF to 0..FFFF */
730 ch -= 0x10000;
731
732 /* high surrogate = top 10 bits added to D800 */
733 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
734
735 /* low surrogate = bottom 10 bits added to DC00 */
736 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 break;
738
739 default:
740 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 errmsg = "unsupported Unicode code range";
742 goto utf8Error;
743 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 continue;
747
748 utf8Error:
749 if (utf8_decoding_error(&s, &p, errors, errmsg))
750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000751 }
752
753 /* Adjust length */
754 if (_PyUnicode_Resize(unicode, p - unicode->str))
755 goto onError;
756
757 return (PyObject *)unicode;
758
759onError:
760 Py_DECREF(unicode);
761 return NULL;
762}
763
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  The code is kept disabled.

   Error handler for the UTF-8 encoder: NULL / "strict" raise
   UnicodeError, "ignore" does nothing, "replace" stores '?' at *dest
   and advances it, any other value raises ValueError. */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        char *out = *dest;

        *out = '?';
        *dest = out + 1;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797
798PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
799 int size,
800 const char *errors)
801{
802 PyObject *v;
803 char *p;
804 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000805 Py_UCS4 ch2;
806 unsigned int cbAllocated = 3 * size;
807 unsigned int cbWritten = 0;
808 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000810 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (v == NULL)
812 return NULL;
813 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000814 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815
816 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000817 while (i < size) {
818 Py_UCS4 ch = s[i++];
819 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000821 cbWritten++;
822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 else if (ch < 0x0800) {
824 *p++ = 0xc0 | (ch >> 6);
825 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000826 cbWritten += 2;
827 }
828 else {
829 /* Check for high surrogate */
830 if (0xD800 <= ch && ch <= 0xDBFF) {
831 if (i != size) {
832 ch2 = s[i];
833 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
834
835 if (cbWritten >= (cbAllocated - 4)) {
836 /* Provide enough room for some more
837 surrogates */
838 cbAllocated += 4*10;
839 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000840 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 }
842
843 /* combine the two values */
844 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
845
846 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000847 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 i++;
849 cbWritten += 4;
850 }
851 }
852 }
853 else {
854 *p++ = (char)(0xe0 | (ch >> 12));
855 cbWritten += 3;
856 }
857 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
858 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859 }
860 }
861 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000862 if (_PyString_Resize(&v, p - q))
863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 return v;
865
866 onError:
867 Py_DECREF(v);
868 return NULL;
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
872{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 if (!PyUnicode_Check(unicode)) {
874 PyErr_BadArgument();
875 return NULL;
876 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000877 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
878 PyUnicode_GET_SIZE(unicode),
879 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880}
881
882/* --- UTF-16 Codec ------------------------------------------------------- */
883
884static
885int utf16_decoding_error(const Py_UNICODE **source,
886 Py_UNICODE **dest,
887 const char *errors,
888 const char *details)
889{
890 if ((errors == NULL) ||
891 (strcmp(errors,"strict") == 0)) {
892 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000893 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894 details);
895 return -1;
896 }
897 else if (strcmp(errors,"ignore") == 0) {
898 return 0;
899 }
900 else if (strcmp(errors,"replace") == 0) {
901 if (dest) {
902 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
903 (*dest)++;
904 }
905 return 0;
906 }
907 else {
908 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000909 "UTF-16 decoding error; "
910 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 errors);
912 return -1;
913 }
914}
915
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916PyObject *PyUnicode_DecodeUTF16(const char *s,
917 int size,
918 const char *errors,
919 int *byteorder)
920{
921 PyUnicodeObject *unicode;
922 Py_UNICODE *p;
923 const Py_UNICODE *q, *e;
924 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000925 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926
927 /* size should be an even number */
928 if (size % sizeof(Py_UNICODE) != 0) {
929 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
930 return NULL;
931 /* The remaining input chars are ignored if we fall through
932 here... */
933 }
934
935 /* Note: size will always be longer than the resulting Unicode
936 character count */
937 unicode = _PyUnicode_New(size);
938 if (!unicode)
939 return NULL;
940 if (size == 0)
941 return (PyObject *)unicode;
942
943 /* Unpack UTF-16 encoded data */
944 p = unicode->str;
945 q = (Py_UNICODE *)s;
946 e = q + (size / sizeof(Py_UNICODE));
947
948 if (byteorder)
949 bo = *byteorder;
950
951 while (q < e) {
952 register Py_UNICODE ch = *q++;
953
954 /* Check for BOM marks (U+FEFF) in the input and adjust
955 current byte order setting accordingly. Swap input
956 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
957 !) */
958#ifdef BYTEORDER_IS_LITTLE_ENDIAN
959 if (ch == 0xFEFF) {
960 bo = -1;
961 continue;
962 } else if (ch == 0xFFFE) {
963 bo = 1;
964 continue;
965 }
966 if (bo == 1)
967 ch = (ch >> 8) | (ch << 8);
968#else
969 if (ch == 0xFEFF) {
970 bo = 1;
971 continue;
972 } else if (ch == 0xFFFE) {
973 bo = -1;
974 continue;
975 }
976 if (bo == -1)
977 ch = (ch >> 8) | (ch << 8);
978#endif
979 if (ch < 0xD800 || ch > 0xDFFF) {
980 *p++ = ch;
981 continue;
982 }
983
984 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000985 if (q >= e) {
986 errmsg = "unexpected end of data";
987 goto utf16Error;
988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 if (0xDC00 <= *q && *q <= 0xDFFF) {
990 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000991 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992 /* This is valid data (a UTF-16 surrogate pair), but
993 we are not able to store this information since our
994 Py_UNICODE type only has 16 bits... this might
995 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000996 errmsg = "code pairs are not supported";
997 goto utf16Error;
998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999 else
1000 continue;
1001 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001002 errmsg = "illegal encoding";
1003 /* Fall through to report the error */
1004
1005 utf16Error:
1006 if (utf16_decoding_error(&q, &p, errors, errmsg))
1007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009
1010 if (byteorder)
1011 *byteorder = bo;
1012
1013 /* Adjust length */
1014 if (_PyUnicode_Resize(unicode, p - unicode->str))
1015 goto onError;
1016
1017 return (PyObject *)unicode;
1018
1019onError:
1020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024#undef UTF16_ERROR
1025
1026PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1027 int size,
1028 const char *errors,
1029 int byteorder)
1030{
1031 PyObject *v;
1032 Py_UNICODE *p;
1033 char *q;
1034
1035 /* We don't create UTF-16 pairs... */
1036 v = PyString_FromStringAndSize(NULL,
1037 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1038 if (v == NULL)
1039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040
1041 q = PyString_AS_STRING(v);
1042 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (byteorder == 0)
1044 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001045 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001046 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (byteorder == 0 ||
1048#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1049 byteorder == -1
1050#else
1051 byteorder == 1
1052#endif
1053 )
1054 memcpy(p, s, size * sizeof(Py_UNICODE));
1055 else
1056 while (size-- > 0) {
1057 Py_UNICODE ch = *s++;
1058 *p++ = (ch >> 8) | (ch << 8);
1059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return v;
1061}
1062
1063PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1064{
1065 if (!PyUnicode_Check(unicode)) {
1066 PyErr_BadArgument();
1067 return NULL;
1068 }
1069 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1070 PyUnicode_GET_SIZE(unicode),
1071 NULL,
1072 0);
1073}
1074
1075/* --- Unicode Escape Codec ----------------------------------------------- */
1076
1077static
1078int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001079 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 const char *errors,
1081 const char *details)
1082{
1083 if ((errors == NULL) ||
1084 (strcmp(errors,"strict") == 0)) {
1085 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001086 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 details);
1088 return -1;
1089 }
1090 else if (strcmp(errors,"ignore") == 0) {
1091 return 0;
1092 }
1093 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001094 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 return 0;
1096 }
1097 else {
1098 PyErr_Format(PyExc_ValueError,
1099 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001100 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 errors);
1102 return -1;
1103 }
1104}
1105
Fredrik Lundh06d12682001-01-24 07:59:11 +00001106static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001107
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1109 int size,
1110 const char *errors)
1111{
1112 PyUnicodeObject *v;
1113 Py_UNICODE *p = NULL, *buf = NULL;
1114 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001115 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116
1117 /* Escaped strings will always be longer than the resulting
1118 Unicode string, so we start with size here and then reduce the
1119 length after conversion to the true value. */
1120 v = _PyUnicode_New(size);
1121 if (v == NULL)
1122 goto onError;
1123 if (size == 0)
1124 return (PyObject *)v;
1125 p = buf = PyUnicode_AS_UNICODE(v);
1126 end = s + size;
1127 while (s < end) {
1128 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001129 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130 int i;
1131
1132 /* Non-escape characters are interpreted as Unicode ordinals */
1133 if (*s != '\\') {
1134 *p++ = (unsigned char)*s++;
1135 continue;
1136 }
1137
1138 /* \ - Escapes */
1139 s++;
1140 switch (*s++) {
1141
1142 /* \x escapes */
1143 case '\n': break;
1144 case '\\': *p++ = '\\'; break;
1145 case '\'': *p++ = '\''; break;
1146 case '\"': *p++ = '\"'; break;
1147 case 'b': *p++ = '\b'; break;
1148 case 'f': *p++ = '\014'; break; /* FF */
1149 case 't': *p++ = '\t'; break;
1150 case 'n': *p++ = '\n'; break;
1151 case 'r': *p++ = '\r'; break;
1152 case 'v': *p++ = '\013'; break; /* VT */
1153 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1154
1155 /* \OOO (octal) escapes */
1156 case '0': case '1': case '2': case '3':
1157 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001158 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001160 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001162 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001164 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 break;
1166
Fredrik Lundhdf846752000-09-03 11:29:49 +00001167 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001169 for (x = 0, i = 0; i < 2; i++) {
1170 c = (unsigned char)s[i];
1171 if (!isxdigit(c)) {
1172 if (unicodeescape_decoding_error(&s, &x, errors,
1173 "truncated \\xXX"))
1174 goto onError;
1175 i++;
1176 break;
1177 }
1178 x = (x<<4) & ~0xF;
1179 if (c >= '0' && c <= '9')
1180 x += c - '0';
1181 else if (c >= 'a' && c <= 'f')
1182 x += 10 + c - 'a';
1183 else
1184 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001186 s += i;
1187 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 break;
1189
1190 /* \uXXXX with 4 hex digits */
1191 case 'u':
1192 for (x = 0, i = 0; i < 4; i++) {
1193 c = (unsigned char)s[i];
1194 if (!isxdigit(c)) {
1195 if (unicodeescape_decoding_error(&s, &x, errors,
1196 "truncated \\uXXXX"))
1197 goto onError;
1198 i++;
1199 break;
1200 }
1201 x = (x<<4) & ~0xF;
1202 if (c >= '0' && c <= '9')
1203 x += c - '0';
1204 else if (c >= 'a' && c <= 'f')
1205 x += 10 + c - 'a';
1206 else
1207 x += 10 + c - 'A';
1208 }
1209 s += i;
1210 *p++ = x;
1211 break;
1212
Fredrik Lundhdf846752000-09-03 11:29:49 +00001213 /* \UXXXXXXXX with 8 hex digits */
1214 case 'U':
1215 for (chr = 0, i = 0; i < 8; i++) {
1216 c = (unsigned char)s[i];
1217 if (!isxdigit(c)) {
1218 if (unicodeescape_decoding_error(&s, &x, errors,
1219 "truncated \\uXXXX"))
1220 goto onError;
1221 i++;
1222 break;
1223 }
1224 chr = (chr<<4) & ~0xF;
1225 if (c >= '0' && c <= '9')
1226 chr += c - '0';
1227 else if (c >= 'a' && c <= 'f')
1228 chr += 10 + c - 'a';
1229 else
1230 chr += 10 + c - 'A';
1231 }
1232 s += i;
1233 goto store;
1234
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001235 case 'N':
1236 /* Ok, we need to deal with Unicode Character Names now,
1237 * make sure we've imported the hash table data...
1238 */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001239 if (ucnhash_CAPI == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001240 PyObject *mod = 0, *v = 0;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001241 mod = PyImport_ImportModule("unicodedata");
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001242 if (mod == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001243 goto ucnhashError;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001244 v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001245 Py_DECREF(mod);
1246 if (v == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001247 goto ucnhashError;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001248 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001249 Py_DECREF(v);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001250 if (ucnhash_CAPI == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001251 goto ucnhashError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001252 }
1253
Fredrik Lundhdf846752000-09-03 11:29:49 +00001254 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001255 const char *start = s + 1;
1256 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001257
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001258 /* look for the closing brace */
1259 while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001260 endBrace++;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001261 if (endBrace != end && *endBrace == '}') {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001262 if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001263 if (unicodeescape_decoding_error(
1264 &s, &x, errors,
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001265 "Invalid Unicode Character Name")
1266 )
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001267 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001268 goto ucnFallthrough;
1269 }
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001270 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001271 goto store;
1272 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001273 if (unicodeescape_decoding_error(
1274 &s, &x, errors,
1275 "Unicode name missing closing brace"))
1276 goto onError;
1277 goto ucnFallthrough;
1278 }
1279 break;
1280 }
1281 if (unicodeescape_decoding_error(
1282 &s, &x, errors,
1283 "Missing opening brace for Unicode Character Name escape"))
1284 goto onError;
1285ucnFallthrough:
1286 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001287 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 *p++ = '\\';
1289 *p++ = (unsigned char)s[-1];
1290 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001291store:
1292 /* when we get here, chr is a 32-bit unicode character */
1293 if (chr <= 0xffff)
1294 /* UCS-2 character */
1295 *p++ = (Py_UNICODE) chr;
1296 else if (chr <= 0x10ffff) {
1297 /* UCS-4 character. store as two surrogate characters */
1298 chr -= 0x10000L;
1299 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1300 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1301 } else {
1302 if (unicodeescape_decoding_error(
1303 &s, &x, errors,
1304 "Illegal Unicode character")
1305 )
1306 goto onError;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 }
1309 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001310 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 return (PyObject *)v;
1313
Fredrik Lundhf6056062001-01-20 11:15:25 +00001314 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001315 PyErr_SetString(
1316 PyExc_UnicodeError,
1317 "\\N escapes not supported (can't load unicodedata module)"
1318 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001319 return NULL;
1320
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 onError:
1322 Py_XDECREF(v);
1323 return NULL;
1324}
1325
1326/* Return a Unicode-Escape string version of the Unicode object.
1327
1328 If quotes is true, the string is enclosed in u"" or u'' quotes as
1329 appropriate.
1330
1331*/
1332
Barry Warsaw51ac5802000-03-20 16:36:48 +00001333static const Py_UNICODE *findchar(const Py_UNICODE *s,
1334 int size,
1335 Py_UNICODE ch);
1336
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337static
1338PyObject *unicodeescape_string(const Py_UNICODE *s,
1339 int size,
1340 int quotes)
1341{
1342 PyObject *repr;
1343 char *p;
1344 char *q;
1345
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001346 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001347
1348 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1349 if (repr == NULL)
1350 return NULL;
1351
1352 p = q = PyString_AS_STRING(repr);
1353
1354 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 *p++ = 'u';
1356 *p++ = (findchar(s, size, '\'') &&
1357 !findchar(s, size, '"')) ? '"' : '\'';
1358 }
1359 while (size-- > 0) {
1360 Py_UNICODE ch = *s++;
1361 /* Escape quotes */
1362 if (quotes && (ch == q[1] || ch == '\\')) {
1363 *p++ = '\\';
1364 *p++ = (char) ch;
1365 }
1366 /* Map 16-bit characters to '\uxxxx' */
1367 else if (ch >= 256) {
1368 *p++ = '\\';
1369 *p++ = 'u';
1370 *p++ = hexdigit[(ch >> 12) & 0xf];
1371 *p++ = hexdigit[(ch >> 8) & 0xf];
1372 *p++ = hexdigit[(ch >> 4) & 0xf];
1373 *p++ = hexdigit[ch & 15];
1374 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001375 /* Map special whitespace to '\t', \n', '\r' */
1376 else if (ch == '\t') {
1377 *p++ = '\\';
1378 *p++ = 't';
1379 }
1380 else if (ch == '\n') {
1381 *p++ = '\\';
1382 *p++ = 'n';
1383 }
1384 else if (ch == '\r') {
1385 *p++ = '\\';
1386 *p++ = 'r';
1387 }
1388 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 else if (ch < ' ' || ch >= 128) {
1390 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001391 *p++ = 'x';
1392 *p++ = hexdigit[(ch >> 4) & 0xf];
1393 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 }
1395 /* Copy everything else as-is */
1396 else
1397 *p++ = (char) ch;
1398 }
1399 if (quotes)
1400 *p++ = q[1];
1401
1402 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001403 if (_PyString_Resize(&repr, p - q))
1404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405
1406 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001407
1408 onError:
1409 Py_DECREF(repr);
1410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411}
1412
1413PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1414 int size)
1415{
1416 return unicodeescape_string(s, size, 0);
1417}
1418
1419PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1420{
1421 if (!PyUnicode_Check(unicode)) {
1422 PyErr_BadArgument();
1423 return NULL;
1424 }
1425 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1426 PyUnicode_GET_SIZE(unicode));
1427}
1428
1429/* --- Raw Unicode Escape Codec ------------------------------------------- */
1430
1431PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1432 int size,
1433 const char *errors)
1434{
1435 PyUnicodeObject *v;
1436 Py_UNICODE *p, *buf;
1437 const char *end;
1438 const char *bs;
1439
1440 /* Escaped strings will always be longer than the resulting
1441 Unicode string, so we start with size here and then reduce the
1442 length after conversion to the true value. */
1443 v = _PyUnicode_New(size);
1444 if (v == NULL)
1445 goto onError;
1446 if (size == 0)
1447 return (PyObject *)v;
1448 p = buf = PyUnicode_AS_UNICODE(v);
1449 end = s + size;
1450 while (s < end) {
1451 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001452 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453 int i;
1454
1455 /* Non-escape characters are interpreted as Unicode ordinals */
1456 if (*s != '\\') {
1457 *p++ = (unsigned char)*s++;
1458 continue;
1459 }
1460
1461 /* \u-escapes are only interpreted iff the number of leading
1462 backslashes if odd */
1463 bs = s;
1464 for (;s < end;) {
1465 if (*s != '\\')
1466 break;
1467 *p++ = (unsigned char)*s++;
1468 }
1469 if (((s - bs) & 1) == 0 ||
1470 s >= end ||
1471 *s != 'u') {
1472 continue;
1473 }
1474 p--;
1475 s++;
1476
1477 /* \uXXXX with 4 hex digits */
1478 for (x = 0, i = 0; i < 4; i++) {
1479 c = (unsigned char)s[i];
1480 if (!isxdigit(c)) {
1481 if (unicodeescape_decoding_error(&s, &x, errors,
1482 "truncated \\uXXXX"))
1483 goto onError;
1484 i++;
1485 break;
1486 }
1487 x = (x<<4) & ~0xF;
1488 if (c >= '0' && c <= '9')
1489 x += c - '0';
1490 else if (c >= 'a' && c <= 'f')
1491 x += 10 + c - 'a';
1492 else
1493 x += 10 + c - 'A';
1494 }
1495 s += i;
1496 *p++ = x;
1497 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001498 if (_PyUnicode_Resize(v, (int)(p - buf)))
1499 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 return (PyObject *)v;
1501
1502 onError:
1503 Py_XDECREF(v);
1504 return NULL;
1505}
1506
1507PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1508 int size)
1509{
1510 PyObject *repr;
1511 char *p;
1512 char *q;
1513
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001514 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515
1516 repr = PyString_FromStringAndSize(NULL, 6 * size);
1517 if (repr == NULL)
1518 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001519 if (size == 0)
1520 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521
1522 p = q = PyString_AS_STRING(repr);
1523 while (size-- > 0) {
1524 Py_UNICODE ch = *s++;
1525 /* Map 16-bit characters to '\uxxxx' */
1526 if (ch >= 256) {
1527 *p++ = '\\';
1528 *p++ = 'u';
1529 *p++ = hexdigit[(ch >> 12) & 0xf];
1530 *p++ = hexdigit[(ch >> 8) & 0xf];
1531 *p++ = hexdigit[(ch >> 4) & 0xf];
1532 *p++ = hexdigit[ch & 15];
1533 }
1534 /* Copy everything else as-is */
1535 else
1536 *p++ = (char) ch;
1537 }
1538 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001539 if (_PyString_Resize(&repr, p - q))
1540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541
1542 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001543
1544 onError:
1545 Py_DECREF(repr);
1546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547}
1548
1549PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1550{
1551 if (!PyUnicode_Check(unicode)) {
1552 PyErr_BadArgument();
1553 return NULL;
1554 }
1555 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1556 PyUnicode_GET_SIZE(unicode));
1557}
1558
1559/* --- Latin-1 Codec ------------------------------------------------------ */
1560
1561PyObject *PyUnicode_DecodeLatin1(const char *s,
1562 int size,
1563 const char *errors)
1564{
1565 PyUnicodeObject *v;
1566 Py_UNICODE *p;
1567
1568 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1569 v = _PyUnicode_New(size);
1570 if (v == NULL)
1571 goto onError;
1572 if (size == 0)
1573 return (PyObject *)v;
1574 p = PyUnicode_AS_UNICODE(v);
1575 while (size-- > 0)
1576 *p++ = (unsigned char)*s++;
1577 return (PyObject *)v;
1578
1579 onError:
1580 Py_XDECREF(v);
1581 return NULL;
1582}
1583
1584static
1585int latin1_encoding_error(const Py_UNICODE **source,
1586 char **dest,
1587 const char *errors,
1588 const char *details)
1589{
1590 if ((errors == NULL) ||
1591 (strcmp(errors,"strict") == 0)) {
1592 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001593 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 details);
1595 return -1;
1596 }
1597 else if (strcmp(errors,"ignore") == 0) {
1598 return 0;
1599 }
1600 else if (strcmp(errors,"replace") == 0) {
1601 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001602 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 return 0;
1604 }
1605 else {
1606 PyErr_Format(PyExc_ValueError,
1607 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001608 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 errors);
1610 return -1;
1611 }
1612}
1613
1614PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1615 int size,
1616 const char *errors)
1617{
1618 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001619 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001620
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 repr = PyString_FromStringAndSize(NULL, size);
1622 if (repr == NULL)
1623 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001624 if (size == 0)
1625 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626
1627 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001628 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 while (size-- > 0) {
1630 Py_UNICODE ch = *p++;
1631 if (ch >= 256) {
1632 if (latin1_encoding_error(&p, &s, errors,
1633 "ordinal not in range(256)"))
1634 goto onError;
1635 }
1636 else
1637 *s++ = (char)ch;
1638 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001639 /* Resize if error handling skipped some characters */
1640 if (s - start < PyString_GET_SIZE(repr))
1641 if (_PyString_Resize(&repr, s - start))
1642 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 return repr;
1644
1645 onError:
1646 Py_DECREF(repr);
1647 return NULL;
1648}
1649
1650PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1651{
1652 if (!PyUnicode_Check(unicode)) {
1653 PyErr_BadArgument();
1654 return NULL;
1655 }
1656 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1657 PyUnicode_GET_SIZE(unicode),
1658 NULL);
1659}
1660
1661/* --- 7-bit ASCII Codec -------------------------------------------------- */
1662
1663static
1664int ascii_decoding_error(const char **source,
1665 Py_UNICODE **dest,
1666 const char *errors,
1667 const char *details)
1668{
1669 if ((errors == NULL) ||
1670 (strcmp(errors,"strict") == 0)) {
1671 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001672 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 details);
1674 return -1;
1675 }
1676 else if (strcmp(errors,"ignore") == 0) {
1677 return 0;
1678 }
1679 else if (strcmp(errors,"replace") == 0) {
1680 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1681 (*dest)++;
1682 return 0;
1683 }
1684 else {
1685 PyErr_Format(PyExc_ValueError,
1686 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001687 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 errors);
1689 return -1;
1690 }
1691}
1692
1693PyObject *PyUnicode_DecodeASCII(const char *s,
1694 int size,
1695 const char *errors)
1696{
1697 PyUnicodeObject *v;
1698 Py_UNICODE *p;
1699
1700 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1701 v = _PyUnicode_New(size);
1702 if (v == NULL)
1703 goto onError;
1704 if (size == 0)
1705 return (PyObject *)v;
1706 p = PyUnicode_AS_UNICODE(v);
1707 while (size-- > 0) {
1708 register unsigned char c;
1709
1710 c = (unsigned char)*s++;
1711 if (c < 128)
1712 *p++ = c;
1713 else if (ascii_decoding_error(&s, &p, errors,
1714 "ordinal not in range(128)"))
1715 goto onError;
1716 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001717 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1718 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 return (PyObject *)v;
1721
1722 onError:
1723 Py_XDECREF(v);
1724 return NULL;
1725}
1726
1727static
1728int ascii_encoding_error(const Py_UNICODE **source,
1729 char **dest,
1730 const char *errors,
1731 const char *details)
1732{
1733 if ((errors == NULL) ||
1734 (strcmp(errors,"strict") == 0)) {
1735 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001736 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 details);
1738 return -1;
1739 }
1740 else if (strcmp(errors,"ignore") == 0) {
1741 return 0;
1742 }
1743 else if (strcmp(errors,"replace") == 0) {
1744 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001745 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746 return 0;
1747 }
1748 else {
1749 PyErr_Format(PyExc_ValueError,
1750 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001751 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752 errors);
1753 return -1;
1754 }
1755}
1756
1757PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1758 int size,
1759 const char *errors)
1760{
1761 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001762 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001763
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 repr = PyString_FromStringAndSize(NULL, size);
1765 if (repr == NULL)
1766 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001767 if (size == 0)
1768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769
1770 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001771 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 while (size-- > 0) {
1773 Py_UNICODE ch = *p++;
1774 if (ch >= 128) {
1775 if (ascii_encoding_error(&p, &s, errors,
1776 "ordinal not in range(128)"))
1777 goto onError;
1778 }
1779 else
1780 *s++ = (char)ch;
1781 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001782 /* Resize if error handling skipped some characters */
1783 if (s - start < PyString_GET_SIZE(repr))
1784 if (_PyString_Resize(&repr, s - start))
1785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 return repr;
1787
1788 onError:
1789 Py_DECREF(repr);
1790 return NULL;
1791}
1792
1793PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1794{
1795 if (!PyUnicode_Check(unicode)) {
1796 PyErr_BadArgument();
1797 return NULL;
1798 }
1799 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1800 PyUnicode_GET_SIZE(unicode),
1801 NULL);
1802}
1803
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001804#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001805
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001806/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001807
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001808PyObject *PyUnicode_DecodeMBCS(const char *s,
1809 int size,
1810 const char *errors)
1811{
1812 PyUnicodeObject *v;
1813 Py_UNICODE *p;
1814
1815 /* First get the size of the result */
1816 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001817 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001818 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1819
1820 v = _PyUnicode_New(usize);
1821 if (v == NULL)
1822 return NULL;
1823 if (usize == 0)
1824 return (PyObject *)v;
1825 p = PyUnicode_AS_UNICODE(v);
1826 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1827 Py_DECREF(v);
1828 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1829 }
1830
1831 return (PyObject *)v;
1832}
1833
1834PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1835 int size,
1836 const char *errors)
1837{
1838 PyObject *repr;
1839 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001840 DWORD mbcssize;
1841
1842 /* If there are no characters, bail now! */
1843 if (size==0)
1844 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001845
1846 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001847 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001848 if (mbcssize==0)
1849 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1850
1851 repr = PyString_FromStringAndSize(NULL, mbcssize);
1852 if (repr == NULL)
1853 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001854 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001855 return repr;
1856
1857 /* Do the conversion */
1858 s = PyString_AS_STRING(repr);
1859 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1860 Py_DECREF(repr);
1861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1862 }
1863 return repr;
1864}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001865
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001866#endif /* MS_WIN32 */
1867
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868/* --- Character Mapping Codec -------------------------------------------- */
1869
1870static
1871int charmap_decoding_error(const char **source,
1872 Py_UNICODE **dest,
1873 const char *errors,
1874 const char *details)
1875{
1876 if ((errors == NULL) ||
1877 (strcmp(errors,"strict") == 0)) {
1878 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001879 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 details);
1881 return -1;
1882 }
1883 else if (strcmp(errors,"ignore") == 0) {
1884 return 0;
1885 }
1886 else if (strcmp(errors,"replace") == 0) {
1887 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1888 (*dest)++;
1889 return 0;
1890 }
1891 else {
1892 PyErr_Format(PyExc_ValueError,
1893 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001894 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 errors);
1896 return -1;
1897 }
1898}
1899
1900PyObject *PyUnicode_DecodeCharmap(const char *s,
1901 int size,
1902 PyObject *mapping,
1903 const char *errors)
1904{
1905 PyUnicodeObject *v;
1906 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001907 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908
1909 /* Default to Latin-1 */
1910 if (mapping == NULL)
1911 return PyUnicode_DecodeLatin1(s, size, errors);
1912
1913 v = _PyUnicode_New(size);
1914 if (v == NULL)
1915 goto onError;
1916 if (size == 0)
1917 return (PyObject *)v;
1918 p = PyUnicode_AS_UNICODE(v);
1919 while (size-- > 0) {
1920 unsigned char ch = *s++;
1921 PyObject *w, *x;
1922
1923 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1924 w = PyInt_FromLong((long)ch);
1925 if (w == NULL)
1926 goto onError;
1927 x = PyObject_GetItem(mapping, w);
1928 Py_DECREF(w);
1929 if (x == NULL) {
1930 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001931 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001933 x = Py_None;
1934 Py_INCREF(x);
1935 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001936 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 }
1938
1939 /* Apply mapping */
1940 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001941 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 if (value < 0 || value > 65535) {
1943 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001944 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 Py_DECREF(x);
1946 goto onError;
1947 }
1948 *p++ = (Py_UNICODE)value;
1949 }
1950 else if (x == Py_None) {
1951 /* undefined mapping */
1952 if (charmap_decoding_error(&s, &p, errors,
1953 "character maps to <undefined>")) {
1954 Py_DECREF(x);
1955 goto onError;
1956 }
1957 }
1958 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001959 int targetsize = PyUnicode_GET_SIZE(x);
1960
1961 if (targetsize == 1)
1962 /* 1-1 mapping */
1963 *p++ = *PyUnicode_AS_UNICODE(x);
1964
1965 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001967 if (targetsize > extrachars) {
1968 /* resize first */
1969 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1970 int needed = (targetsize - extrachars) + \
1971 (targetsize << 2);
1972 extrachars += needed;
1973 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001974 Py_DECREF(x);
1975 goto onError;
1976 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001977 p = PyUnicode_AS_UNICODE(v) + oldpos;
1978 }
1979 Py_UNICODE_COPY(p,
1980 PyUnicode_AS_UNICODE(x),
1981 targetsize);
1982 p += targetsize;
1983 extrachars -= targetsize;
1984 }
1985 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 }
1987 else {
1988 /* wrong return value */
1989 PyErr_SetString(PyExc_TypeError,
1990 "character mapping must return integer, None or unicode");
1991 Py_DECREF(x);
1992 goto onError;
1993 }
1994 Py_DECREF(x);
1995 }
1996 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1997 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1998 goto onError;
1999 return (PyObject *)v;
2000
2001 onError:
2002 Py_XDECREF(v);
2003 return NULL;
2004}
2005
2006static
2007int charmap_encoding_error(const Py_UNICODE **source,
2008 char **dest,
2009 const char *errors,
2010 const char *details)
2011{
2012 if ((errors == NULL) ||
2013 (strcmp(errors,"strict") == 0)) {
2014 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002015 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 details);
2017 return -1;
2018 }
2019 else if (strcmp(errors,"ignore") == 0) {
2020 return 0;
2021 }
2022 else if (strcmp(errors,"replace") == 0) {
2023 **dest = '?';
2024 (*dest)++;
2025 return 0;
2026 }
2027 else {
2028 PyErr_Format(PyExc_ValueError,
2029 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002030 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 errors);
2032 return -1;
2033 }
2034}
2035
2036PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2037 int size,
2038 PyObject *mapping,
2039 const char *errors)
2040{
2041 PyObject *v;
2042 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002043 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 /* Default to Latin-1 */
2046 if (mapping == NULL)
2047 return PyUnicode_EncodeLatin1(p, size, errors);
2048
2049 v = PyString_FromStringAndSize(NULL, size);
2050 if (v == NULL)
2051 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002052 if (size == 0)
2053 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 s = PyString_AS_STRING(v);
2055 while (size-- > 0) {
2056 Py_UNICODE ch = *p++;
2057 PyObject *w, *x;
2058
2059 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2060 w = PyInt_FromLong((long)ch);
2061 if (w == NULL)
2062 goto onError;
2063 x = PyObject_GetItem(mapping, w);
2064 Py_DECREF(w);
2065 if (x == NULL) {
2066 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002067 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002069 x = Py_None;
2070 Py_INCREF(x);
2071 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002072 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 }
2074
2075 /* Apply mapping */
2076 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002077 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (value < 0 || value > 255) {
2079 PyErr_SetString(PyExc_TypeError,
2080 "character mapping must be in range(256)");
2081 Py_DECREF(x);
2082 goto onError;
2083 }
2084 *s++ = (char)value;
2085 }
2086 else if (x == Py_None) {
2087 /* undefined mapping */
2088 if (charmap_encoding_error(&p, &s, errors,
2089 "character maps to <undefined>")) {
2090 Py_DECREF(x);
2091 goto onError;
2092 }
2093 }
2094 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002095 int targetsize = PyString_GET_SIZE(x);
2096
2097 if (targetsize == 1)
2098 /* 1-1 mapping */
2099 *s++ = *PyString_AS_STRING(x);
2100
2101 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002103 if (targetsize > extrachars) {
2104 /* resize first */
2105 int oldpos = (int)(s - PyString_AS_STRING(v));
2106 int needed = (targetsize - extrachars) + \
2107 (targetsize << 2);
2108 extrachars += needed;
2109 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002110 Py_DECREF(x);
2111 goto onError;
2112 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002113 s = PyString_AS_STRING(v) + oldpos;
2114 }
2115 memcpy(s,
2116 PyString_AS_STRING(x),
2117 targetsize);
2118 s += targetsize;
2119 extrachars -= targetsize;
2120 }
2121 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 }
2123 else {
2124 /* wrong return value */
2125 PyErr_SetString(PyExc_TypeError,
2126 "character mapping must return integer, None or unicode");
2127 Py_DECREF(x);
2128 goto onError;
2129 }
2130 Py_DECREF(x);
2131 }
2132 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2133 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2134 goto onError;
2135 return v;
2136
2137 onError:
2138 Py_DECREF(v);
2139 return NULL;
2140}
2141
2142PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2143 PyObject *mapping)
2144{
2145 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2146 PyErr_BadArgument();
2147 return NULL;
2148 }
2149 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2150 PyUnicode_GET_SIZE(unicode),
2151 mapping,
2152 NULL);
2153}
2154
2155static
2156int translate_error(const Py_UNICODE **source,
2157 Py_UNICODE **dest,
2158 const char *errors,
2159 const char *details)
2160{
2161 if ((errors == NULL) ||
2162 (strcmp(errors,"strict") == 0)) {
2163 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 details);
2166 return -1;
2167 }
2168 else if (strcmp(errors,"ignore") == 0) {
2169 return 0;
2170 }
2171 else if (strcmp(errors,"replace") == 0) {
2172 **dest = '?';
2173 (*dest)++;
2174 return 0;
2175 }
2176 else {
2177 PyErr_Format(PyExc_ValueError,
2178 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002179 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 errors);
2181 return -1;
2182 }
2183}
2184
2185PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2186 int size,
2187 PyObject *mapping,
2188 const char *errors)
2189{
2190 PyUnicodeObject *v;
2191 Py_UNICODE *p;
2192
2193 if (mapping == NULL) {
2194 PyErr_BadArgument();
2195 return NULL;
2196 }
2197
2198 /* Output will never be longer than input */
2199 v = _PyUnicode_New(size);
2200 if (v == NULL)
2201 goto onError;
2202 if (size == 0)
2203 goto done;
2204 p = PyUnicode_AS_UNICODE(v);
2205 while (size-- > 0) {
2206 Py_UNICODE ch = *s++;
2207 PyObject *w, *x;
2208
2209 /* Get mapping */
2210 w = PyInt_FromLong(ch);
2211 if (w == NULL)
2212 goto onError;
2213 x = PyObject_GetItem(mapping, w);
2214 Py_DECREF(w);
2215 if (x == NULL) {
2216 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2217 /* No mapping found: default to 1-1 mapping */
2218 PyErr_Clear();
2219 *p++ = ch;
2220 continue;
2221 }
2222 goto onError;
2223 }
2224
2225 /* Apply mapping */
2226 if (PyInt_Check(x))
2227 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2228 else if (x == Py_None) {
2229 /* undefined mapping */
2230 if (translate_error(&s, &p, errors,
2231 "character maps to <undefined>")) {
2232 Py_DECREF(x);
2233 goto onError;
2234 }
2235 }
2236 else if (PyUnicode_Check(x)) {
2237 if (PyUnicode_GET_SIZE(x) != 1) {
2238 /* 1-n mapping */
2239 PyErr_SetString(PyExc_NotImplementedError,
2240 "1-n mappings are currently not implemented");
2241 Py_DECREF(x);
2242 goto onError;
2243 }
2244 *p++ = *PyUnicode_AS_UNICODE(x);
2245 }
2246 else {
2247 /* wrong return value */
2248 PyErr_SetString(PyExc_TypeError,
2249 "translate mapping must return integer, None or unicode");
2250 Py_DECREF(x);
2251 goto onError;
2252 }
2253 Py_DECREF(x);
2254 }
2255 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002256 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2257 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258
2259 done:
2260 return (PyObject *)v;
2261
2262 onError:
2263 Py_XDECREF(v);
2264 return NULL;
2265}
2266
2267PyObject *PyUnicode_Translate(PyObject *str,
2268 PyObject *mapping,
2269 const char *errors)
2270{
2271 PyObject *result;
2272
2273 str = PyUnicode_FromObject(str);
2274 if (str == NULL)
2275 goto onError;
2276 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2277 PyUnicode_GET_SIZE(str),
2278 mapping,
2279 errors);
2280 Py_DECREF(str);
2281 return result;
2282
2283 onError:
2284 Py_XDECREF(str);
2285 return NULL;
2286}
2287
Guido van Rossum9e896b32000-04-05 20:11:21 +00002288/* --- Decimal Encoder ---------------------------------------------------- */
2289
2290int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2291 int length,
2292 char *output,
2293 const char *errors)
2294{
2295 Py_UNICODE *p, *end;
2296
2297 if (output == NULL) {
2298 PyErr_BadArgument();
2299 return -1;
2300 }
2301
2302 p = s;
2303 end = s + length;
2304 while (p < end) {
2305 register Py_UNICODE ch = *p++;
2306 int decimal;
2307
2308 if (Py_UNICODE_ISSPACE(ch)) {
2309 *output++ = ' ';
2310 continue;
2311 }
2312 decimal = Py_UNICODE_TODECIMAL(ch);
2313 if (decimal >= 0) {
2314 *output++ = '0' + decimal;
2315 continue;
2316 }
Guido van Rossumba477042000-04-06 18:18:10 +00002317 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002318 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002319 continue;
2320 }
2321 /* All other characters are considered invalid */
2322 if (errors == NULL || strcmp(errors, "strict") == 0) {
2323 PyErr_SetString(PyExc_ValueError,
2324 "invalid decimal Unicode string");
2325 goto onError;
2326 }
2327 else if (strcmp(errors, "ignore") == 0)
2328 continue;
2329 else if (strcmp(errors, "replace") == 0) {
2330 *output++ = '?';
2331 continue;
2332 }
2333 }
2334 /* 0-terminate the output string */
2335 *output++ = '\0';
2336 return 0;
2337
2338 onError:
2339 return -1;
2340}
2341
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342/* --- Helpers ------------------------------------------------------------ */
2343
2344static
2345int count(PyUnicodeObject *self,
2346 int start,
2347 int end,
2348 PyUnicodeObject *substring)
2349{
2350 int count = 0;
2351
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002352 if (start < 0)
2353 start += self->length;
2354 if (start < 0)
2355 start = 0;
2356 if (end > self->length)
2357 end = self->length;
2358 if (end < 0)
2359 end += self->length;
2360 if (end < 0)
2361 end = 0;
2362
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002363 if (substring->length == 0)
2364 return (end - start + 1);
2365
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 end -= substring->length;
2367
2368 while (start <= end)
2369 if (Py_UNICODE_MATCH(self, start, substring)) {
2370 count++;
2371 start += substring->length;
2372 } else
2373 start++;
2374
2375 return count;
2376}
2377
2378int PyUnicode_Count(PyObject *str,
2379 PyObject *substr,
2380 int start,
2381 int end)
2382{
2383 int result;
2384
2385 str = PyUnicode_FromObject(str);
2386 if (str == NULL)
2387 return -1;
2388 substr = PyUnicode_FromObject(substr);
2389 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002390 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002391 return -1;
2392 }
2393
2394 result = count((PyUnicodeObject *)str,
2395 start, end,
2396 (PyUnicodeObject *)substr);
2397
2398 Py_DECREF(str);
2399 Py_DECREF(substr);
2400 return result;
2401}
2402
2403static
2404int findstring(PyUnicodeObject *self,
2405 PyUnicodeObject *substring,
2406 int start,
2407 int end,
2408 int direction)
2409{
2410 if (start < 0)
2411 start += self->length;
2412 if (start < 0)
2413 start = 0;
2414
2415 if (substring->length == 0)
2416 return start;
2417
2418 if (end > self->length)
2419 end = self->length;
2420 if (end < 0)
2421 end += self->length;
2422 if (end < 0)
2423 end = 0;
2424
2425 end -= substring->length;
2426
2427 if (direction < 0) {
2428 for (; end >= start; end--)
2429 if (Py_UNICODE_MATCH(self, end, substring))
2430 return end;
2431 } else {
2432 for (; start <= end; start++)
2433 if (Py_UNICODE_MATCH(self, start, substring))
2434 return start;
2435 }
2436
2437 return -1;
2438}
2439
2440int PyUnicode_Find(PyObject *str,
2441 PyObject *substr,
2442 int start,
2443 int end,
2444 int direction)
2445{
2446 int result;
2447
2448 str = PyUnicode_FromObject(str);
2449 if (str == NULL)
2450 return -1;
2451 substr = PyUnicode_FromObject(substr);
2452 if (substr == NULL) {
2453 Py_DECREF(substr);
2454 return -1;
2455 }
2456
2457 result = findstring((PyUnicodeObject *)str,
2458 (PyUnicodeObject *)substr,
2459 start, end, direction);
2460 Py_DECREF(str);
2461 Py_DECREF(substr);
2462 return result;
2463}
2464
2465static
2466int tailmatch(PyUnicodeObject *self,
2467 PyUnicodeObject *substring,
2468 int start,
2469 int end,
2470 int direction)
2471{
2472 if (start < 0)
2473 start += self->length;
2474 if (start < 0)
2475 start = 0;
2476
2477 if (substring->length == 0)
2478 return 1;
2479
2480 if (end > self->length)
2481 end = self->length;
2482 if (end < 0)
2483 end += self->length;
2484 if (end < 0)
2485 end = 0;
2486
2487 end -= substring->length;
2488 if (end < start)
2489 return 0;
2490
2491 if (direction > 0) {
2492 if (Py_UNICODE_MATCH(self, end, substring))
2493 return 1;
2494 } else {
2495 if (Py_UNICODE_MATCH(self, start, substring))
2496 return 1;
2497 }
2498
2499 return 0;
2500}
2501
2502int PyUnicode_Tailmatch(PyObject *str,
2503 PyObject *substr,
2504 int start,
2505 int end,
2506 int direction)
2507{
2508 int result;
2509
2510 str = PyUnicode_FromObject(str);
2511 if (str == NULL)
2512 return -1;
2513 substr = PyUnicode_FromObject(substr);
2514 if (substr == NULL) {
2515 Py_DECREF(substr);
2516 return -1;
2517 }
2518
2519 result = tailmatch((PyUnicodeObject *)str,
2520 (PyUnicodeObject *)substr,
2521 start, end, direction);
2522 Py_DECREF(str);
2523 Py_DECREF(substr);
2524 return result;
2525}
2526
2527static
2528const Py_UNICODE *findchar(const Py_UNICODE *s,
2529 int size,
2530 Py_UNICODE ch)
2531{
2532 /* like wcschr, but doesn't stop at NULL characters */
2533
2534 while (size-- > 0) {
2535 if (*s == ch)
2536 return s;
2537 s++;
2538 }
2539
2540 return NULL;
2541}
2542
2543/* Apply fixfct filter to the Unicode object self and return a
2544 reference to the modified object */
2545
2546static
2547PyObject *fixup(PyUnicodeObject *self,
2548 int (*fixfct)(PyUnicodeObject *s))
2549{
2550
2551 PyUnicodeObject *u;
2552
2553 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2554 self->length);
2555 if (u == NULL)
2556 return NULL;
2557 if (!fixfct(u)) {
2558 /* fixfct should return TRUE if it modified the buffer. If
2559 FALSE, return a reference to the original buffer instead
2560 (to save space, not time) */
2561 Py_INCREF(self);
2562 Py_DECREF(u);
2563 return (PyObject*) self;
2564 }
2565 return (PyObject*) u;
2566}
2567
2568static
2569int fixupper(PyUnicodeObject *self)
2570{
2571 int len = self->length;
2572 Py_UNICODE *s = self->str;
2573 int status = 0;
2574
2575 while (len-- > 0) {
2576 register Py_UNICODE ch;
2577
2578 ch = Py_UNICODE_TOUPPER(*s);
2579 if (ch != *s) {
2580 status = 1;
2581 *s = ch;
2582 }
2583 s++;
2584 }
2585
2586 return status;
2587}
2588
2589static
2590int fixlower(PyUnicodeObject *self)
2591{
2592 int len = self->length;
2593 Py_UNICODE *s = self->str;
2594 int status = 0;
2595
2596 while (len-- > 0) {
2597 register Py_UNICODE ch;
2598
2599 ch = Py_UNICODE_TOLOWER(*s);
2600 if (ch != *s) {
2601 status = 1;
2602 *s = ch;
2603 }
2604 s++;
2605 }
2606
2607 return status;
2608}
2609
2610static
2611int fixswapcase(PyUnicodeObject *self)
2612{
2613 int len = self->length;
2614 Py_UNICODE *s = self->str;
2615 int status = 0;
2616
2617 while (len-- > 0) {
2618 if (Py_UNICODE_ISUPPER(*s)) {
2619 *s = Py_UNICODE_TOLOWER(*s);
2620 status = 1;
2621 } else if (Py_UNICODE_ISLOWER(*s)) {
2622 *s = Py_UNICODE_TOUPPER(*s);
2623 status = 1;
2624 }
2625 s++;
2626 }
2627
2628 return status;
2629}
2630
2631static
2632int fixcapitalize(PyUnicodeObject *self)
2633{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002634 int len = self->length;
2635 Py_UNICODE *s = self->str;
2636 int status = 0;
2637
2638 if (len == 0)
2639 return 0;
2640 if (Py_UNICODE_ISLOWER(*s)) {
2641 *s = Py_UNICODE_TOUPPER(*s);
2642 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002644 s++;
2645 while (--len > 0) {
2646 if (Py_UNICODE_ISUPPER(*s)) {
2647 *s = Py_UNICODE_TOLOWER(*s);
2648 status = 1;
2649 }
2650 s++;
2651 }
2652 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653}
2654
2655static
2656int fixtitle(PyUnicodeObject *self)
2657{
2658 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2659 register Py_UNICODE *e;
2660 int previous_is_cased;
2661
2662 /* Shortcut for single character strings */
2663 if (PyUnicode_GET_SIZE(self) == 1) {
2664 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2665 if (*p != ch) {
2666 *p = ch;
2667 return 1;
2668 }
2669 else
2670 return 0;
2671 }
2672
2673 e = p + PyUnicode_GET_SIZE(self);
2674 previous_is_cased = 0;
2675 for (; p < e; p++) {
2676 register const Py_UNICODE ch = *p;
2677
2678 if (previous_is_cased)
2679 *p = Py_UNICODE_TOLOWER(ch);
2680 else
2681 *p = Py_UNICODE_TOTITLE(ch);
2682
2683 if (Py_UNICODE_ISLOWER(ch) ||
2684 Py_UNICODE_ISUPPER(ch) ||
2685 Py_UNICODE_ISTITLE(ch))
2686 previous_is_cased = 1;
2687 else
2688 previous_is_cased = 0;
2689 }
2690 return 1;
2691}
2692
2693PyObject *PyUnicode_Join(PyObject *separator,
2694 PyObject *seq)
2695{
2696 Py_UNICODE *sep;
2697 int seplen;
2698 PyUnicodeObject *res = NULL;
2699 int reslen = 0;
2700 Py_UNICODE *p;
2701 int seqlen = 0;
2702 int sz = 100;
2703 int i;
2704
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002705 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 if (seqlen < 0 && PyErr_Occurred())
2707 return NULL;
2708
2709 if (separator == NULL) {
2710 Py_UNICODE blank = ' ';
2711 sep = &blank;
2712 seplen = 1;
2713 }
2714 else {
2715 separator = PyUnicode_FromObject(separator);
2716 if (separator == NULL)
2717 return NULL;
2718 sep = PyUnicode_AS_UNICODE(separator);
2719 seplen = PyUnicode_GET_SIZE(separator);
2720 }
2721
2722 res = _PyUnicode_New(sz);
2723 if (res == NULL)
2724 goto onError;
2725 p = PyUnicode_AS_UNICODE(res);
2726 reslen = 0;
2727
2728 for (i = 0; i < seqlen; i++) {
2729 int itemlen;
2730 PyObject *item;
2731
2732 item = PySequence_GetItem(seq, i);
2733 if (item == NULL)
2734 goto onError;
2735 if (!PyUnicode_Check(item)) {
2736 PyObject *v;
2737 v = PyUnicode_FromObject(item);
2738 Py_DECREF(item);
2739 item = v;
2740 if (item == NULL)
2741 goto onError;
2742 }
2743 itemlen = PyUnicode_GET_SIZE(item);
2744 while (reslen + itemlen + seplen >= sz) {
2745 if (_PyUnicode_Resize(res, sz*2))
2746 goto onError;
2747 sz *= 2;
2748 p = PyUnicode_AS_UNICODE(res) + reslen;
2749 }
2750 if (i > 0) {
2751 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2752 p += seplen;
2753 reslen += seplen;
2754 }
2755 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2756 p += itemlen;
2757 reslen += itemlen;
2758 Py_DECREF(item);
2759 }
2760 if (_PyUnicode_Resize(res, reslen))
2761 goto onError;
2762
2763 Py_XDECREF(separator);
2764 return (PyObject *)res;
2765
2766 onError:
2767 Py_XDECREF(separator);
2768 Py_DECREF(res);
2769 return NULL;
2770}
2771
2772static
2773PyUnicodeObject *pad(PyUnicodeObject *self,
2774 int left,
2775 int right,
2776 Py_UNICODE fill)
2777{
2778 PyUnicodeObject *u;
2779
2780 if (left < 0)
2781 left = 0;
2782 if (right < 0)
2783 right = 0;
2784
2785 if (left == 0 && right == 0) {
2786 Py_INCREF(self);
2787 return self;
2788 }
2789
2790 u = _PyUnicode_New(left + self->length + right);
2791 if (u) {
2792 if (left)
2793 Py_UNICODE_FILL(u->str, fill, left);
2794 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2795 if (right)
2796 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2797 }
2798
2799 return u;
2800}
2801
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The expanding function must provide a `PyObject *str` scratch
   variable, a `PyObject *list` and an `onError:` label, which is
   jumped to when either the allocation or the append fails.  The body
   is wrapped in do { } while (0) so that the macro behaves as a single
   statement; write a terminating semicolon at the use site. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2812
2813static
2814PyObject *split_whitespace(PyUnicodeObject *self,
2815 PyObject *list,
2816 int maxcount)
2817{
2818 register int i;
2819 register int j;
2820 int len = self->length;
2821 PyObject *str;
2822
2823 for (i = j = 0; i < len; ) {
2824 /* find a token */
2825 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2826 i++;
2827 j = i;
2828 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2829 i++;
2830 if (j < i) {
2831 if (maxcount-- <= 0)
2832 break;
2833 SPLIT_APPEND(self->str, j, i);
2834 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2835 i++;
2836 j = i;
2837 }
2838 }
2839 if (j < len) {
2840 SPLIT_APPEND(self->str, j, len);
2841 }
2842 return list;
2843
2844 onError:
2845 Py_DECREF(list);
2846 return NULL;
2847}
2848
2849PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002850 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851{
2852 register int i;
2853 register int j;
2854 int len;
2855 PyObject *list;
2856 PyObject *str;
2857 Py_UNICODE *data;
2858
2859 string = PyUnicode_FromObject(string);
2860 if (string == NULL)
2861 return NULL;
2862 data = PyUnicode_AS_UNICODE(string);
2863 len = PyUnicode_GET_SIZE(string);
2864
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 list = PyList_New(0);
2866 if (!list)
2867 goto onError;
2868
2869 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002870 int eol;
2871
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 /* Find a line and append it */
2873 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2874 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
2876 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002877 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 if (i < len) {
2879 if (data[i] == '\r' && i + 1 < len &&
2880 data[i+1] == '\n')
2881 i += 2;
2882 else
2883 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002884 if (keepends)
2885 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 }
Guido van Rossum86662912000-04-11 15:38:46 +00002887 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 j = i;
2889 }
2890 if (j < len) {
2891 SPLIT_APPEND(data, j, len);
2892 }
2893
2894 Py_DECREF(string);
2895 return list;
2896
2897 onError:
2898 Py_DECREF(list);
2899 Py_DECREF(string);
2900 return NULL;
2901}
2902
2903static
2904PyObject *split_char(PyUnicodeObject *self,
2905 PyObject *list,
2906 Py_UNICODE ch,
2907 int maxcount)
2908{
2909 register int i;
2910 register int j;
2911 int len = self->length;
2912 PyObject *str;
2913
2914 for (i = j = 0; i < len; ) {
2915 if (self->str[i] == ch) {
2916 if (maxcount-- <= 0)
2917 break;
2918 SPLIT_APPEND(self->str, j, i);
2919 i = j = i + 1;
2920 } else
2921 i++;
2922 }
2923 if (j <= len) {
2924 SPLIT_APPEND(self->str, j, len);
2925 }
2926 return list;
2927
2928 onError:
2929 Py_DECREF(list);
2930 return NULL;
2931}
2932
2933static
2934PyObject *split_substring(PyUnicodeObject *self,
2935 PyObject *list,
2936 PyUnicodeObject *substring,
2937 int maxcount)
2938{
2939 register int i;
2940 register int j;
2941 int len = self->length;
2942 int sublen = substring->length;
2943 PyObject *str;
2944
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002945 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 if (Py_UNICODE_MATCH(self, i, substring)) {
2947 if (maxcount-- <= 0)
2948 break;
2949 SPLIT_APPEND(self->str, j, i);
2950 i = j = i + sublen;
2951 } else
2952 i++;
2953 }
2954 if (j <= len) {
2955 SPLIT_APPEND(self->str, j, len);
2956 }
2957 return list;
2958
2959 onError:
2960 Py_DECREF(list);
2961 return NULL;
2962}
2963
2964#undef SPLIT_APPEND
2965
2966static
2967PyObject *split(PyUnicodeObject *self,
2968 PyUnicodeObject *substring,
2969 int maxcount)
2970{
2971 PyObject *list;
2972
2973 if (maxcount < 0)
2974 maxcount = INT_MAX;
2975
2976 list = PyList_New(0);
2977 if (!list)
2978 return NULL;
2979
2980 if (substring == NULL)
2981 return split_whitespace(self,list,maxcount);
2982
2983 else if (substring->length == 1)
2984 return split_char(self,list,substring->str[0],maxcount);
2985
2986 else if (substring->length == 0) {
2987 Py_DECREF(list);
2988 PyErr_SetString(PyExc_ValueError, "empty separator");
2989 return NULL;
2990 }
2991 else
2992 return split_substring(self,list,substring,maxcount);
2993}
2994
2995static
2996PyObject *strip(PyUnicodeObject *self,
2997 int left,
2998 int right)
2999{
3000 Py_UNICODE *p = self->str;
3001 int start = 0;
3002 int end = self->length;
3003
3004 if (left)
3005 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3006 start++;
3007
3008 if (right)
3009 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3010 end--;
3011
3012 if (start == 0 && end == self->length) {
3013 /* couldn't strip anything off, return original string */
3014 Py_INCREF(self);
3015 return (PyObject*) self;
3016 }
3017
3018 return (PyObject*) PyUnicode_FromUnicode(
3019 self->str + start,
3020 end - start
3021 );
3022}
3023
3024static
3025PyObject *replace(PyUnicodeObject *self,
3026 PyUnicodeObject *str1,
3027 PyUnicodeObject *str2,
3028 int maxcount)
3029{
3030 PyUnicodeObject *u;
3031
3032 if (maxcount < 0)
3033 maxcount = INT_MAX;
3034
3035 if (str1->length == 1 && str2->length == 1) {
3036 int i;
3037
3038 /* replace characters */
3039 if (!findchar(self->str, self->length, str1->str[0])) {
3040 /* nothing to replace, return original string */
3041 Py_INCREF(self);
3042 u = self;
3043 } else {
3044 Py_UNICODE u1 = str1->str[0];
3045 Py_UNICODE u2 = str2->str[0];
3046
3047 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3048 self->str,
3049 self->length
3050 );
3051 if (u)
3052 for (i = 0; i < u->length; i++)
3053 if (u->str[i] == u1) {
3054 if (--maxcount < 0)
3055 break;
3056 u->str[i] = u2;
3057 }
3058 }
3059
3060 } else {
3061 int n, i;
3062 Py_UNICODE *p;
3063
3064 /* replace strings */
3065 n = count(self, 0, self->length, str1);
3066 if (n > maxcount)
3067 n = maxcount;
3068 if (n == 0) {
3069 /* nothing to replace, return original string */
3070 Py_INCREF(self);
3071 u = self;
3072 } else {
3073 u = _PyUnicode_New(
3074 self->length + n * (str2->length - str1->length));
3075 if (u) {
3076 i = 0;
3077 p = u->str;
3078 while (i <= self->length - str1->length)
3079 if (Py_UNICODE_MATCH(self, i, str1)) {
3080 /* replace string segment */
3081 Py_UNICODE_COPY(p, str2->str, str2->length);
3082 p += str2->length;
3083 i += str1->length;
3084 if (--n <= 0) {
3085 /* copy remaining part */
3086 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3087 break;
3088 }
3089 } else
3090 *p++ = self->str[i++];
3091 }
3092 }
3093 }
3094
3095 return (PyObject *) u;
3096}
3097
3098/* --- Unicode Object Methods --------------------------------------------- */
3099
3100static char title__doc__[] =
3101"S.title() -> unicode\n\
3102\n\
3103Return a titlecased version of S, i.e. words start with title case\n\
3104characters, all remaining cased characters have lower case.";
3105
3106static PyObject*
3107unicode_title(PyUnicodeObject *self, PyObject *args)
3108{
3109 if (!PyArg_NoArgs(args))
3110 return NULL;
3111 return fixup(self, fixtitle);
3112}
3113
3114static char capitalize__doc__[] =
3115"S.capitalize() -> unicode\n\
3116\n\
3117Return a capitalized version of S, i.e. make the first character\n\
3118have upper case.";
3119
3120static PyObject*
3121unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3122{
3123 if (!PyArg_NoArgs(args))
3124 return NULL;
3125 return fixup(self, fixcapitalize);
3126}
3127
#if 0
static char capwords__doc__[] =
    "S.capwords() -> unicode\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords(): split on whitespace, capitalize every word via
   fixup()/fixcapitalize, and join the words with the default
   separator.  Compiled out by the surrounding #if 0. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                               fixcapitalize);
        if (word == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3168
3169static char center__doc__[] =
3170"S.center(width) -> unicode\n\
3171\n\
3172Return S centered in a Unicode string of length width. Padding is done\n\
3173using spaces.";
3174
3175static PyObject *
3176unicode_center(PyUnicodeObject *self, PyObject *args)
3177{
3178 int marg, left;
3179 int width;
3180
3181 if (!PyArg_ParseTuple(args, "i:center", &width))
3182 return NULL;
3183
3184 if (self->length >= width) {
3185 Py_INCREF(self);
3186 return (PyObject*) self;
3187 }
3188
3189 marg = width - self->length;
3190 left = marg / 2 + (marg & width & 1);
3191
3192 return (PyObject*) pad(self, left, marg - left, ' ');
3193}
3194
Marc-André Lemburge5034372000-08-08 08:04:29 +00003195#if 0
3196
3197/* This code should go into some future Unicode collation support
3198 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003199 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003200
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003201/* speedy UTF-16 code point order comparison */
3202/* gleaned from: */
3203/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3204
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003205static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003206{
3207 0, 0, 0, 0, 0, 0, 0, 0,
3208 0, 0, 0, 0, 0, 0, 0, 0,
3209 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003210 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003211};
3212
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213static int
3214unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3215{
3216 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003217
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 Py_UNICODE *s1 = str1->str;
3219 Py_UNICODE *s2 = str2->str;
3220
3221 len1 = str1->length;
3222 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003223
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003225 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003226 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003227
3228 c1 = *s1++;
3229 c2 = *s2++;
3230 if (c1 > (1<<11) * 26)
3231 c1 += utf16Fixup[c1>>11];
3232 if (c2 > (1<<11) * 26)
3233 c2 += utf16Fixup[c2>>11];
3234
3235 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003236 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003237 if (diff)
3238 return (diff < 0) ? -1 : (diff != 0);
3239 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 }
3241
3242 return (len1 < len2) ? -1 : (len1 != len2);
3243}
3244
Marc-André Lemburge5034372000-08-08 08:04:29 +00003245#else
3246
3247static int
3248unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3249{
3250 register int len1, len2;
3251
3252 Py_UNICODE *s1 = str1->str;
3253 Py_UNICODE *s2 = str2->str;
3254
3255 len1 = str1->length;
3256 len2 = str2->length;
3257
3258 while (len1 > 0 && len2 > 0) {
3259 register long diff;
3260
3261 diff = (long)*s1++ - (long)*s2++;
3262 if (diff)
3263 return (diff < 0) ? -1 : (diff != 0);
3264 len1--; len2--;
3265 }
3266
3267 return (len1 < len2) ? -1 : (len1 != len2);
3268}
3269
3270#endif
3271
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272int PyUnicode_Compare(PyObject *left,
3273 PyObject *right)
3274{
3275 PyUnicodeObject *u = NULL, *v = NULL;
3276 int result;
3277
3278 /* Coerce the two arguments */
3279 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3280 if (u == NULL)
3281 goto onError;
3282 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3283 if (v == NULL)
3284 goto onError;
3285
Thomas Wouters7e474022000-07-16 12:04:32 +00003286 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 if (v == u) {
3288 Py_DECREF(u);
3289 Py_DECREF(v);
3290 return 0;
3291 }
3292
3293 result = unicode_compare(u, v);
3294
3295 Py_DECREF(u);
3296 Py_DECREF(v);
3297 return result;
3298
3299onError:
3300 Py_XDECREF(u);
3301 Py_XDECREF(v);
3302 return -1;
3303}
3304
Guido van Rossum403d68b2000-03-13 15:55:09 +00003305int PyUnicode_Contains(PyObject *container,
3306 PyObject *element)
3307{
3308 PyUnicodeObject *u = NULL, *v = NULL;
3309 int result;
3310 register const Py_UNICODE *p, *e;
3311 register Py_UNICODE ch;
3312
3313 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003314 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003315 if (v == NULL) {
3316 PyErr_SetString(PyExc_TypeError,
3317 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003318 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003319 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003320 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3321 if (u == NULL) {
3322 Py_DECREF(v);
3323 goto onError;
3324 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003325
3326 /* Check v in u */
3327 if (PyUnicode_GET_SIZE(v) != 1) {
3328 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003329 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003330 goto onError;
3331 }
3332 ch = *PyUnicode_AS_UNICODE(v);
3333 p = PyUnicode_AS_UNICODE(u);
3334 e = p + PyUnicode_GET_SIZE(u);
3335 result = 0;
3336 while (p < e) {
3337 if (*p++ == ch) {
3338 result = 1;
3339 break;
3340 }
3341 }
3342
3343 Py_DECREF(u);
3344 Py_DECREF(v);
3345 return result;
3346
3347onError:
3348 Py_XDECREF(u);
3349 Py_XDECREF(v);
3350 return -1;
3351}
3352
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353/* Concat to string or Unicode object giving a new Unicode object. */
3354
3355PyObject *PyUnicode_Concat(PyObject *left,
3356 PyObject *right)
3357{
3358 PyUnicodeObject *u = NULL, *v = NULL, *w;
3359
3360 /* Coerce the two arguments */
3361 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3362 if (u == NULL)
3363 goto onError;
3364 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3365 if (v == NULL)
3366 goto onError;
3367
3368 /* Shortcuts */
3369 if (v == unicode_empty) {
3370 Py_DECREF(v);
3371 return (PyObject *)u;
3372 }
3373 if (u == unicode_empty) {
3374 Py_DECREF(u);
3375 return (PyObject *)v;
3376 }
3377
3378 /* Concat the two Unicode strings */
3379 w = _PyUnicode_New(u->length + v->length);
3380 if (w == NULL)
3381 goto onError;
3382 Py_UNICODE_COPY(w->str, u->str, u->length);
3383 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3384
3385 Py_DECREF(u);
3386 Py_DECREF(v);
3387 return (PyObject *)w;
3388
3389onError:
3390 Py_XDECREF(u);
3391 Py_XDECREF(v);
3392 return NULL;
3393}
3394
3395static char count__doc__[] =
3396"S.count(sub[, start[, end]]) -> int\n\
3397\n\
3398Return the number of occurrences of substring sub in Unicode string\n\
3399S[start:end]. Optional arguments start and end are\n\
3400interpreted as in slice notation.";
3401
3402static PyObject *
3403unicode_count(PyUnicodeObject *self, PyObject *args)
3404{
3405 PyUnicodeObject *substring;
3406 int start = 0;
3407 int end = INT_MAX;
3408 PyObject *result;
3409
Guido van Rossumb8872e62000-05-09 14:14:27 +00003410 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3411 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 return NULL;
3413
3414 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3415 (PyObject *)substring);
3416 if (substring == NULL)
3417 return NULL;
3418
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 if (start < 0)
3420 start += self->length;
3421 if (start < 0)
3422 start = 0;
3423 if (end > self->length)
3424 end = self->length;
3425 if (end < 0)
3426 end += self->length;
3427 if (end < 0)
3428 end = 0;
3429
3430 result = PyInt_FromLong((long) count(self, start, end, substring));
3431
3432 Py_DECREF(substring);
3433 return result;
3434}
3435
3436static char encode__doc__[] =
3437"S.encode([encoding[,errors]]) -> string\n\
3438\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003439Return an encoded string version of S. Default encoding is the current\n\
3440default string encoding. errors may be given to set a different error\n\
3441handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3442a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443
3444static PyObject *
3445unicode_encode(PyUnicodeObject *self, PyObject *args)
3446{
3447 char *encoding = NULL;
3448 char *errors = NULL;
3449 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3450 return NULL;
3451 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3452}
3453
3454static char expandtabs__doc__[] =
3455"S.expandtabs([tabsize]) -> unicode\n\
3456\n\
3457Return a copy of S where all tab characters are expanded using spaces.\n\
3458If tabsize is not given, a tab size of 8 characters is assumed.";
3459
3460static PyObject*
3461unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3462{
3463 Py_UNICODE *e;
3464 Py_UNICODE *p;
3465 Py_UNICODE *q;
3466 int i, j;
3467 PyUnicodeObject *u;
3468 int tabsize = 8;
3469
3470 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3471 return NULL;
3472
Thomas Wouters7e474022000-07-16 12:04:32 +00003473 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 i = j = 0;
3475 e = self->str + self->length;
3476 for (p = self->str; p < e; p++)
3477 if (*p == '\t') {
3478 if (tabsize > 0)
3479 j += tabsize - (j % tabsize);
3480 }
3481 else {
3482 j++;
3483 if (*p == '\n' || *p == '\r') {
3484 i += j;
3485 j = 0;
3486 }
3487 }
3488
3489 /* Second pass: create output string and fill it */
3490 u = _PyUnicode_New(i + j);
3491 if (!u)
3492 return NULL;
3493
3494 j = 0;
3495 q = u->str;
3496
3497 for (p = self->str; p < e; p++)
3498 if (*p == '\t') {
3499 if (tabsize > 0) {
3500 i = tabsize - (j % tabsize);
3501 j += i;
3502 while (i--)
3503 *q++ = ' ';
3504 }
3505 }
3506 else {
3507 j++;
3508 *q++ = *p;
3509 if (*p == '\n' || *p == '\r')
3510 j = 0;
3511 }
3512
3513 return (PyObject*) u;
3514}
3515
3516static char find__doc__[] =
3517"S.find(sub [,start [,end]]) -> int\n\
3518\n\
3519Return the lowest index in S where substring sub is found,\n\
3520such that sub is contained within s[start,end]. Optional\n\
3521arguments start and end are interpreted as in slice notation.\n\
3522\n\
3523Return -1 on failure.";
3524
3525static PyObject *
3526unicode_find(PyUnicodeObject *self, PyObject *args)
3527{
3528 PyUnicodeObject *substring;
3529 int start = 0;
3530 int end = INT_MAX;
3531 PyObject *result;
3532
Guido van Rossumb8872e62000-05-09 14:14:27 +00003533 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3534 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 return NULL;
3536 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3537 (PyObject *)substring);
3538 if (substring == NULL)
3539 return NULL;
3540
3541 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3542
3543 Py_DECREF(substring);
3544 return result;
3545}
3546
3547static PyObject *
3548unicode_getitem(PyUnicodeObject *self, int index)
3549{
3550 if (index < 0 || index >= self->length) {
3551 PyErr_SetString(PyExc_IndexError, "string index out of range");
3552 return NULL;
3553 }
3554
3555 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3556}
3557
3558static long
3559unicode_hash(PyUnicodeObject *self)
3560{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003561 /* Since Unicode objects compare equal to their ASCII string
3562 counterparts, they should use the individual character values
3563 as basis for their hash value. This is needed to assure that
3564 strings and Unicode objects behave in the same way as
3565 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566
Fredrik Lundhdde61642000-07-10 18:27:47 +00003567 register int len;
3568 register Py_UNICODE *p;
3569 register long x;
3570
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 if (self->hash != -1)
3572 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003573 len = PyUnicode_GET_SIZE(self);
3574 p = PyUnicode_AS_UNICODE(self);
3575 x = *p << 7;
3576 while (--len >= 0)
3577 x = (1000003*x) ^ *p++;
3578 x ^= PyUnicode_GET_SIZE(self);
3579 if (x == -1)
3580 x = -2;
3581 self->hash = x;
3582 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583}
3584
3585static char index__doc__[] =
3586"S.index(sub [,start [,end]]) -> int\n\
3587\n\
3588Like S.find() but raise ValueError when the substring is not found.";
3589
3590static PyObject *
3591unicode_index(PyUnicodeObject *self, PyObject *args)
3592{
3593 int result;
3594 PyUnicodeObject *substring;
3595 int start = 0;
3596 int end = INT_MAX;
3597
Guido van Rossumb8872e62000-05-09 14:14:27 +00003598 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3599 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 return NULL;
3601
3602 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3603 (PyObject *)substring);
3604 if (substring == NULL)
3605 return NULL;
3606
3607 result = findstring(self, substring, start, end, 1);
3608
3609 Py_DECREF(substring);
3610 if (result < 0) {
3611 PyErr_SetString(PyExc_ValueError, "substring not found");
3612 return NULL;
3613 }
3614 return PyInt_FromLong(result);
3615}
3616
3617static char islower__doc__[] =
3618"S.islower() -> int\n\
3619\n\
3620Return 1 if all cased characters in S are lowercase and there is\n\
3621at least one cased character in S, 0 otherwise.";
3622
3623static PyObject*
3624unicode_islower(PyUnicodeObject *self, PyObject *args)
3625{
3626 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3627 register const Py_UNICODE *e;
3628 int cased;
3629
3630 if (!PyArg_NoArgs(args))
3631 return NULL;
3632
3633 /* Shortcut for single character strings */
3634 if (PyUnicode_GET_SIZE(self) == 1)
3635 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3636
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003637 /* Special case for empty strings */
3638 if (PyString_GET_SIZE(self) == 0)
3639 return PyInt_FromLong(0);
3640
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 e = p + PyUnicode_GET_SIZE(self);
3642 cased = 0;
3643 for (; p < e; p++) {
3644 register const Py_UNICODE ch = *p;
3645
3646 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3647 return PyInt_FromLong(0);
3648 else if (!cased && Py_UNICODE_ISLOWER(ch))
3649 cased = 1;
3650 }
3651 return PyInt_FromLong(cased);
3652}
3653
3654static char isupper__doc__[] =
3655"S.isupper() -> int\n\
3656\n\
3657Return 1 if all cased characters in S are uppercase and there is\n\
3658at least one cased character in S, 0 otherwise.";
3659
3660static PyObject*
3661unicode_isupper(PyUnicodeObject *self, PyObject *args)
3662{
3663 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3664 register const Py_UNICODE *e;
3665 int cased;
3666
3667 if (!PyArg_NoArgs(args))
3668 return NULL;
3669
3670 /* Shortcut for single character strings */
3671 if (PyUnicode_GET_SIZE(self) == 1)
3672 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3673
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003674 /* Special case for empty strings */
3675 if (PyString_GET_SIZE(self) == 0)
3676 return PyInt_FromLong(0);
3677
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 e = p + PyUnicode_GET_SIZE(self);
3679 cased = 0;
3680 for (; p < e; p++) {
3681 register const Py_UNICODE ch = *p;
3682
3683 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3684 return PyInt_FromLong(0);
3685 else if (!cased && Py_UNICODE_ISUPPER(ch))
3686 cased = 1;
3687 }
3688 return PyInt_FromLong(cased);
3689}
3690
3691static char istitle__doc__[] =
3692"S.istitle() -> int\n\
3693\n\
3694Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3695may only follow uncased characters and lowercase characters only cased\n\
3696ones. Return 0 otherwise.";
3697
3698static PyObject*
3699unicode_istitle(PyUnicodeObject *self, PyObject *args)
3700{
3701 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3702 register const Py_UNICODE *e;
3703 int cased, previous_is_cased;
3704
3705 if (!PyArg_NoArgs(args))
3706 return NULL;
3707
3708 /* Shortcut for single character strings */
3709 if (PyUnicode_GET_SIZE(self) == 1)
3710 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3711 (Py_UNICODE_ISUPPER(*p) != 0));
3712
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003713 /* Special case for empty strings */
3714 if (PyString_GET_SIZE(self) == 0)
3715 return PyInt_FromLong(0);
3716
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 e = p + PyUnicode_GET_SIZE(self);
3718 cased = 0;
3719 previous_is_cased = 0;
3720 for (; p < e; p++) {
3721 register const Py_UNICODE ch = *p;
3722
3723 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3724 if (previous_is_cased)
3725 return PyInt_FromLong(0);
3726 previous_is_cased = 1;
3727 cased = 1;
3728 }
3729 else if (Py_UNICODE_ISLOWER(ch)) {
3730 if (!previous_is_cased)
3731 return PyInt_FromLong(0);
3732 previous_is_cased = 1;
3733 cased = 1;
3734 }
3735 else
3736 previous_is_cased = 0;
3737 }
3738 return PyInt_FromLong(cased);
3739}
3740
3741static char isspace__doc__[] =
3742"S.isspace() -> int\n\
3743\n\
3744Return 1 if there are only whitespace characters in S,\n\
37450 otherwise.";
3746
3747static PyObject*
3748unicode_isspace(PyUnicodeObject *self, PyObject *args)
3749{
3750 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3751 register const Py_UNICODE *e;
3752
3753 if (!PyArg_NoArgs(args))
3754 return NULL;
3755
3756 /* Shortcut for single character strings */
3757 if (PyUnicode_GET_SIZE(self) == 1 &&
3758 Py_UNICODE_ISSPACE(*p))
3759 return PyInt_FromLong(1);
3760
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003761 /* Special case for empty strings */
3762 if (PyString_GET_SIZE(self) == 0)
3763 return PyInt_FromLong(0);
3764
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 e = p + PyUnicode_GET_SIZE(self);
3766 for (; p < e; p++) {
3767 if (!Py_UNICODE_ISSPACE(*p))
3768 return PyInt_FromLong(0);
3769 }
3770 return PyInt_FromLong(1);
3771}
3772
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003773static char isalpha__doc__[] =
3774"S.isalpha() -> int\n\
3775\n\
3776Return 1 if all characters in S are alphabetic\n\
3777and there is at least one character in S, 0 otherwise.";
3778
3779static PyObject*
3780unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3781{
3782 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3783 register const Py_UNICODE *e;
3784
3785 if (!PyArg_NoArgs(args))
3786 return NULL;
3787
3788 /* Shortcut for single character strings */
3789 if (PyUnicode_GET_SIZE(self) == 1 &&
3790 Py_UNICODE_ISALPHA(*p))
3791 return PyInt_FromLong(1);
3792
3793 /* Special case for empty strings */
3794 if (PyString_GET_SIZE(self) == 0)
3795 return PyInt_FromLong(0);
3796
3797 e = p + PyUnicode_GET_SIZE(self);
3798 for (; p < e; p++) {
3799 if (!Py_UNICODE_ISALPHA(*p))
3800 return PyInt_FromLong(0);
3801 }
3802 return PyInt_FromLong(1);
3803}
3804
3805static char isalnum__doc__[] =
3806"S.isalnum() -> int\n\
3807\n\
3808Return 1 if all characters in S are alphanumeric\n\
3809and there is at least one character in S, 0 otherwise.";
3810
3811static PyObject*
3812unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3813{
3814 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3815 register const Py_UNICODE *e;
3816
3817 if (!PyArg_NoArgs(args))
3818 return NULL;
3819
3820 /* Shortcut for single character strings */
3821 if (PyUnicode_GET_SIZE(self) == 1 &&
3822 Py_UNICODE_ISALNUM(*p))
3823 return PyInt_FromLong(1);
3824
3825 /* Special case for empty strings */
3826 if (PyString_GET_SIZE(self) == 0)
3827 return PyInt_FromLong(0);
3828
3829 e = p + PyUnicode_GET_SIZE(self);
3830 for (; p < e; p++) {
3831 if (!Py_UNICODE_ISALNUM(*p))
3832 return PyInt_FromLong(0);
3833 }
3834 return PyInt_FromLong(1);
3835}
3836
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837static char isdecimal__doc__[] =
3838"S.isdecimal() -> int\n\
3839\n\
3840Return 1 if there are only decimal characters in S,\n\
38410 otherwise.";
3842
3843static PyObject*
3844unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3845{
3846 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3847 register const Py_UNICODE *e;
3848
3849 if (!PyArg_NoArgs(args))
3850 return NULL;
3851
3852 /* Shortcut for single character strings */
3853 if (PyUnicode_GET_SIZE(self) == 1 &&
3854 Py_UNICODE_ISDECIMAL(*p))
3855 return PyInt_FromLong(1);
3856
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003857 /* Special case for empty strings */
3858 if (PyString_GET_SIZE(self) == 0)
3859 return PyInt_FromLong(0);
3860
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 e = p + PyUnicode_GET_SIZE(self);
3862 for (; p < e; p++) {
3863 if (!Py_UNICODE_ISDECIMAL(*p))
3864 return PyInt_FromLong(0);
3865 }
3866 return PyInt_FromLong(1);
3867}
3868
3869static char isdigit__doc__[] =
3870"S.isdigit() -> int\n\
3871\n\
3872Return 1 if there are only digit characters in S,\n\
38730 otherwise.";
3874
3875static PyObject*
3876unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3877{
3878 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3879 register const Py_UNICODE *e;
3880
3881 if (!PyArg_NoArgs(args))
3882 return NULL;
3883
3884 /* Shortcut for single character strings */
3885 if (PyUnicode_GET_SIZE(self) == 1 &&
3886 Py_UNICODE_ISDIGIT(*p))
3887 return PyInt_FromLong(1);
3888
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003889 /* Special case for empty strings */
3890 if (PyString_GET_SIZE(self) == 0)
3891 return PyInt_FromLong(0);
3892
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893 e = p + PyUnicode_GET_SIZE(self);
3894 for (; p < e; p++) {
3895 if (!Py_UNICODE_ISDIGIT(*p))
3896 return PyInt_FromLong(0);
3897 }
3898 return PyInt_FromLong(1);
3899}
3900
3901static char isnumeric__doc__[] =
3902"S.isnumeric() -> int\n\
3903\n\
3904Return 1 if there are only numeric characters in S,\n\
39050 otherwise.";
3906
3907static PyObject*
3908unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3909{
3910 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3911 register const Py_UNICODE *e;
3912
3913 if (!PyArg_NoArgs(args))
3914 return NULL;
3915
3916 /* Shortcut for single character strings */
3917 if (PyUnicode_GET_SIZE(self) == 1 &&
3918 Py_UNICODE_ISNUMERIC(*p))
3919 return PyInt_FromLong(1);
3920
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003921 /* Special case for empty strings */
3922 if (PyString_GET_SIZE(self) == 0)
3923 return PyInt_FromLong(0);
3924
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 e = p + PyUnicode_GET_SIZE(self);
3926 for (; p < e; p++) {
3927 if (!Py_UNICODE_ISNUMERIC(*p))
3928 return PyInt_FromLong(0);
3929 }
3930 return PyInt_FromLong(1);
3931}
3932
3933static char join__doc__[] =
3934"S.join(sequence) -> unicode\n\
3935\n\
3936Return a string which is the concatenation of the strings in the\n\
3937sequence. The separator between elements is S.";
3938
3939static PyObject*
3940unicode_join(PyUnicodeObject *self, PyObject *args)
3941{
3942 PyObject *data;
3943 if (!PyArg_ParseTuple(args, "O:join", &data))
3944 return NULL;
3945
3946 return PyUnicode_Join((PyObject *)self, data);
3947}
3948
3949static int
3950unicode_length(PyUnicodeObject *self)
3951{
3952 return self->length;
3953}
3954
3955static char ljust__doc__[] =
3956"S.ljust(width) -> unicode\n\
3957\n\
3958Return S left justified in a Unicode string of length width. Padding is\n\
3959done using spaces.";
3960
3961static PyObject *
3962unicode_ljust(PyUnicodeObject *self, PyObject *args)
3963{
3964 int width;
3965 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3966 return NULL;
3967
3968 if (self->length >= width) {
3969 Py_INCREF(self);
3970 return (PyObject*) self;
3971 }
3972
3973 return (PyObject*) pad(self, 0, width - self->length, ' ');
3974}
3975
3976static char lower__doc__[] =
3977"S.lower() -> unicode\n\
3978\n\
3979Return a copy of the string S converted to lowercase.";
3980
3981static PyObject*
3982unicode_lower(PyUnicodeObject *self, PyObject *args)
3983{
3984 if (!PyArg_NoArgs(args))
3985 return NULL;
3986 return fixup(self, fixlower);
3987}
3988
3989static char lstrip__doc__[] =
3990"S.lstrip() -> unicode\n\
3991\n\
3992Return a copy of the string S with leading whitespace removed.";
3993
3994static PyObject *
3995unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3996{
3997 if (!PyArg_NoArgs(args))
3998 return NULL;
3999 return strip(self, 1, 0);
4000}
4001
4002static PyObject*
4003unicode_repeat(PyUnicodeObject *str, int len)
4004{
4005 PyUnicodeObject *u;
4006 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004007 int nchars;
4008 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009
4010 if (len < 0)
4011 len = 0;
4012
4013 if (len == 1) {
4014 /* no repeat, return original string */
4015 Py_INCREF(str);
4016 return (PyObject*) str;
4017 }
Tim Peters8f422462000-09-09 06:13:41 +00004018
4019 /* ensure # of chars needed doesn't overflow int and # of bytes
4020 * needed doesn't overflow size_t
4021 */
4022 nchars = len * str->length;
4023 if (len && nchars / len != str->length) {
4024 PyErr_SetString(PyExc_OverflowError,
4025 "repeated string is too long");
4026 return NULL;
4027 }
4028 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4029 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4030 PyErr_SetString(PyExc_OverflowError,
4031 "repeated string is too long");
4032 return NULL;
4033 }
4034 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 if (!u)
4036 return NULL;
4037
4038 p = u->str;
4039
4040 while (len-- > 0) {
4041 Py_UNICODE_COPY(p, str->str, str->length);
4042 p += str->length;
4043 }
4044
4045 return (PyObject*) u;
4046}
4047
4048PyObject *PyUnicode_Replace(PyObject *obj,
4049 PyObject *subobj,
4050 PyObject *replobj,
4051 int maxcount)
4052{
4053 PyObject *self;
4054 PyObject *str1;
4055 PyObject *str2;
4056 PyObject *result;
4057
4058 self = PyUnicode_FromObject(obj);
4059 if (self == NULL)
4060 return NULL;
4061 str1 = PyUnicode_FromObject(subobj);
4062 if (str1 == NULL) {
4063 Py_DECREF(self);
4064 return NULL;
4065 }
4066 str2 = PyUnicode_FromObject(replobj);
4067 if (str2 == NULL) {
4068 Py_DECREF(self);
4069 Py_DECREF(str1);
4070 return NULL;
4071 }
4072 result = replace((PyUnicodeObject *)self,
4073 (PyUnicodeObject *)str1,
4074 (PyUnicodeObject *)str2,
4075 maxcount);
4076 Py_DECREF(self);
4077 Py_DECREF(str1);
4078 Py_DECREF(str2);
4079 return result;
4080}
4081
4082static char replace__doc__[] =
4083"S.replace (old, new[, maxsplit]) -> unicode\n\
4084\n\
4085Return a copy of S with all occurrences of substring\n\
4086old replaced by new. If the optional argument maxsplit is\n\
4087given, only the first maxsplit occurrences are replaced.";
4088
4089static PyObject*
4090unicode_replace(PyUnicodeObject *self, PyObject *args)
4091{
4092 PyUnicodeObject *str1;
4093 PyUnicodeObject *str2;
4094 int maxcount = -1;
4095 PyObject *result;
4096
4097 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4098 return NULL;
4099 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4100 if (str1 == NULL)
4101 return NULL;
4102 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4103 if (str2 == NULL)
4104 return NULL;
4105
4106 result = replace(self, str1, str2, maxcount);
4107
4108 Py_DECREF(str1);
4109 Py_DECREF(str2);
4110 return result;
4111}
4112
4113static
4114PyObject *unicode_repr(PyObject *unicode)
4115{
4116 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4117 PyUnicode_GET_SIZE(unicode),
4118 1);
4119}
4120
4121static char rfind__doc__[] =
4122"S.rfind(sub [,start [,end]]) -> int\n\
4123\n\
4124Return the highest index in S where substring sub is found,\n\
4125such that sub is contained within s[start,end]. Optional\n\
4126arguments start and end are interpreted as in slice notation.\n\
4127\n\
4128Return -1 on failure.";
4129
4130static PyObject *
4131unicode_rfind(PyUnicodeObject *self, PyObject *args)
4132{
4133 PyUnicodeObject *substring;
4134 int start = 0;
4135 int end = INT_MAX;
4136 PyObject *result;
4137
Guido van Rossumb8872e62000-05-09 14:14:27 +00004138 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4139 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 return NULL;
4141 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4142 (PyObject *)substring);
4143 if (substring == NULL)
4144 return NULL;
4145
4146 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4147
4148 Py_DECREF(substring);
4149 return result;
4150}
4151
4152static char rindex__doc__[] =
4153"S.rindex(sub [,start [,end]]) -> int\n\
4154\n\
4155Like S.rfind() but raise ValueError when the substring is not found.";
4156
4157static PyObject *
4158unicode_rindex(PyUnicodeObject *self, PyObject *args)
4159{
4160 int result;
4161 PyUnicodeObject *substring;
4162 int start = 0;
4163 int end = INT_MAX;
4164
Guido van Rossumb8872e62000-05-09 14:14:27 +00004165 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4166 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 return NULL;
4168 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4169 (PyObject *)substring);
4170 if (substring == NULL)
4171 return NULL;
4172
4173 result = findstring(self, substring, start, end, -1);
4174
4175 Py_DECREF(substring);
4176 if (result < 0) {
4177 PyErr_SetString(PyExc_ValueError, "substring not found");
4178 return NULL;
4179 }
4180 return PyInt_FromLong(result);
4181}
4182
4183static char rjust__doc__[] =
4184"S.rjust(width) -> unicode\n\
4185\n\
4186Return S right justified in a Unicode string of length width. Padding is\n\
4187done using spaces.";
4188
4189static PyObject *
4190unicode_rjust(PyUnicodeObject *self, PyObject *args)
4191{
4192 int width;
4193 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4194 return NULL;
4195
4196 if (self->length >= width) {
4197 Py_INCREF(self);
4198 return (PyObject*) self;
4199 }
4200
4201 return (PyObject*) pad(self, width - self->length, 0, ' ');
4202}
4203
4204static char rstrip__doc__[] =
4205"S.rstrip() -> unicode\n\
4206\n\
4207Return a copy of the string S with trailing whitespace removed.";
4208
4209static PyObject *
4210unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4211{
4212 if (!PyArg_NoArgs(args))
4213 return NULL;
4214 return strip(self, 0, 1);
4215}
4216
4217static PyObject*
4218unicode_slice(PyUnicodeObject *self, int start, int end)
4219{
4220 /* standard clamping */
4221 if (start < 0)
4222 start = 0;
4223 if (end < 0)
4224 end = 0;
4225 if (end > self->length)
4226 end = self->length;
4227 if (start == 0 && end == self->length) {
4228 /* full slice, return original string */
4229 Py_INCREF(self);
4230 return (PyObject*) self;
4231 }
4232 if (start > end)
4233 start = end;
4234 /* copy slice */
4235 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4236 end - start);
4237}
4238
4239PyObject *PyUnicode_Split(PyObject *s,
4240 PyObject *sep,
4241 int maxsplit)
4242{
4243 PyObject *result;
4244
4245 s = PyUnicode_FromObject(s);
4246 if (s == NULL)
4247 return NULL;
4248 if (sep != NULL) {
4249 sep = PyUnicode_FromObject(sep);
4250 if (sep == NULL) {
4251 Py_DECREF(s);
4252 return NULL;
4253 }
4254 }
4255
4256 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4257
4258 Py_DECREF(s);
4259 Py_XDECREF(sep);
4260 return result;
4261}
4262
/* Docstring registered for the "split" entry of unicode_methods. */
static char split__doc__[] =
    "S.split([sep [,maxsplit]]) -> list of strings\n"
    "\n"
    "Return a list of the words in S, using sep as the\n"
    "delimiter string. If maxsplit is given, at most maxsplit\n"
    "splits are done. If sep is not specified, any whitespace string\n"
    "is a separator.";
4270
4271static PyObject*
4272unicode_split(PyUnicodeObject *self, PyObject *args)
4273{
4274 PyObject *substring = Py_None;
4275 int maxcount = -1;
4276
4277 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4278 return NULL;
4279
4280 if (substring == Py_None)
4281 return split(self, NULL, maxcount);
4282 else if (PyUnicode_Check(substring))
4283 return split(self, (PyUnicodeObject *)substring, maxcount);
4284 else
4285 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4286}
4287
/* Docstring registered for the "splitlines" entry of unicode_methods.
   The signature line documents the single optional argument; its
   brackets must balance. */
static char splitlines__doc__[] =
    "S.splitlines([keepends]) -> list of strings\n"
    "\n"
    "Return a list of the lines in S, breaking at line boundaries.\n"
    "Line breaks are not included in the resulting list unless keepends\n"
    "is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294
4295static PyObject*
4296unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4297{
Guido van Rossum86662912000-04-11 15:38:46 +00004298 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299
Guido van Rossum86662912000-04-11 15:38:46 +00004300 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 return NULL;
4302
Guido van Rossum86662912000-04-11 15:38:46 +00004303 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304}
4305
4306static
4307PyObject *unicode_str(PyUnicodeObject *self)
4308{
Fred Drakee4315f52000-05-09 19:53:39 +00004309 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310}
4311
/* Docstring registered for the "strip" entry of unicode_methods. */
static char strip__doc__[] =
    "S.strip() -> unicode\n"
    "\n"
    "Return a copy of S with leading and trailing whitespace removed.";
4316
4317static PyObject *
4318unicode_strip(PyUnicodeObject *self, PyObject *args)
4319{
4320 if (!PyArg_NoArgs(args))
4321 return NULL;
4322 return strip(self, 1, 1);
4323}
4324
/* Docstring registered for the "swapcase" entry of unicode_methods. */
static char swapcase__doc__[] =
    "S.swapcase() -> unicode\n"
    "\n"
    "Return a copy of S with uppercase characters converted to lowercase\n"
    "and vice versa.";
4330
4331static PyObject*
4332unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4333{
4334 if (!PyArg_NoArgs(args))
4335 return NULL;
4336 return fixup(self, fixswapcase);
4337}
4338
/* Docstring registered for the "translate" entry of unicode_methods. */
static char translate__doc__[] =
    "S.translate(table) -> unicode\n"
    "\n"
    "Return a copy of the string S, where all characters have been mapped\n"
    "through the given translation table, which must be a mapping of\n"
    "Unicode ordinals to Unicode ordinals or None. Unmapped characters\n"
    "are left untouched. Characters mapped to None are deleted.";
4346
4347static PyObject*
4348unicode_translate(PyUnicodeObject *self, PyObject *args)
4349{
4350 PyObject *table;
4351
4352 if (!PyArg_ParseTuple(args, "O:translate", &table))
4353 return NULL;
4354 return PyUnicode_TranslateCharmap(self->str,
4355 self->length,
4356 table,
4357 "ignore");
4358}
4359
/* Docstring registered for the "upper" entry of unicode_methods. */
static char upper__doc__[] =
    "S.upper() -> unicode\n"
    "\n"
    "Return a copy of S converted to uppercase.";
4364
4365static PyObject*
4366unicode_upper(PyUnicodeObject *self, PyObject *args)
4367{
4368 if (!PyArg_NoArgs(args))
4369 return NULL;
4370 return fixup(self, fixupper);
4371}
4372
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) implementation (currently compiled out).

   Parses one int argument.  If S is already at least `width` characters
   long, S itself is returned with a new reference.  Otherwise S is
   padded on the left with '0' via pad(); if the original first
   character was a sign ('+' or '-') it is moved in front of the zeros.
   Returns NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() builds a new object; do not touch u->str if that failed. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4408
#if 0
/* Debugging aid (compiled out): takes no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4418
/* Docstring registered for the "startswith" entry of unicode_methods. */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S starts with the specified prefix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4425
4426static PyObject *
4427unicode_startswith(PyUnicodeObject *self,
4428 PyObject *args)
4429{
4430 PyUnicodeObject *substring;
4431 int start = 0;
4432 int end = INT_MAX;
4433 PyObject *result;
4434
Guido van Rossumb8872e62000-05-09 14:14:27 +00004435 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4436 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 return NULL;
4438 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4439 (PyObject *)substring);
4440 if (substring == NULL)
4441 return NULL;
4442
4443 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4444
4445 Py_DECREF(substring);
4446 return result;
4447}
4448
4449
/* Docstring registered for the "endswith" entry of unicode_methods. */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S ends with the specified suffix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4456
4457static PyObject *
4458unicode_endswith(PyUnicodeObject *self,
4459 PyObject *args)
4460{
4461 PyUnicodeObject *substring;
4462 int start = 0;
4463 int end = INT_MAX;
4464 PyObject *result;
4465
Guido van Rossumb8872e62000-05-09 14:14:27 +00004466 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4467 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 return NULL;
4469 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4470 (PyObject *)substring);
4471 if (substring == NULL)
4472 return NULL;
4473
4474 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4475
4476 Py_DECREF(substring);
4477 return result;
4478}
4479
4480
4481static PyMethodDef unicode_methods[] = {
4482
4483 /* Order is according to common usage: often used methods should
4484 appear first, since lookup is done sequentially. */
4485
4486 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4487 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4488 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4489 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4490 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4491 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4492 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4493 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4494 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4495 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4496 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4497 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4498 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4499 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4500/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4501 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4502 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4503 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4504 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4505 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4506 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4507 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4508 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4509 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4510 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4511 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4512 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4513 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4514 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4515 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4516 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4517 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4518 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004519 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4520 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521#if 0
4522 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4523 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4524#endif
4525
4526#if 0
4527 /* This one is just used for debugging the implementation. */
4528 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4529#endif
4530
4531 {NULL, NULL}
4532};
4533
4534static PyObject *
4535unicode_getattr(PyUnicodeObject *self, char *name)
4536{
4537 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4538}
4539
4540static PySequenceMethods unicode_as_sequence = {
4541 (inquiry) unicode_length, /* sq_length */
4542 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4543 (intargfunc) unicode_repeat, /* sq_repeat */
4544 (intargfunc) unicode_getitem, /* sq_item */
4545 (intintargfunc) unicode_slice, /* sq_slice */
4546 0, /* sq_ass_item */
4547 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004548 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549};
4550
4551static int
4552unicode_buffer_getreadbuf(PyUnicodeObject *self,
4553 int index,
4554 const void **ptr)
4555{
4556 if (index != 0) {
4557 PyErr_SetString(PyExc_SystemError,
4558 "accessing non-existent unicode segment");
4559 return -1;
4560 }
4561 *ptr = (void *) self->str;
4562 return PyUnicode_GET_DATA_SIZE(self);
4563}
4564
4565static int
4566unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4567 const void **ptr)
4568{
4569 PyErr_SetString(PyExc_TypeError,
4570 "cannot use unicode as modifyable buffer");
4571 return -1;
4572}
4573
4574static int
4575unicode_buffer_getsegcount(PyUnicodeObject *self,
4576 int *lenp)
4577{
4578 if (lenp)
4579 *lenp = PyUnicode_GET_DATA_SIZE(self);
4580 return 1;
4581}
4582
4583static int
4584unicode_buffer_getcharbuf(PyUnicodeObject *self,
4585 int index,
4586 const void **ptr)
4587{
4588 PyObject *str;
4589
4590 if (index != 0) {
4591 PyErr_SetString(PyExc_SystemError,
4592 "accessing non-existent unicode segment");
4593 return -1;
4594 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004595 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 if (str == NULL)
4597 return -1;
4598 *ptr = (void *) PyString_AS_STRING(str);
4599 return PyString_GET_SIZE(str);
4600}
4601
4602/* Helpers for PyUnicode_Format() */
4603
4604static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004605getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606{
4607 int argidx = *p_argidx;
4608 if (argidx < arglen) {
4609 (*p_argidx)++;
4610 if (arglen < 0)
4611 return args;
4612 else
4613 return PyTuple_GetItem(args, argidx);
4614 }
4615 PyErr_SetString(PyExc_TypeError,
4616 "not enough arguments for format string");
4617 return NULL;
4618}
4619
/* Bit flags collected while parsing a format specifier; one bit per
   flag character ('-', '+', ' ', '#', '0'). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4625
4626static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628{
4629 register int i;
4630 int len;
4631 va_list va;
4632 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634
4635 /* First, format the string as char array, then expand to Py_UNICODE
4636 array. */
4637 charbuffer = (char *)buffer;
4638 len = vsprintf(charbuffer, format, va);
4639 for (i = len - 1; i >= 0; i--)
4640 buffer[i] = (Py_UNICODE) charbuffer[i];
4641
4642 va_end(va);
4643 return len;
4644}
4645
4646static int
4647formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004648 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 int flags,
4650 int prec,
4651 int type,
4652 PyObject *v)
4653{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004654 /* fmt = '%#.' + `prec` + `type`
4655 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656 char fmt[20];
4657 double x;
4658
4659 x = PyFloat_AsDouble(v);
4660 if (x == -1.0 && PyErr_Occurred())
4661 return -1;
4662 if (prec < 0)
4663 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4665 type = 'g';
4666 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004667 /* worst case length calc to ensure no buffer overrun:
4668 fmt = %#.<prec>g
4669 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4670 for any double rep.)
4671 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4672 If prec=0 the effective precision is 1 (the leading digit is
4673 always given), therefore increase by one to 10+prec. */
4674 if (buflen <= (size_t)10 + (size_t)prec) {
4675 PyErr_SetString(PyExc_OverflowError,
4676 "formatted float is too long (precision too long?)");
4677 return -1;
4678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 return usprintf(buf, fmt, x);
4680}
4681
Tim Peters38fd5b62000-09-21 05:43:11 +00004682static PyObject*
4683formatlong(PyObject *val, int flags, int prec, int type)
4684{
4685 char *buf;
4686 int i, len;
4687 PyObject *str; /* temporary string object. */
4688 PyUnicodeObject *result;
4689
4690 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4691 if (!str)
4692 return NULL;
4693 result = _PyUnicode_New(len);
4694 for (i = 0; i < len; i++)
4695 result->str[i] = buf[i];
4696 result->str[len] = 0;
4697 Py_DECREF(str);
4698 return (PyObject*)result;
4699}
4700
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701static int
4702formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004703 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 int flags,
4705 int prec,
4706 int type,
4707 PyObject *v)
4708{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004709 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004710 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4711 + 1 + 1 = 24*/
4712 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 long x;
4714
4715 x = PyInt_AsLong(v);
4716 if (x == -1 && PyErr_Occurred())
4717 return -1;
4718 if (prec < 0)
4719 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004720 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4721 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4722 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4723 PyErr_SetString(PyExc_OverflowError,
4724 "formatted integer is too long (precision too long?)");
4725 return -1;
4726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4728 return usprintf(buf, fmt, x);
4729}
4730
4731static int
4732formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004733 size_t buflen,
4734 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004736 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004737 if (PyUnicode_Check(v)) {
4738 if (PyUnicode_GET_SIZE(v) != 1)
4739 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004743 else if (PyString_Check(v)) {
4744 if (PyString_GET_SIZE(v) != 1)
4745 goto onError;
4746 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748
4749 else {
4750 /* Integer input truncated to a character */
4751 long x;
4752 x = PyInt_AsLong(v);
4753 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004754 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 buf[0] = (char) x;
4756 }
4757 buf[1] = '\0';
4758 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004759
4760 onError:
4761 PyErr_SetString(PyExc_TypeError,
4762 "%c requires int or char");
4763 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764}
4765
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the cast cannot combine
   with a neighbouring operator (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
4775
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776PyObject *PyUnicode_Format(PyObject *format,
4777 PyObject *args)
4778{
4779 Py_UNICODE *fmt, *res;
4780 int fmtcnt, rescnt, reslen, arglen, argidx;
4781 int args_owned = 0;
4782 PyUnicodeObject *result = NULL;
4783 PyObject *dict = NULL;
4784 PyObject *uformat;
4785
4786 if (format == NULL || args == NULL) {
4787 PyErr_BadInternalCall();
4788 return NULL;
4789 }
4790 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004791 if (uformat == NULL)
4792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 fmt = PyUnicode_AS_UNICODE(uformat);
4794 fmtcnt = PyUnicode_GET_SIZE(uformat);
4795
4796 reslen = rescnt = fmtcnt + 100;
4797 result = _PyUnicode_New(reslen);
4798 if (result == NULL)
4799 goto onError;
4800 res = PyUnicode_AS_UNICODE(result);
4801
4802 if (PyTuple_Check(args)) {
4803 arglen = PyTuple_Size(args);
4804 argidx = 0;
4805 }
4806 else {
4807 arglen = -1;
4808 argidx = -2;
4809 }
4810 if (args->ob_type->tp_as_mapping)
4811 dict = args;
4812
4813 while (--fmtcnt >= 0) {
4814 if (*fmt != '%') {
4815 if (--rescnt < 0) {
4816 rescnt = fmtcnt + 100;
4817 reslen += rescnt;
4818 if (_PyUnicode_Resize(result, reslen) < 0)
4819 return NULL;
4820 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4821 --rescnt;
4822 }
4823 *res++ = *fmt++;
4824 }
4825 else {
4826 /* Got a format specifier */
4827 int flags = 0;
4828 int width = -1;
4829 int prec = -1;
4830 int size = 0;
4831 Py_UNICODE c = '\0';
4832 Py_UNICODE fill;
4833 PyObject *v = NULL;
4834 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004835 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 Py_UNICODE sign;
4837 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004838 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839
4840 fmt++;
4841 if (*fmt == '(') {
4842 Py_UNICODE *keystart;
4843 int keylen;
4844 PyObject *key;
4845 int pcount = 1;
4846
4847 if (dict == NULL) {
4848 PyErr_SetString(PyExc_TypeError,
4849 "format requires a mapping");
4850 goto onError;
4851 }
4852 ++fmt;
4853 --fmtcnt;
4854 keystart = fmt;
4855 /* Skip over balanced parentheses */
4856 while (pcount > 0 && --fmtcnt >= 0) {
4857 if (*fmt == ')')
4858 --pcount;
4859 else if (*fmt == '(')
4860 ++pcount;
4861 fmt++;
4862 }
4863 keylen = fmt - keystart - 1;
4864 if (fmtcnt < 0 || pcount > 0) {
4865 PyErr_SetString(PyExc_ValueError,
4866 "incomplete format key");
4867 goto onError;
4868 }
Fred Drakee4315f52000-05-09 19:53:39 +00004869 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 then looked up since Python uses strings to hold
4871 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004872 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 key = PyUnicode_EncodeUTF8(keystart,
4874 keylen,
4875 NULL);
4876 if (key == NULL)
4877 goto onError;
4878 if (args_owned) {
4879 Py_DECREF(args);
4880 args_owned = 0;
4881 }
4882 args = PyObject_GetItem(dict, key);
4883 Py_DECREF(key);
4884 if (args == NULL) {
4885 goto onError;
4886 }
4887 args_owned = 1;
4888 arglen = -1;
4889 argidx = -2;
4890 }
4891 while (--fmtcnt >= 0) {
4892 switch (c = *fmt++) {
4893 case '-': flags |= F_LJUST; continue;
4894 case '+': flags |= F_SIGN; continue;
4895 case ' ': flags |= F_BLANK; continue;
4896 case '#': flags |= F_ALT; continue;
4897 case '0': flags |= F_ZERO; continue;
4898 }
4899 break;
4900 }
4901 if (c == '*') {
4902 v = getnextarg(args, arglen, &argidx);
4903 if (v == NULL)
4904 goto onError;
4905 if (!PyInt_Check(v)) {
4906 PyErr_SetString(PyExc_TypeError,
4907 "* wants int");
4908 goto onError;
4909 }
4910 width = PyInt_AsLong(v);
4911 if (width < 0) {
4912 flags |= F_LJUST;
4913 width = -width;
4914 }
4915 if (--fmtcnt >= 0)
4916 c = *fmt++;
4917 }
4918 else if (c >= '0' && c <= '9') {
4919 width = c - '0';
4920 while (--fmtcnt >= 0) {
4921 c = *fmt++;
4922 if (c < '0' || c > '9')
4923 break;
4924 if ((width*10) / 10 != width) {
4925 PyErr_SetString(PyExc_ValueError,
4926 "width too big");
4927 goto onError;
4928 }
4929 width = width*10 + (c - '0');
4930 }
4931 }
4932 if (c == '.') {
4933 prec = 0;
4934 if (--fmtcnt >= 0)
4935 c = *fmt++;
4936 if (c == '*') {
4937 v = getnextarg(args, arglen, &argidx);
4938 if (v == NULL)
4939 goto onError;
4940 if (!PyInt_Check(v)) {
4941 PyErr_SetString(PyExc_TypeError,
4942 "* wants int");
4943 goto onError;
4944 }
4945 prec = PyInt_AsLong(v);
4946 if (prec < 0)
4947 prec = 0;
4948 if (--fmtcnt >= 0)
4949 c = *fmt++;
4950 }
4951 else if (c >= '0' && c <= '9') {
4952 prec = c - '0';
4953 while (--fmtcnt >= 0) {
4954 c = Py_CHARMASK(*fmt++);
4955 if (c < '0' || c > '9')
4956 break;
4957 if ((prec*10) / 10 != prec) {
4958 PyErr_SetString(PyExc_ValueError,
4959 "prec too big");
4960 goto onError;
4961 }
4962 prec = prec*10 + (c - '0');
4963 }
4964 }
4965 } /* prec */
4966 if (fmtcnt >= 0) {
4967 if (c == 'h' || c == 'l' || c == 'L') {
4968 size = c;
4969 if (--fmtcnt >= 0)
4970 c = *fmt++;
4971 }
4972 }
4973 if (fmtcnt < 0) {
4974 PyErr_SetString(PyExc_ValueError,
4975 "incomplete format");
4976 goto onError;
4977 }
4978 if (c != '%') {
4979 v = getnextarg(args, arglen, &argidx);
4980 if (v == NULL)
4981 goto onError;
4982 }
4983 sign = 0;
4984 fill = ' ';
4985 switch (c) {
4986
4987 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004988 pbuf = formatbuf;
4989 /* presume that buffer length is at least 1 */
4990 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 len = 1;
4992 break;
4993
4994 case 's':
4995 case 'r':
4996 if (PyUnicode_Check(v) && c == 's') {
4997 temp = v;
4998 Py_INCREF(temp);
4999 }
5000 else {
5001 PyObject *unicode;
5002 if (c == 's')
5003 temp = PyObject_Str(v);
5004 else
5005 temp = PyObject_Repr(v);
5006 if (temp == NULL)
5007 goto onError;
5008 if (!PyString_Check(temp)) {
5009 /* XXX Note: this should never happen, since
5010 PyObject_Repr() and PyObject_Str() assure
5011 this */
5012 Py_DECREF(temp);
5013 PyErr_SetString(PyExc_TypeError,
5014 "%s argument has non-string str()");
5015 goto onError;
5016 }
Fred Drakee4315f52000-05-09 19:53:39 +00005017 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005019 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 "strict");
5021 Py_DECREF(temp);
5022 temp = unicode;
5023 if (temp == NULL)
5024 goto onError;
5025 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005026 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 len = PyUnicode_GET_SIZE(temp);
5028 if (prec >= 0 && len > prec)
5029 len = prec;
5030 break;
5031
5032 case 'i':
5033 case 'd':
5034 case 'u':
5035 case 'o':
5036 case 'x':
5037 case 'X':
5038 if (c == 'i')
5039 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005040 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005041 temp = formatlong(v, flags, prec, c);
5042 if (!temp)
5043 goto onError;
5044 pbuf = PyUnicode_AS_UNICODE(temp);
5045 len = PyUnicode_GET_SIZE(temp);
5046 /* unbounded ints can always produce
5047 a sign character! */
5048 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005050 else {
5051 pbuf = formatbuf;
5052 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5053 flags, prec, c, v);
5054 if (len < 0)
5055 goto onError;
5056 /* only d conversion is signed */
5057 sign = c == 'd';
5058 }
5059 if (flags & F_ZERO)
5060 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 break;
5062
5063 case 'e':
5064 case 'E':
5065 case 'f':
5066 case 'g':
5067 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005068 pbuf = formatbuf;
5069 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5070 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 if (len < 0)
5072 goto onError;
5073 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005074 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 fill = '0';
5076 break;
5077
5078 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005079 pbuf = formatbuf;
5080 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 if (len < 0)
5082 goto onError;
5083 break;
5084
5085 default:
5086 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005087 "unsupported format character '%c' (0x%x) "
5088 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005089 (31<=c && c<=126) ? c : '?',
5090 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 goto onError;
5092 }
5093 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005094 if (*pbuf == '-' || *pbuf == '+') {
5095 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 len--;
5097 }
5098 else if (flags & F_SIGN)
5099 sign = '+';
5100 else if (flags & F_BLANK)
5101 sign = ' ';
5102 else
5103 sign = 0;
5104 }
5105 if (width < len)
5106 width = len;
5107 if (rescnt < width + (sign != 0)) {
5108 reslen -= rescnt;
5109 rescnt = width + fmtcnt + 100;
5110 reslen += rescnt;
5111 if (_PyUnicode_Resize(result, reslen) < 0)
5112 return NULL;
5113 res = PyUnicode_AS_UNICODE(result)
5114 + reslen - rescnt;
5115 }
5116 if (sign) {
5117 if (fill != ' ')
5118 *res++ = sign;
5119 rescnt--;
5120 if (width > len)
5121 width--;
5122 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005123 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5124 assert(pbuf[0] == '0');
5125 assert(pbuf[1] == c);
5126 if (fill != ' ') {
5127 *res++ = *pbuf++;
5128 *res++ = *pbuf++;
5129 }
5130 rescnt -= 2;
5131 width -= 2;
5132 if (width < 0)
5133 width = 0;
5134 len -= 2;
5135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 if (width > len && !(flags & F_LJUST)) {
5137 do {
5138 --rescnt;
5139 *res++ = fill;
5140 } while (--width > len);
5141 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005142 if (fill == ' ') {
5143 if (sign)
5144 *res++ = sign;
5145 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5146 assert(pbuf[0] == '0');
5147 assert(pbuf[1] == c);
5148 *res++ = *pbuf++;
5149 *res++ = *pbuf++;
5150 }
5151 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005152 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 res += len;
5154 rescnt -= len;
5155 while (--width >= len) {
5156 --rescnt;
5157 *res++ = ' ';
5158 }
5159 if (dict && (argidx < arglen) && c != '%') {
5160 PyErr_SetString(PyExc_TypeError,
5161 "not all arguments converted");
5162 goto onError;
5163 }
5164 Py_XDECREF(temp);
5165 } /* '%' */
5166 } /* until end */
5167 if (argidx < arglen && !dict) {
5168 PyErr_SetString(PyExc_TypeError,
5169 "not all arguments converted");
5170 goto onError;
5171 }
5172
5173 if (args_owned) {
5174 Py_DECREF(args);
5175 }
5176 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005177 if (_PyUnicode_Resize(result, reslen - rescnt))
5178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 return (PyObject *)result;
5180
5181 onError:
5182 Py_XDECREF(result);
5183 Py_DECREF(uformat);
5184 if (args_owned) {
5185 Py_DECREF(args);
5186 }
5187 return NULL;
5188}
5189
5190static PyBufferProcs unicode_as_buffer = {
5191 (getreadbufferproc) unicode_buffer_getreadbuf,
5192 (getwritebufferproc) unicode_buffer_getwritebuf,
5193 (getsegcountproc) unicode_buffer_getsegcount,
5194 (getcharbufferproc) unicode_buffer_getcharbuf,
5195};
5196
5197PyTypeObject PyUnicode_Type = {
5198 PyObject_HEAD_INIT(&PyType_Type)
5199 0, /* ob_size */
5200 "unicode", /* tp_name */
5201 sizeof(PyUnicodeObject), /* tp_size */
5202 0, /* tp_itemsize */
5203 /* Slots */
5204 (destructor)_PyUnicode_Free, /* tp_dealloc */
5205 0, /* tp_print */
5206 (getattrfunc)unicode_getattr, /* tp_getattr */
5207 0, /* tp_setattr */
5208 (cmpfunc) unicode_compare, /* tp_compare */
5209 (reprfunc) unicode_repr, /* tp_repr */
5210 0, /* tp_as_number */
5211 &unicode_as_sequence, /* tp_as_sequence */
5212 0, /* tp_as_mapping */
5213 (hashfunc) unicode_hash, /* tp_hash*/
5214 0, /* tp_call*/
5215 (reprfunc) unicode_str, /* tp_str */
5216 (getattrofunc) NULL, /* tp_getattro */
5217 (setattrofunc) NULL, /* tp_setattro */
5218 &unicode_as_buffer, /* tp_as_buffer */
5219 Py_TPFLAGS_DEFAULT, /* tp_flags */
5220};
5221
5222/* Initialize the Unicode implementation */
5223
Thomas Wouters78890102000-07-22 19:25:51 +00005224void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
5226 /* Doublecheck the configuration... */
5227 if (sizeof(Py_UNICODE) != 2)
5228 Py_FatalError("Unicode configuration error: "
5229 "sizeof(Py_UNICODE) != 2 bytes");
5230
Fred Drakee4315f52000-05-09 19:53:39 +00005231 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005232 unicode_freelist = NULL;
5233 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005235 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236}
5237
5238/* Finalize the Unicode implementation */
5239
5240void
Thomas Wouters78890102000-07-22 19:25:51 +00005241_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005243 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005245 Py_XDECREF(unicode_empty);
5246 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005247
5248 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 PyUnicodeObject *v = u;
5250 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005251 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005252 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005253 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005254 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005256 unicode_freelist = NULL;
5257 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258}