blob: a06c40b9d604f3d99c1d553d40c8b7ca95c8fd83 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
86/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
89/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000090static PyUnicodeObject *unicode_freelist;
91static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Fred Drakee4315f52000-05-09 19:53:39 +000093/* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
95
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
98
99*/
100
101static char unicode_default_encoding[100];
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* --- Unicode Object ----------------------------------------------------- */
104
105static
106int _PyUnicode_Resize(register PyUnicodeObject *unicode,
107 int length)
108{
109 void *oldstr;
110
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000111 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000112 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000113 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode == unicode_empty) {
117 PyErr_SetString(PyExc_SystemError,
118 "can't resize empty unicode object");
119 return -1;
120 }
121
122 /* We allocate one more byte to make sure the string is
123 Ux0000 terminated -- XXX is this needed ? */
124 oldstr = unicode->str;
125 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
126 if (!unicode->str) {
127 unicode->str = oldstr;
128 PyErr_NoMemory();
129 return -1;
130 }
131 unicode->str[length] = 0;
132 unicode->length = length;
133
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000136 if (unicode->defenc) {
137 Py_DECREF(unicode->defenc);
138 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 }
140 unicode->hash = -1;
141
142 return 0;
143}
144
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145int PyUnicode_Resize(PyObject **unicode,
146 int length)
147{
148 PyUnicodeObject *v;
149
150 if (unicode == NULL) {
151 PyErr_BadInternalCall();
152 return -1;
153 }
154 v = (PyUnicodeObject *)*unicode;
155 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
156 PyErr_BadInternalCall();
157 return -1;
158 }
159 return _PyUnicode_Resize(v, length);
160}
161
/* We allocate one extra Py_UNICODE element to make sure the string is
   U+0000 terminated -- XXX is this needed ?

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below a size of 1.

*/
169
170static
171PyUnicodeObject *_PyUnicode_New(int length)
172{
173 register PyUnicodeObject *unicode;
174
175 /* Optimization for empty strings */
176 if (length == 0 && unicode_empty != NULL) {
177 Py_INCREF(unicode_empty);
178 return unicode_empty;
179 }
180
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist) {
183 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000184 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000191 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 }
194 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000195 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000197 }
198 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 }
200 else {
201 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
202 if (unicode == NULL)
203 return NULL;
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
205 }
206
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000207 if (!unicode->str) {
208 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000209 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode->str[length] = 0;
212 unicode->length = length;
213 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000216
217 onError:
218 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000219 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221}
222
223static
224void _PyUnicode_Free(register PyUnicodeObject *unicode)
225{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 /* Keep-Alive optimization */
228 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->str = NULL;
231 unicode->length = 0;
232 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000233 if (unicode->defenc) {
234 Py_DECREF(unicode->defenc);
235 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000236 }
237 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 *(PyUnicodeObject **)unicode = unicode_freelist;
239 unicode_freelist = unicode;
240 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 }
242 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000243 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000244 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 }
247}
248
249PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
250 int size)
251{
252 PyUnicodeObject *unicode;
253
254 unicode = _PyUnicode_New(size);
255 if (!unicode)
256 return NULL;
257
258 /* Copy the Unicode data into the new object */
259 if (u != NULL)
260 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
261
262 return (PyObject *)unicode;
263}
264
265#ifdef HAVE_WCHAR_H
266
267PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
268 int size)
269{
270 PyUnicodeObject *unicode;
271
272 if (w == NULL) {
273 PyErr_BadInternalCall();
274 return NULL;
275 }
276
277 unicode = _PyUnicode_New(size);
278 if (!unicode)
279 return NULL;
280
281 /* Copy the wchar_t data into the new object */
282#ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode->str, w, size * sizeof(wchar_t));
284#else
285 {
286 register Py_UNICODE *u;
287 register int i;
288 u = PyUnicode_AS_UNICODE(unicode);
289 for (i = size; i >= 0; i--)
290 *u++ = *w++;
291 }
292#endif
293
294 return (PyObject *)unicode;
295}
296
297int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
298 register wchar_t *w,
299 int size)
300{
301 if (unicode == NULL) {
302 PyErr_BadInternalCall();
303 return -1;
304 }
305 if (size > PyUnicode_GET_SIZE(unicode))
306 size = PyUnicode_GET_SIZE(unicode);
307#ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w, unicode->str, size * sizeof(wchar_t));
309#else
310 {
311 register Py_UNICODE *u;
312 register int i;
313 u = PyUnicode_AS_UNICODE(unicode);
314 for (i = size; i >= 0; i--)
315 *w++ = *u++;
316 }
317#endif
318
319 return size;
320}
321
322#endif
323
324PyObject *PyUnicode_FromObject(register PyObject *obj)
325{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000326 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
327}
328
329PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
330 const char *encoding,
331 const char *errors)
332{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 const char *s;
334 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000335 int owned = 0;
336 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337
338 if (obj == NULL) {
339 PyErr_BadInternalCall();
340 return NULL;
341 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000342
343 /* Coerce object */
344 if (PyInstance_Check(obj)) {
345 PyObject *func;
346 func = PyObject_GetAttrString(obj, "__str__");
347 if (func == NULL) {
348 PyErr_SetString(PyExc_TypeError,
349 "coercing to Unicode: instance doesn't define __str__");
350 return NULL;
351 }
352 obj = PyEval_CallObject(func, NULL);
353 Py_DECREF(func);
354 if (obj == NULL)
355 return NULL;
356 owned = 1;
357 }
358 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000360 v = obj;
361 if (encoding) {
362 PyErr_SetString(PyExc_TypeError,
363 "decoding Unicode is not supported");
364 return NULL;
365 }
366 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000376 PyErr_Format(PyExc_TypeError,
377 "coercing to Unicode: need string or buffer, "
378 "%.80s found",
379 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000380 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382
383 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 if (len == 0) {
385 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000388 else
389 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000390
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000392 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000394 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000395 return v;
396
397 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000398 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000399 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000400 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402}
403
404PyObject *PyUnicode_Decode(const char *s,
405 int size,
406 const char *encoding,
407 const char *errors)
408{
409 PyObject *buffer = NULL, *unicode;
410
Fred Drakee4315f52000-05-09 19:53:39 +0000411 if (encoding == NULL)
412 encoding = PyUnicode_GetDefaultEncoding();
413
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000417 else if (strcmp(encoding, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s, size, errors);
419 else if (strcmp(encoding, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 /* Decode via the codec registry */
423 buffer = PyBuffer_FromMemory((void *)s, size);
424 if (buffer == NULL)
425 goto onError;
426 unicode = PyCodec_Decode(buffer, encoding, errors);
427 if (unicode == NULL)
428 goto onError;
429 if (!PyUnicode_Check(unicode)) {
430 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000431 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 unicode->ob_type->tp_name);
433 Py_DECREF(unicode);
434 goto onError;
435 }
436 Py_DECREF(buffer);
437 return unicode;
438
439 onError:
440 Py_XDECREF(buffer);
441 return NULL;
442}
443
444PyObject *PyUnicode_Encode(const Py_UNICODE *s,
445 int size,
446 const char *encoding,
447 const char *errors)
448{
449 PyObject *v, *unicode;
450
451 unicode = PyUnicode_FromUnicode(s, size);
452 if (unicode == NULL)
453 return NULL;
454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
455 Py_DECREF(unicode);
456 return v;
457}
458
459PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
460 const char *encoding,
461 const char *errors)
462{
463 PyObject *v;
464
465 if (!PyUnicode_Check(unicode)) {
466 PyErr_BadArgument();
467 goto onError;
468 }
Fred Drakee4315f52000-05-09 19:53:39 +0000469
470 if (encoding == NULL)
471 encoding = PyUnicode_GetDefaultEncoding();
472
473 /* Shortcuts for common default encodings */
474 if (errors == NULL) {
475 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000477 else if (strcmp(encoding, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode);
479 else if (strcmp(encoding, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode);
481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482
483 /* Encode via the codec registry */
484 v = PyCodec_Encode(unicode, encoding, errors);
485 if (v == NULL)
486 goto onError;
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v)) {
489 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000490 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 v->ob_type->tp_name);
492 Py_DECREF(v);
493 goto onError;
494 }
495 return v;
496
497 onError:
498 return NULL;
499}
500
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000501/* Return a Python string holding the default encoded value of the
502 Unicode object.
503
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
508
509 The refcount of the string is *not* incremented.
510
511 *** Exported for internal use by the interpreter only !!! ***
512
513*/
514
515PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
516 const char *errors)
517{
518 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
519
520 if (v)
521 return v;
522 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
523 if (v && errors == NULL)
524 ((PyUnicodeObject *)unicode)->defenc = v;
525 return v;
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
529{
530 if (!PyUnicode_Check(unicode)) {
531 PyErr_BadArgument();
532 goto onError;
533 }
534 return PyUnicode_AS_UNICODE(unicode);
535
536 onError:
537 return NULL;
538}
539
540int PyUnicode_GetSize(PyObject *unicode)
541{
542 if (!PyUnicode_Check(unicode)) {
543 PyErr_BadArgument();
544 goto onError;
545 }
546 return PyUnicode_GET_SIZE(unicode);
547
548 onError:
549 return -1;
550}
551
Thomas Wouters78890102000-07-22 19:25:51 +0000552const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000553{
554 return unicode_default_encoding;
555}
556
557int PyUnicode_SetDefaultEncoding(const char *encoding)
558{
559 PyObject *v;
560
561 /* Make sure the encoding is valid. As side effect, this also
562 loads the encoding into the codec registry cache. */
563 v = _PyCodec_Lookup(encoding);
564 if (v == NULL)
565 goto onError;
566 Py_DECREF(v);
567 strncpy(unicode_default_encoding,
568 encoding,
569 sizeof(unicode_default_encoding));
570 return 0;
571
572 onError:
573 return -1;
574}
575
Guido van Rossumd57fd912000-03-10 22:53:23 +0000576/* --- UTF-8 Codec -------------------------------------------------------- */
577
/* Maps the first byte of a UTF-8 sequence to the length of that
   sequence in bytes.  A value of zero marks a byte that is not a
   legal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are not valid */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
599
600static
601int utf8_decoding_error(const char **source,
602 Py_UNICODE **dest,
603 const char *errors,
604 const char *details)
605{
606 if ((errors == NULL) ||
607 (strcmp(errors,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000609 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 details);
611 return -1;
612 }
613 else if (strcmp(errors,"ignore") == 0) {
614 (*source)++;
615 return 0;
616 }
617 else if (strcmp(errors,"replace") == 0) {
618 (*source)++;
619 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
620 (*dest)++;
621 return 0;
622 }
623 else {
624 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 errors);
627 return -1;
628 }
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_DecodeUTF8(const char *s,
632 int size,
633 const char *errors)
634{
635 int n;
636 const char *e;
637 PyUnicodeObject *unicode;
638 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000639 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Note: size will always be longer than the resulting Unicode
642 character count */
643 unicode = _PyUnicode_New(size);
644 if (!unicode)
645 return NULL;
646 if (size == 0)
647 return (PyObject *)unicode;
648
649 /* Unpack UTF-8 encoded data */
650 p = unicode->str;
651 e = s + size;
652
653 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000654 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655
656 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000657 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 s++;
659 continue;
660 }
661
662 n = utf8_code_length[ch];
663
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 if (s + n > e) {
665 errmsg = "unexpected end of data";
666 goto utf8Error;
667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668
669 switch (n) {
670
671 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000672 errmsg = "unexpected code byte";
673 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000674 break;
675
676 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000677 errmsg = "internal error";
678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000679 break;
680
681 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000682 if ((s[1] & 0xc0) != 0x80) {
683 errmsg = "invalid data";
684 goto utf8Error;
685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000687 if (ch < 0x80) {
688 errmsg = "illegal encoding";
689 goto utf8Error;
690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 break;
694
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 (s[2] & 0xc0) != 0x80) {
698 errmsg = "invalid data";
699 goto utf8Error;
700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
703 errmsg = "illegal encoding";
704 goto utf8Error;
705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000707 *p++ = (Py_UNICODE)ch;
708 break;
709
710 case 4:
711 if ((s[1] & 0xc0) != 0x80 ||
712 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 (s[3] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000717 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
718 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
719 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if ((ch < 0x10000) || /* minimum value allowed for 4
721 byte encoding */
722 (ch > 0x10ffff)) { /* maximum value allowed for
723 UTF-16 */
724 errmsg = "illegal encoding";
725 goto utf8Error;
726 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000727 /* compute and append the two surrogates: */
728
729 /* translate from 10000..10FFFF to 0..FFFF */
730 ch -= 0x10000;
731
732 /* high surrogate = top 10 bits added to D800 */
733 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
734
735 /* low surrogate = bottom 10 bits added to DC00 */
736 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 break;
738
739 default:
740 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 errmsg = "unsupported Unicode code range";
742 goto utf8Error;
743 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 continue;
747
748 utf8Error:
749 if (utf8_decoding_error(&s, &p, errors, errmsg))
750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000751 }
752
753 /* Adjust length */
754 if (_PyUnicode_Resize(unicode, p - unicode->str))
755 goto onError;
756
757 return (PyObject *)unicode;
758
759onError:
760 Py_DECREF(unicode);
761 return NULL;
762}
763
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000764/* Not used anymore, now that the encoder supports UTF-16
765 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000766#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767static
768int utf8_encoding_error(const Py_UNICODE **source,
769 char **dest,
770 const char *errors,
771 const char *details)
772{
773 if ((errors == NULL) ||
774 (strcmp(errors,"strict") == 0)) {
775 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000776 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000777 details);
778 return -1;
779 }
780 else if (strcmp(errors,"ignore") == 0) {
781 return 0;
782 }
783 else if (strcmp(errors,"replace") == 0) {
784 **dest = '?';
785 (*dest)++;
786 return 0;
787 }
788 else {
789 PyErr_Format(PyExc_ValueError,
790 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000791 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792 errors);
793 return -1;
794 }
795}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000796#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797
798PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
799 int size,
800 const char *errors)
801{
802 PyObject *v;
803 char *p;
804 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000805 Py_UCS4 ch2;
806 unsigned int cbAllocated = 3 * size;
807 unsigned int cbWritten = 0;
808 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000810 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (v == NULL)
812 return NULL;
813 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000814 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815
816 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000817 while (i < size) {
818 Py_UCS4 ch = s[i++];
819 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000821 cbWritten++;
822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 else if (ch < 0x0800) {
824 *p++ = 0xc0 | (ch >> 6);
825 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000826 cbWritten += 2;
827 }
828 else {
829 /* Check for high surrogate */
830 if (0xD800 <= ch && ch <= 0xDBFF) {
831 if (i != size) {
832 ch2 = s[i];
833 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
834
835 if (cbWritten >= (cbAllocated - 4)) {
836 /* Provide enough room for some more
837 surrogates */
838 cbAllocated += 4*10;
839 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000840 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 }
842
843 /* combine the two values */
844 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
845
846 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000847 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 i++;
849 cbWritten += 4;
850 }
851 }
852 }
853 else {
854 *p++ = (char)(0xe0 | (ch >> 12));
855 cbWritten += 3;
856 }
857 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
858 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859 }
860 }
861 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000862 if (_PyString_Resize(&v, p - q))
863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 return v;
865
866 onError:
867 Py_DECREF(v);
868 return NULL;
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
872{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 if (!PyUnicode_Check(unicode)) {
874 PyErr_BadArgument();
875 return NULL;
876 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000877 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
878 PyUnicode_GET_SIZE(unicode),
879 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880}
881
882/* --- UTF-16 Codec ------------------------------------------------------- */
883
884static
885int utf16_decoding_error(const Py_UNICODE **source,
886 Py_UNICODE **dest,
887 const char *errors,
888 const char *details)
889{
890 if ((errors == NULL) ||
891 (strcmp(errors,"strict") == 0)) {
892 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000893 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894 details);
895 return -1;
896 }
897 else if (strcmp(errors,"ignore") == 0) {
898 return 0;
899 }
900 else if (strcmp(errors,"replace") == 0) {
901 if (dest) {
902 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
903 (*dest)++;
904 }
905 return 0;
906 }
907 else {
908 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000909 "UTF-16 decoding error; "
910 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 errors);
912 return -1;
913 }
914}
915
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916PyObject *PyUnicode_DecodeUTF16(const char *s,
917 int size,
918 const char *errors,
919 int *byteorder)
920{
921 PyUnicodeObject *unicode;
922 Py_UNICODE *p;
923 const Py_UNICODE *q, *e;
924 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000925 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926
927 /* size should be an even number */
928 if (size % sizeof(Py_UNICODE) != 0) {
929 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
930 return NULL;
931 /* The remaining input chars are ignored if we fall through
932 here... */
933 }
934
935 /* Note: size will always be longer than the resulting Unicode
936 character count */
937 unicode = _PyUnicode_New(size);
938 if (!unicode)
939 return NULL;
940 if (size == 0)
941 return (PyObject *)unicode;
942
943 /* Unpack UTF-16 encoded data */
944 p = unicode->str;
945 q = (Py_UNICODE *)s;
946 e = q + (size / sizeof(Py_UNICODE));
947
948 if (byteorder)
949 bo = *byteorder;
950
951 while (q < e) {
952 register Py_UNICODE ch = *q++;
953
954 /* Check for BOM marks (U+FEFF) in the input and adjust
955 current byte order setting accordingly. Swap input
956 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
957 !) */
958#ifdef BYTEORDER_IS_LITTLE_ENDIAN
959 if (ch == 0xFEFF) {
960 bo = -1;
961 continue;
962 } else if (ch == 0xFFFE) {
963 bo = 1;
964 continue;
965 }
966 if (bo == 1)
967 ch = (ch >> 8) | (ch << 8);
968#else
969 if (ch == 0xFEFF) {
970 bo = 1;
971 continue;
972 } else if (ch == 0xFFFE) {
973 bo = -1;
974 continue;
975 }
976 if (bo == -1)
977 ch = (ch >> 8) | (ch << 8);
978#endif
979 if (ch < 0xD800 || ch > 0xDFFF) {
980 *p++ = ch;
981 continue;
982 }
983
984 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000985 if (q >= e) {
986 errmsg = "unexpected end of data";
987 goto utf16Error;
988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 if (0xDC00 <= *q && *q <= 0xDFFF) {
990 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000991 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992 /* This is valid data (a UTF-16 surrogate pair), but
993 we are not able to store this information since our
994 Py_UNICODE type only has 16 bits... this might
995 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000996 errmsg = "code pairs are not supported";
997 goto utf16Error;
998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999 else
1000 continue;
1001 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001002 errmsg = "illegal encoding";
1003 /* Fall through to report the error */
1004
1005 utf16Error:
1006 if (utf16_decoding_error(&q, &p, errors, errmsg))
1007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009
1010 if (byteorder)
1011 *byteorder = bo;
1012
1013 /* Adjust length */
1014 if (_PyUnicode_Resize(unicode, p - unicode->str))
1015 goto onError;
1016
1017 return (PyObject *)unicode;
1018
1019onError:
1020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024#undef UTF16_ERROR
1025
1026PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1027 int size,
1028 const char *errors,
1029 int byteorder)
1030{
1031 PyObject *v;
1032 Py_UNICODE *p;
1033 char *q;
1034
1035 /* We don't create UTF-16 pairs... */
1036 v = PyString_FromStringAndSize(NULL,
1037 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1038 if (v == NULL)
1039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040
1041 q = PyString_AS_STRING(v);
1042 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (byteorder == 0)
1044 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001045 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001046 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (byteorder == 0 ||
1048#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1049 byteorder == -1
1050#else
1051 byteorder == 1
1052#endif
1053 )
1054 memcpy(p, s, size * sizeof(Py_UNICODE));
1055 else
1056 while (size-- > 0) {
1057 Py_UNICODE ch = *s++;
1058 *p++ = (ch >> 8) | (ch << 8);
1059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return v;
1061}
1062
1063PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1064{
1065 if (!PyUnicode_Check(unicode)) {
1066 PyErr_BadArgument();
1067 return NULL;
1068 }
1069 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1070 PyUnicode_GET_SIZE(unicode),
1071 NULL,
1072 0);
1073}
1074
1075/* --- Unicode Escape Codec ----------------------------------------------- */
1076
1077static
1078int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001079 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 const char *errors,
1081 const char *details)
1082{
1083 if ((errors == NULL) ||
1084 (strcmp(errors,"strict") == 0)) {
1085 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001086 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 details);
1088 return -1;
1089 }
1090 else if (strcmp(errors,"ignore") == 0) {
1091 return 0;
1092 }
1093 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001094 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 return 0;
1096 }
1097 else {
1098 PyErr_Format(PyExc_ValueError,
1099 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001100 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 errors);
1102 return -1;
1103 }
1104}
1105
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001106static _PyUnicode_Name_CAPI *unicode_names = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001107
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1109 int size,
1110 const char *errors)
1111{
1112 PyUnicodeObject *v;
1113 Py_UNICODE *p = NULL, *buf = NULL;
1114 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001115 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116
1117 /* Escaped strings will always be longer than the resulting
1118 Unicode string, so we start with size here and then reduce the
1119 length after conversion to the true value. */
1120 v = _PyUnicode_New(size);
1121 if (v == NULL)
1122 goto onError;
1123 if (size == 0)
1124 return (PyObject *)v;
1125 p = buf = PyUnicode_AS_UNICODE(v);
1126 end = s + size;
1127 while (s < end) {
1128 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001129 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130 int i;
1131
1132 /* Non-escape characters are interpreted as Unicode ordinals */
1133 if (*s != '\\') {
1134 *p++ = (unsigned char)*s++;
1135 continue;
1136 }
1137
1138 /* \ - Escapes */
1139 s++;
1140 switch (*s++) {
1141
1142 /* \x escapes */
1143 case '\n': break;
1144 case '\\': *p++ = '\\'; break;
1145 case '\'': *p++ = '\''; break;
1146 case '\"': *p++ = '\"'; break;
1147 case 'b': *p++ = '\b'; break;
1148 case 'f': *p++ = '\014'; break; /* FF */
1149 case 't': *p++ = '\t'; break;
1150 case 'n': *p++ = '\n'; break;
1151 case 'r': *p++ = '\r'; break;
1152 case 'v': *p++ = '\013'; break; /* VT */
1153 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1154
1155 /* \OOO (octal) escapes */
1156 case '0': case '1': case '2': case '3':
1157 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001158 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001160 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001162 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001164 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 break;
1166
Fredrik Lundhdf846752000-09-03 11:29:49 +00001167 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001169 for (x = 0, i = 0; i < 2; i++) {
1170 c = (unsigned char)s[i];
1171 if (!isxdigit(c)) {
1172 if (unicodeescape_decoding_error(&s, &x, errors,
1173 "truncated \\xXX"))
1174 goto onError;
1175 i++;
1176 break;
1177 }
1178 x = (x<<4) & ~0xF;
1179 if (c >= '0' && c <= '9')
1180 x += c - '0';
1181 else if (c >= 'a' && c <= 'f')
1182 x += 10 + c - 'a';
1183 else
1184 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001186 s += i;
1187 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 break;
1189
1190 /* \uXXXX with 4 hex digits */
1191 case 'u':
1192 for (x = 0, i = 0; i < 4; i++) {
1193 c = (unsigned char)s[i];
1194 if (!isxdigit(c)) {
1195 if (unicodeescape_decoding_error(&s, &x, errors,
1196 "truncated \\uXXXX"))
1197 goto onError;
1198 i++;
1199 break;
1200 }
1201 x = (x<<4) & ~0xF;
1202 if (c >= '0' && c <= '9')
1203 x += c - '0';
1204 else if (c >= 'a' && c <= 'f')
1205 x += 10 + c - 'a';
1206 else
1207 x += 10 + c - 'A';
1208 }
1209 s += i;
1210 *p++ = x;
1211 break;
1212
Fredrik Lundhdf846752000-09-03 11:29:49 +00001213 /* \UXXXXXXXX with 8 hex digits */
1214 case 'U':
1215 for (chr = 0, i = 0; i < 8; i++) {
1216 c = (unsigned char)s[i];
1217 if (!isxdigit(c)) {
1218 if (unicodeescape_decoding_error(&s, &x, errors,
1219 "truncated \\uXXXX"))
1220 goto onError;
1221 i++;
1222 break;
1223 }
1224 chr = (chr<<4) & ~0xF;
1225 if (c >= '0' && c <= '9')
1226 chr += c - '0';
1227 else if (c >= 'a' && c <= 'f')
1228 chr += 10 + c - 'a';
1229 else
1230 chr += 10 + c - 'A';
1231 }
1232 s += i;
1233 goto store;
1234
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001235 case 'N':
1236 /* Ok, we need to deal with Unicode Character Names now,
1237 * make sure we've imported the hash table data...
1238 */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001239 if (unicode_names == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001240 PyObject *mod = 0, *v = 0;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001241 mod = PyImport_ImportModule("ucnhash");
1242 if (mod == NULL)
1243 goto onError;
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001244 v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001245 Py_DECREF(mod);
1246 if (v == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001247 goto onError;
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001248 unicode_names = PyCObject_AsVoidPtr(v);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001249 Py_DECREF(v);
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001250 if (unicode_names == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001251 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001252 }
1253
Fredrik Lundhdf846752000-09-03 11:29:49 +00001254 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001255 const char *start = s + 1;
1256 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001257
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001258 /* look for the closing brace */
1259 while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001260 endBrace++;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001261 if (endBrace != end && *endBrace == '}') {
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001262 if (!unicode_names->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001263 if (unicodeescape_decoding_error(
1264 &s, &x, errors,
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001265 "Invalid Unicode Character Name")
1266 )
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001267 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001268 goto ucnFallthrough;
1269 }
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001270 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001271 goto store;
1272 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001273 if (unicodeescape_decoding_error(
1274 &s, &x, errors,
1275 "Unicode name missing closing brace"))
1276 goto onError;
1277 goto ucnFallthrough;
1278 }
1279 break;
1280 }
1281 if (unicodeescape_decoding_error(
1282 &s, &x, errors,
1283 "Missing opening brace for Unicode Character Name escape"))
1284 goto onError;
1285ucnFallthrough:
1286 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001287 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 *p++ = '\\';
1289 *p++ = (unsigned char)s[-1];
1290 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001291store:
1292 /* when we get here, chr is a 32-bit unicode character */
1293 if (chr <= 0xffff)
1294 /* UCS-2 character */
1295 *p++ = (Py_UNICODE) chr;
1296 else if (chr <= 0x10ffff) {
1297 /* UCS-4 character. store as two surrogate characters */
1298 chr -= 0x10000L;
1299 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1300 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1301 } else {
1302 if (unicodeescape_decoding_error(
1303 &s, &x, errors,
1304 "Illegal Unicode character")
1305 )
1306 goto onError;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 }
1309 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001310 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 return (PyObject *)v;
1313
1314 onError:
1315 Py_XDECREF(v);
1316 return NULL;
1317}
1318
1319/* Return a Unicode-Escape string version of the Unicode object.
1320
1321 If quotes is true, the string is enclosed in u"" or u'' quotes as
1322 appropriate.
1323
1324*/
1325
Barry Warsaw51ac5802000-03-20 16:36:48 +00001326static const Py_UNICODE *findchar(const Py_UNICODE *s,
1327 int size,
1328 Py_UNICODE ch);
1329
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330static
1331PyObject *unicodeescape_string(const Py_UNICODE *s,
1332 int size,
1333 int quotes)
1334{
1335 PyObject *repr;
1336 char *p;
1337 char *q;
1338
1339 static const char *hexdigit = "0123456789ABCDEF";
1340
1341 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1342 if (repr == NULL)
1343 return NULL;
1344
1345 p = q = PyString_AS_STRING(repr);
1346
1347 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 *p++ = 'u';
1349 *p++ = (findchar(s, size, '\'') &&
1350 !findchar(s, size, '"')) ? '"' : '\'';
1351 }
1352 while (size-- > 0) {
1353 Py_UNICODE ch = *s++;
1354 /* Escape quotes */
1355 if (quotes && (ch == q[1] || ch == '\\')) {
1356 *p++ = '\\';
1357 *p++ = (char) ch;
1358 }
1359 /* Map 16-bit characters to '\uxxxx' */
1360 else if (ch >= 256) {
1361 *p++ = '\\';
1362 *p++ = 'u';
1363 *p++ = hexdigit[(ch >> 12) & 0xf];
1364 *p++ = hexdigit[(ch >> 8) & 0xf];
1365 *p++ = hexdigit[(ch >> 4) & 0xf];
1366 *p++ = hexdigit[ch & 15];
1367 }
1368 /* Map non-printable US ASCII to '\ooo' */
1369 else if (ch < ' ' || ch >= 128) {
1370 *p++ = '\\';
1371 *p++ = hexdigit[(ch >> 6) & 7];
1372 *p++ = hexdigit[(ch >> 3) & 7];
1373 *p++ = hexdigit[ch & 7];
1374 }
1375 /* Copy everything else as-is */
1376 else
1377 *p++ = (char) ch;
1378 }
1379 if (quotes)
1380 *p++ = q[1];
1381
1382 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001383 if (_PyString_Resize(&repr, p - q))
1384 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385
1386 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001387
1388 onError:
1389 Py_DECREF(repr);
1390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391}
1392
1393PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1394 int size)
1395{
1396 return unicodeescape_string(s, size, 0);
1397}
1398
1399PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1400{
1401 if (!PyUnicode_Check(unicode)) {
1402 PyErr_BadArgument();
1403 return NULL;
1404 }
1405 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1406 PyUnicode_GET_SIZE(unicode));
1407}
1408
1409/* --- Raw Unicode Escape Codec ------------------------------------------- */
1410
1411PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1412 int size,
1413 const char *errors)
1414{
1415 PyUnicodeObject *v;
1416 Py_UNICODE *p, *buf;
1417 const char *end;
1418 const char *bs;
1419
1420 /* Escaped strings will always be longer than the resulting
1421 Unicode string, so we start with size here and then reduce the
1422 length after conversion to the true value. */
1423 v = _PyUnicode_New(size);
1424 if (v == NULL)
1425 goto onError;
1426 if (size == 0)
1427 return (PyObject *)v;
1428 p = buf = PyUnicode_AS_UNICODE(v);
1429 end = s + size;
1430 while (s < end) {
1431 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001432 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433 int i;
1434
1435 /* Non-escape characters are interpreted as Unicode ordinals */
1436 if (*s != '\\') {
1437 *p++ = (unsigned char)*s++;
1438 continue;
1439 }
1440
1441 /* \u-escapes are only interpreted iff the number of leading
1442 backslashes if odd */
1443 bs = s;
1444 for (;s < end;) {
1445 if (*s != '\\')
1446 break;
1447 *p++ = (unsigned char)*s++;
1448 }
1449 if (((s - bs) & 1) == 0 ||
1450 s >= end ||
1451 *s != 'u') {
1452 continue;
1453 }
1454 p--;
1455 s++;
1456
1457 /* \uXXXX with 4 hex digits */
1458 for (x = 0, i = 0; i < 4; i++) {
1459 c = (unsigned char)s[i];
1460 if (!isxdigit(c)) {
1461 if (unicodeescape_decoding_error(&s, &x, errors,
1462 "truncated \\uXXXX"))
1463 goto onError;
1464 i++;
1465 break;
1466 }
1467 x = (x<<4) & ~0xF;
1468 if (c >= '0' && c <= '9')
1469 x += c - '0';
1470 else if (c >= 'a' && c <= 'f')
1471 x += 10 + c - 'a';
1472 else
1473 x += 10 + c - 'A';
1474 }
1475 s += i;
1476 *p++ = x;
1477 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001478 if (_PyUnicode_Resize(v, (int)(p - buf)))
1479 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return (PyObject *)v;
1481
1482 onError:
1483 Py_XDECREF(v);
1484 return NULL;
1485}
1486
1487PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1488 int size)
1489{
1490 PyObject *repr;
1491 char *p;
1492 char *q;
1493
1494 static const char *hexdigit = "0123456789ABCDEF";
1495
1496 repr = PyString_FromStringAndSize(NULL, 6 * size);
1497 if (repr == NULL)
1498 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001499 if (size == 0)
1500 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501
1502 p = q = PyString_AS_STRING(repr);
1503 while (size-- > 0) {
1504 Py_UNICODE ch = *s++;
1505 /* Map 16-bit characters to '\uxxxx' */
1506 if (ch >= 256) {
1507 *p++ = '\\';
1508 *p++ = 'u';
1509 *p++ = hexdigit[(ch >> 12) & 0xf];
1510 *p++ = hexdigit[(ch >> 8) & 0xf];
1511 *p++ = hexdigit[(ch >> 4) & 0xf];
1512 *p++ = hexdigit[ch & 15];
1513 }
1514 /* Copy everything else as-is */
1515 else
1516 *p++ = (char) ch;
1517 }
1518 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001519 if (_PyString_Resize(&repr, p - q))
1520 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521
1522 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001523
1524 onError:
1525 Py_DECREF(repr);
1526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527}
1528
1529PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1530{
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_BadArgument();
1533 return NULL;
1534 }
1535 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1536 PyUnicode_GET_SIZE(unicode));
1537}
1538
1539/* --- Latin-1 Codec ------------------------------------------------------ */
1540
1541PyObject *PyUnicode_DecodeLatin1(const char *s,
1542 int size,
1543 const char *errors)
1544{
1545 PyUnicodeObject *v;
1546 Py_UNICODE *p;
1547
1548 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1549 v = _PyUnicode_New(size);
1550 if (v == NULL)
1551 goto onError;
1552 if (size == 0)
1553 return (PyObject *)v;
1554 p = PyUnicode_AS_UNICODE(v);
1555 while (size-- > 0)
1556 *p++ = (unsigned char)*s++;
1557 return (PyObject *)v;
1558
1559 onError:
1560 Py_XDECREF(v);
1561 return NULL;
1562}
1563
1564static
1565int latin1_encoding_error(const Py_UNICODE **source,
1566 char **dest,
1567 const char *errors,
1568 const char *details)
1569{
1570 if ((errors == NULL) ||
1571 (strcmp(errors,"strict") == 0)) {
1572 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001573 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 details);
1575 return -1;
1576 }
1577 else if (strcmp(errors,"ignore") == 0) {
1578 return 0;
1579 }
1580 else if (strcmp(errors,"replace") == 0) {
1581 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001582 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 return 0;
1584 }
1585 else {
1586 PyErr_Format(PyExc_ValueError,
1587 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001588 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 errors);
1590 return -1;
1591 }
1592}
1593
1594PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1595 int size,
1596 const char *errors)
1597{
1598 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001599 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001600
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 repr = PyString_FromStringAndSize(NULL, size);
1602 if (repr == NULL)
1603 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001604 if (size == 0)
1605 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
1607 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001608 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 while (size-- > 0) {
1610 Py_UNICODE ch = *p++;
1611 if (ch >= 256) {
1612 if (latin1_encoding_error(&p, &s, errors,
1613 "ordinal not in range(256)"))
1614 goto onError;
1615 }
1616 else
1617 *s++ = (char)ch;
1618 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001619 /* Resize if error handling skipped some characters */
1620 if (s - start < PyString_GET_SIZE(repr))
1621 if (_PyString_Resize(&repr, s - start))
1622 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 return repr;
1624
1625 onError:
1626 Py_DECREF(repr);
1627 return NULL;
1628}
1629
1630PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1631{
1632 if (!PyUnicode_Check(unicode)) {
1633 PyErr_BadArgument();
1634 return NULL;
1635 }
1636 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1637 PyUnicode_GET_SIZE(unicode),
1638 NULL);
1639}
1640
1641/* --- 7-bit ASCII Codec -------------------------------------------------- */
1642
1643static
1644int ascii_decoding_error(const char **source,
1645 Py_UNICODE **dest,
1646 const char *errors,
1647 const char *details)
1648{
1649 if ((errors == NULL) ||
1650 (strcmp(errors,"strict") == 0)) {
1651 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001652 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 details);
1654 return -1;
1655 }
1656 else if (strcmp(errors,"ignore") == 0) {
1657 return 0;
1658 }
1659 else if (strcmp(errors,"replace") == 0) {
1660 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1661 (*dest)++;
1662 return 0;
1663 }
1664 else {
1665 PyErr_Format(PyExc_ValueError,
1666 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001667 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001668 errors);
1669 return -1;
1670 }
1671}
1672
1673PyObject *PyUnicode_DecodeASCII(const char *s,
1674 int size,
1675 const char *errors)
1676{
1677 PyUnicodeObject *v;
1678 Py_UNICODE *p;
1679
1680 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1681 v = _PyUnicode_New(size);
1682 if (v == NULL)
1683 goto onError;
1684 if (size == 0)
1685 return (PyObject *)v;
1686 p = PyUnicode_AS_UNICODE(v);
1687 while (size-- > 0) {
1688 register unsigned char c;
1689
1690 c = (unsigned char)*s++;
1691 if (c < 128)
1692 *p++ = c;
1693 else if (ascii_decoding_error(&s, &p, errors,
1694 "ordinal not in range(128)"))
1695 goto onError;
1696 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001697 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1698 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1699 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 return (PyObject *)v;
1701
1702 onError:
1703 Py_XDECREF(v);
1704 return NULL;
1705}
1706
1707static
1708int ascii_encoding_error(const Py_UNICODE **source,
1709 char **dest,
1710 const char *errors,
1711 const char *details)
1712{
1713 if ((errors == NULL) ||
1714 (strcmp(errors,"strict") == 0)) {
1715 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001716 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 details);
1718 return -1;
1719 }
1720 else if (strcmp(errors,"ignore") == 0) {
1721 return 0;
1722 }
1723 else if (strcmp(errors,"replace") == 0) {
1724 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001725 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 return 0;
1727 }
1728 else {
1729 PyErr_Format(PyExc_ValueError,
1730 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001731 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 errors);
1733 return -1;
1734 }
1735}
1736
1737PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1738 int size,
1739 const char *errors)
1740{
1741 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001742 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001743
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744 repr = PyString_FromStringAndSize(NULL, size);
1745 if (repr == NULL)
1746 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001747 if (size == 0)
1748 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749
1750 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001751 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752 while (size-- > 0) {
1753 Py_UNICODE ch = *p++;
1754 if (ch >= 128) {
1755 if (ascii_encoding_error(&p, &s, errors,
1756 "ordinal not in range(128)"))
1757 goto onError;
1758 }
1759 else
1760 *s++ = (char)ch;
1761 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001762 /* Resize if error handling skipped some characters */
1763 if (s - start < PyString_GET_SIZE(repr))
1764 if (_PyString_Resize(&repr, s - start))
1765 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 return repr;
1767
1768 onError:
1769 Py_DECREF(repr);
1770 return NULL;
1771}
1772
1773PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1774{
1775 if (!PyUnicode_Check(unicode)) {
1776 PyErr_BadArgument();
1777 return NULL;
1778 }
1779 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1780 PyUnicode_GET_SIZE(unicode),
1781 NULL);
1782}
1783
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001784#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001785
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001786/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001787
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001788PyObject *PyUnicode_DecodeMBCS(const char *s,
1789 int size,
1790 const char *errors)
1791{
1792 PyUnicodeObject *v;
1793 Py_UNICODE *p;
1794
1795 /* First get the size of the result */
1796 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001797 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001798 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1799
1800 v = _PyUnicode_New(usize);
1801 if (v == NULL)
1802 return NULL;
1803 if (usize == 0)
1804 return (PyObject *)v;
1805 p = PyUnicode_AS_UNICODE(v);
1806 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1807 Py_DECREF(v);
1808 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1809 }
1810
1811 return (PyObject *)v;
1812}
1813
1814PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1815 int size,
1816 const char *errors)
1817{
1818 PyObject *repr;
1819 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001820 DWORD mbcssize;
1821
1822 /* If there are no characters, bail now! */
1823 if (size==0)
1824 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001825
1826 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001827 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001828 if (mbcssize==0)
1829 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1830
1831 repr = PyString_FromStringAndSize(NULL, mbcssize);
1832 if (repr == NULL)
1833 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001834 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001835 return repr;
1836
1837 /* Do the conversion */
1838 s = PyString_AS_STRING(repr);
1839 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1840 Py_DECREF(repr);
1841 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1842 }
1843 return repr;
1844}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001845
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001846#endif /* MS_WIN32 */
1847
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848/* --- Character Mapping Codec -------------------------------------------- */
1849
1850static
1851int charmap_decoding_error(const char **source,
1852 Py_UNICODE **dest,
1853 const char *errors,
1854 const char *details)
1855{
1856 if ((errors == NULL) ||
1857 (strcmp(errors,"strict") == 0)) {
1858 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001859 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 details);
1861 return -1;
1862 }
1863 else if (strcmp(errors,"ignore") == 0) {
1864 return 0;
1865 }
1866 else if (strcmp(errors,"replace") == 0) {
1867 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1868 (*dest)++;
1869 return 0;
1870 }
1871 else {
1872 PyErr_Format(PyExc_ValueError,
1873 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001874 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 errors);
1876 return -1;
1877 }
1878}
1879
1880PyObject *PyUnicode_DecodeCharmap(const char *s,
1881 int size,
1882 PyObject *mapping,
1883 const char *errors)
1884{
1885 PyUnicodeObject *v;
1886 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001887 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888
1889 /* Default to Latin-1 */
1890 if (mapping == NULL)
1891 return PyUnicode_DecodeLatin1(s, size, errors);
1892
1893 v = _PyUnicode_New(size);
1894 if (v == NULL)
1895 goto onError;
1896 if (size == 0)
1897 return (PyObject *)v;
1898 p = PyUnicode_AS_UNICODE(v);
1899 while (size-- > 0) {
1900 unsigned char ch = *s++;
1901 PyObject *w, *x;
1902
1903 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1904 w = PyInt_FromLong((long)ch);
1905 if (w == NULL)
1906 goto onError;
1907 x = PyObject_GetItem(mapping, w);
1908 Py_DECREF(w);
1909 if (x == NULL) {
1910 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001911 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001913 x = Py_None;
1914 Py_INCREF(x);
1915 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 }
1918
1919 /* Apply mapping */
1920 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001921 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 if (value < 0 || value > 65535) {
1923 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001924 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 Py_DECREF(x);
1926 goto onError;
1927 }
1928 *p++ = (Py_UNICODE)value;
1929 }
1930 else if (x == Py_None) {
1931 /* undefined mapping */
1932 if (charmap_decoding_error(&s, &p, errors,
1933 "character maps to <undefined>")) {
1934 Py_DECREF(x);
1935 goto onError;
1936 }
1937 }
1938 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001939 int targetsize = PyUnicode_GET_SIZE(x);
1940
1941 if (targetsize == 1)
1942 /* 1-1 mapping */
1943 *p++ = *PyUnicode_AS_UNICODE(x);
1944
1945 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001947 if (targetsize > extrachars) {
1948 /* resize first */
1949 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1950 int needed = (targetsize - extrachars) + \
1951 (targetsize << 2);
1952 extrachars += needed;
1953 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001954 Py_DECREF(x);
1955 goto onError;
1956 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001957 p = PyUnicode_AS_UNICODE(v) + oldpos;
1958 }
1959 Py_UNICODE_COPY(p,
1960 PyUnicode_AS_UNICODE(x),
1961 targetsize);
1962 p += targetsize;
1963 extrachars -= targetsize;
1964 }
1965 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 }
1967 else {
1968 /* wrong return value */
1969 PyErr_SetString(PyExc_TypeError,
1970 "character mapping must return integer, None or unicode");
1971 Py_DECREF(x);
1972 goto onError;
1973 }
1974 Py_DECREF(x);
1975 }
1976 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1977 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1978 goto onError;
1979 return (PyObject *)v;
1980
1981 onError:
1982 Py_XDECREF(v);
1983 return NULL;
1984}
1985
1986static
1987int charmap_encoding_error(const Py_UNICODE **source,
1988 char **dest,
1989 const char *errors,
1990 const char *details)
1991{
1992 if ((errors == NULL) ||
1993 (strcmp(errors,"strict") == 0)) {
1994 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001995 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996 details);
1997 return -1;
1998 }
1999 else if (strcmp(errors,"ignore") == 0) {
2000 return 0;
2001 }
2002 else if (strcmp(errors,"replace") == 0) {
2003 **dest = '?';
2004 (*dest)++;
2005 return 0;
2006 }
2007 else {
2008 PyErr_Format(PyExc_ValueError,
2009 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002010 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 errors);
2012 return -1;
2013 }
2014}
2015
2016PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2017 int size,
2018 PyObject *mapping,
2019 const char *errors)
2020{
2021 PyObject *v;
2022 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002023 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024
2025 /* Default to Latin-1 */
2026 if (mapping == NULL)
2027 return PyUnicode_EncodeLatin1(p, size, errors);
2028
2029 v = PyString_FromStringAndSize(NULL, size);
2030 if (v == NULL)
2031 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002032 if (size == 0)
2033 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034 s = PyString_AS_STRING(v);
2035 while (size-- > 0) {
2036 Py_UNICODE ch = *p++;
2037 PyObject *w, *x;
2038
2039 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2040 w = PyInt_FromLong((long)ch);
2041 if (w == NULL)
2042 goto onError;
2043 x = PyObject_GetItem(mapping, w);
2044 Py_DECREF(w);
2045 if (x == NULL) {
2046 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002047 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002049 x = Py_None;
2050 Py_INCREF(x);
2051 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002052 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 }
2054
2055 /* Apply mapping */
2056 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002057 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 if (value < 0 || value > 255) {
2059 PyErr_SetString(PyExc_TypeError,
2060 "character mapping must be in range(256)");
2061 Py_DECREF(x);
2062 goto onError;
2063 }
2064 *s++ = (char)value;
2065 }
2066 else if (x == Py_None) {
2067 /* undefined mapping */
2068 if (charmap_encoding_error(&p, &s, errors,
2069 "character maps to <undefined>")) {
2070 Py_DECREF(x);
2071 goto onError;
2072 }
2073 }
2074 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002075 int targetsize = PyString_GET_SIZE(x);
2076
2077 if (targetsize == 1)
2078 /* 1-1 mapping */
2079 *s++ = *PyString_AS_STRING(x);
2080
2081 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002083 if (targetsize > extrachars) {
2084 /* resize first */
2085 int oldpos = (int)(s - PyString_AS_STRING(v));
2086 int needed = (targetsize - extrachars) + \
2087 (targetsize << 2);
2088 extrachars += needed;
2089 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002090 Py_DECREF(x);
2091 goto onError;
2092 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002093 s = PyString_AS_STRING(v) + oldpos;
2094 }
2095 memcpy(s,
2096 PyString_AS_STRING(x),
2097 targetsize);
2098 s += targetsize;
2099 extrachars -= targetsize;
2100 }
2101 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 }
2103 else {
2104 /* wrong return value */
2105 PyErr_SetString(PyExc_TypeError,
2106 "character mapping must return integer, None or unicode");
2107 Py_DECREF(x);
2108 goto onError;
2109 }
2110 Py_DECREF(x);
2111 }
2112 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2113 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2114 goto onError;
2115 return v;
2116
2117 onError:
2118 Py_DECREF(v);
2119 return NULL;
2120}
2121
2122PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2123 PyObject *mapping)
2124{
2125 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2126 PyErr_BadArgument();
2127 return NULL;
2128 }
2129 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2130 PyUnicode_GET_SIZE(unicode),
2131 mapping,
2132 NULL);
2133}
2134
2135static
2136int translate_error(const Py_UNICODE **source,
2137 Py_UNICODE **dest,
2138 const char *errors,
2139 const char *details)
2140{
2141 if ((errors == NULL) ||
2142 (strcmp(errors,"strict") == 0)) {
2143 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002144 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 details);
2146 return -1;
2147 }
2148 else if (strcmp(errors,"ignore") == 0) {
2149 return 0;
2150 }
2151 else if (strcmp(errors,"replace") == 0) {
2152 **dest = '?';
2153 (*dest)++;
2154 return 0;
2155 }
2156 else {
2157 PyErr_Format(PyExc_ValueError,
2158 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002159 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 errors);
2161 return -1;
2162 }
2163}
2164
2165PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2166 int size,
2167 PyObject *mapping,
2168 const char *errors)
2169{
2170 PyUnicodeObject *v;
2171 Py_UNICODE *p;
2172
2173 if (mapping == NULL) {
2174 PyErr_BadArgument();
2175 return NULL;
2176 }
2177
2178 /* Output will never be longer than input */
2179 v = _PyUnicode_New(size);
2180 if (v == NULL)
2181 goto onError;
2182 if (size == 0)
2183 goto done;
2184 p = PyUnicode_AS_UNICODE(v);
2185 while (size-- > 0) {
2186 Py_UNICODE ch = *s++;
2187 PyObject *w, *x;
2188
2189 /* Get mapping */
2190 w = PyInt_FromLong(ch);
2191 if (w == NULL)
2192 goto onError;
2193 x = PyObject_GetItem(mapping, w);
2194 Py_DECREF(w);
2195 if (x == NULL) {
2196 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2197 /* No mapping found: default to 1-1 mapping */
2198 PyErr_Clear();
2199 *p++ = ch;
2200 continue;
2201 }
2202 goto onError;
2203 }
2204
2205 /* Apply mapping */
2206 if (PyInt_Check(x))
2207 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2208 else if (x == Py_None) {
2209 /* undefined mapping */
2210 if (translate_error(&s, &p, errors,
2211 "character maps to <undefined>")) {
2212 Py_DECREF(x);
2213 goto onError;
2214 }
2215 }
2216 else if (PyUnicode_Check(x)) {
2217 if (PyUnicode_GET_SIZE(x) != 1) {
2218 /* 1-n mapping */
2219 PyErr_SetString(PyExc_NotImplementedError,
2220 "1-n mappings are currently not implemented");
2221 Py_DECREF(x);
2222 goto onError;
2223 }
2224 *p++ = *PyUnicode_AS_UNICODE(x);
2225 }
2226 else {
2227 /* wrong return value */
2228 PyErr_SetString(PyExc_TypeError,
2229 "translate mapping must return integer, None or unicode");
2230 Py_DECREF(x);
2231 goto onError;
2232 }
2233 Py_DECREF(x);
2234 }
2235 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002236 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2237 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238
2239 done:
2240 return (PyObject *)v;
2241
2242 onError:
2243 Py_XDECREF(v);
2244 return NULL;
2245}
2246
2247PyObject *PyUnicode_Translate(PyObject *str,
2248 PyObject *mapping,
2249 const char *errors)
2250{
2251 PyObject *result;
2252
2253 str = PyUnicode_FromObject(str);
2254 if (str == NULL)
2255 goto onError;
2256 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2257 PyUnicode_GET_SIZE(str),
2258 mapping,
2259 errors);
2260 Py_DECREF(str);
2261 return result;
2262
2263 onError:
2264 Py_XDECREF(str);
2265 return NULL;
2266}
2267
Guido van Rossum9e896b32000-04-05 20:11:21 +00002268/* --- Decimal Encoder ---------------------------------------------------- */
2269
2270int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2271 int length,
2272 char *output,
2273 const char *errors)
2274{
2275 Py_UNICODE *p, *end;
2276
2277 if (output == NULL) {
2278 PyErr_BadArgument();
2279 return -1;
2280 }
2281
2282 p = s;
2283 end = s + length;
2284 while (p < end) {
2285 register Py_UNICODE ch = *p++;
2286 int decimal;
2287
2288 if (Py_UNICODE_ISSPACE(ch)) {
2289 *output++ = ' ';
2290 continue;
2291 }
2292 decimal = Py_UNICODE_TODECIMAL(ch);
2293 if (decimal >= 0) {
2294 *output++ = '0' + decimal;
2295 continue;
2296 }
Guido van Rossumba477042000-04-06 18:18:10 +00002297 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002298 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002299 continue;
2300 }
2301 /* All other characters are considered invalid */
2302 if (errors == NULL || strcmp(errors, "strict") == 0) {
2303 PyErr_SetString(PyExc_ValueError,
2304 "invalid decimal Unicode string");
2305 goto onError;
2306 }
2307 else if (strcmp(errors, "ignore") == 0)
2308 continue;
2309 else if (strcmp(errors, "replace") == 0) {
2310 *output++ = '?';
2311 continue;
2312 }
2313 }
2314 /* 0-terminate the output string */
2315 *output++ = '\0';
2316 return 0;
2317
2318 onError:
2319 return -1;
2320}
2321
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322/* --- Helpers ------------------------------------------------------------ */
2323
2324static
2325int count(PyUnicodeObject *self,
2326 int start,
2327 int end,
2328 PyUnicodeObject *substring)
2329{
2330 int count = 0;
2331
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002332 if (start < 0)
2333 start += self->length;
2334 if (start < 0)
2335 start = 0;
2336 if (end > self->length)
2337 end = self->length;
2338 if (end < 0)
2339 end += self->length;
2340 if (end < 0)
2341 end = 0;
2342
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002343 if (substring->length == 0)
2344 return (end - start + 1);
2345
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346 end -= substring->length;
2347
2348 while (start <= end)
2349 if (Py_UNICODE_MATCH(self, start, substring)) {
2350 count++;
2351 start += substring->length;
2352 } else
2353 start++;
2354
2355 return count;
2356}
2357
2358int PyUnicode_Count(PyObject *str,
2359 PyObject *substr,
2360 int start,
2361 int end)
2362{
2363 int result;
2364
2365 str = PyUnicode_FromObject(str);
2366 if (str == NULL)
2367 return -1;
2368 substr = PyUnicode_FromObject(substr);
2369 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002370 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371 return -1;
2372 }
2373
2374 result = count((PyUnicodeObject *)str,
2375 start, end,
2376 (PyUnicodeObject *)substr);
2377
2378 Py_DECREF(str);
2379 Py_DECREF(substr);
2380 return result;
2381}
2382
2383static
2384int findstring(PyUnicodeObject *self,
2385 PyUnicodeObject *substring,
2386 int start,
2387 int end,
2388 int direction)
2389{
2390 if (start < 0)
2391 start += self->length;
2392 if (start < 0)
2393 start = 0;
2394
2395 if (substring->length == 0)
2396 return start;
2397
2398 if (end > self->length)
2399 end = self->length;
2400 if (end < 0)
2401 end += self->length;
2402 if (end < 0)
2403 end = 0;
2404
2405 end -= substring->length;
2406
2407 if (direction < 0) {
2408 for (; end >= start; end--)
2409 if (Py_UNICODE_MATCH(self, end, substring))
2410 return end;
2411 } else {
2412 for (; start <= end; start++)
2413 if (Py_UNICODE_MATCH(self, start, substring))
2414 return start;
2415 }
2416
2417 return -1;
2418}
2419
2420int PyUnicode_Find(PyObject *str,
2421 PyObject *substr,
2422 int start,
2423 int end,
2424 int direction)
2425{
2426 int result;
2427
2428 str = PyUnicode_FromObject(str);
2429 if (str == NULL)
2430 return -1;
2431 substr = PyUnicode_FromObject(substr);
2432 if (substr == NULL) {
2433 Py_DECREF(substr);
2434 return -1;
2435 }
2436
2437 result = findstring((PyUnicodeObject *)str,
2438 (PyUnicodeObject *)substr,
2439 start, end, direction);
2440 Py_DECREF(str);
2441 Py_DECREF(substr);
2442 return result;
2443}
2444
2445static
2446int tailmatch(PyUnicodeObject *self,
2447 PyUnicodeObject *substring,
2448 int start,
2449 int end,
2450 int direction)
2451{
2452 if (start < 0)
2453 start += self->length;
2454 if (start < 0)
2455 start = 0;
2456
2457 if (substring->length == 0)
2458 return 1;
2459
2460 if (end > self->length)
2461 end = self->length;
2462 if (end < 0)
2463 end += self->length;
2464 if (end < 0)
2465 end = 0;
2466
2467 end -= substring->length;
2468 if (end < start)
2469 return 0;
2470
2471 if (direction > 0) {
2472 if (Py_UNICODE_MATCH(self, end, substring))
2473 return 1;
2474 } else {
2475 if (Py_UNICODE_MATCH(self, start, substring))
2476 return 1;
2477 }
2478
2479 return 0;
2480}
2481
2482int PyUnicode_Tailmatch(PyObject *str,
2483 PyObject *substr,
2484 int start,
2485 int end,
2486 int direction)
2487{
2488 int result;
2489
2490 str = PyUnicode_FromObject(str);
2491 if (str == NULL)
2492 return -1;
2493 substr = PyUnicode_FromObject(substr);
2494 if (substr == NULL) {
2495 Py_DECREF(substr);
2496 return -1;
2497 }
2498
2499 result = tailmatch((PyUnicodeObject *)str,
2500 (PyUnicodeObject *)substr,
2501 start, end, direction);
2502 Py_DECREF(str);
2503 Py_DECREF(substr);
2504 return result;
2505}
2506
2507static
2508const Py_UNICODE *findchar(const Py_UNICODE *s,
2509 int size,
2510 Py_UNICODE ch)
2511{
2512 /* like wcschr, but doesn't stop at NULL characters */
2513
2514 while (size-- > 0) {
2515 if (*s == ch)
2516 return s;
2517 s++;
2518 }
2519
2520 return NULL;
2521}
2522
2523/* Apply fixfct filter to the Unicode object self and return a
2524 reference to the modified object */
2525
2526static
2527PyObject *fixup(PyUnicodeObject *self,
2528 int (*fixfct)(PyUnicodeObject *s))
2529{
2530
2531 PyUnicodeObject *u;
2532
2533 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2534 self->length);
2535 if (u == NULL)
2536 return NULL;
2537 if (!fixfct(u)) {
2538 /* fixfct should return TRUE if it modified the buffer. If
2539 FALSE, return a reference to the original buffer instead
2540 (to save space, not time) */
2541 Py_INCREF(self);
2542 Py_DECREF(u);
2543 return (PyObject*) self;
2544 }
2545 return (PyObject*) u;
2546}
2547
2548static
2549int fixupper(PyUnicodeObject *self)
2550{
2551 int len = self->length;
2552 Py_UNICODE *s = self->str;
2553 int status = 0;
2554
2555 while (len-- > 0) {
2556 register Py_UNICODE ch;
2557
2558 ch = Py_UNICODE_TOUPPER(*s);
2559 if (ch != *s) {
2560 status = 1;
2561 *s = ch;
2562 }
2563 s++;
2564 }
2565
2566 return status;
2567}
2568
2569static
2570int fixlower(PyUnicodeObject *self)
2571{
2572 int len = self->length;
2573 Py_UNICODE *s = self->str;
2574 int status = 0;
2575
2576 while (len-- > 0) {
2577 register Py_UNICODE ch;
2578
2579 ch = Py_UNICODE_TOLOWER(*s);
2580 if (ch != *s) {
2581 status = 1;
2582 *s = ch;
2583 }
2584 s++;
2585 }
2586
2587 return status;
2588}
2589
2590static
2591int fixswapcase(PyUnicodeObject *self)
2592{
2593 int len = self->length;
2594 Py_UNICODE *s = self->str;
2595 int status = 0;
2596
2597 while (len-- > 0) {
2598 if (Py_UNICODE_ISUPPER(*s)) {
2599 *s = Py_UNICODE_TOLOWER(*s);
2600 status = 1;
2601 } else if (Py_UNICODE_ISLOWER(*s)) {
2602 *s = Py_UNICODE_TOUPPER(*s);
2603 status = 1;
2604 }
2605 s++;
2606 }
2607
2608 return status;
2609}
2610
2611static
2612int fixcapitalize(PyUnicodeObject *self)
2613{
2614 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2615 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2616 return 1;
2617 }
2618 return 0;
2619}
2620
2621static
2622int fixtitle(PyUnicodeObject *self)
2623{
2624 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2625 register Py_UNICODE *e;
2626 int previous_is_cased;
2627
2628 /* Shortcut for single character strings */
2629 if (PyUnicode_GET_SIZE(self) == 1) {
2630 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2631 if (*p != ch) {
2632 *p = ch;
2633 return 1;
2634 }
2635 else
2636 return 0;
2637 }
2638
2639 e = p + PyUnicode_GET_SIZE(self);
2640 previous_is_cased = 0;
2641 for (; p < e; p++) {
2642 register const Py_UNICODE ch = *p;
2643
2644 if (previous_is_cased)
2645 *p = Py_UNICODE_TOLOWER(ch);
2646 else
2647 *p = Py_UNICODE_TOTITLE(ch);
2648
2649 if (Py_UNICODE_ISLOWER(ch) ||
2650 Py_UNICODE_ISUPPER(ch) ||
2651 Py_UNICODE_ISTITLE(ch))
2652 previous_is_cased = 1;
2653 else
2654 previous_is_cased = 0;
2655 }
2656 return 1;
2657}
2658
2659PyObject *PyUnicode_Join(PyObject *separator,
2660 PyObject *seq)
2661{
2662 Py_UNICODE *sep;
2663 int seplen;
2664 PyUnicodeObject *res = NULL;
2665 int reslen = 0;
2666 Py_UNICODE *p;
2667 int seqlen = 0;
2668 int sz = 100;
2669 int i;
2670
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002671 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 if (seqlen < 0 && PyErr_Occurred())
2673 return NULL;
2674
2675 if (separator == NULL) {
2676 Py_UNICODE blank = ' ';
2677 sep = &blank;
2678 seplen = 1;
2679 }
2680 else {
2681 separator = PyUnicode_FromObject(separator);
2682 if (separator == NULL)
2683 return NULL;
2684 sep = PyUnicode_AS_UNICODE(separator);
2685 seplen = PyUnicode_GET_SIZE(separator);
2686 }
2687
2688 res = _PyUnicode_New(sz);
2689 if (res == NULL)
2690 goto onError;
2691 p = PyUnicode_AS_UNICODE(res);
2692 reslen = 0;
2693
2694 for (i = 0; i < seqlen; i++) {
2695 int itemlen;
2696 PyObject *item;
2697
2698 item = PySequence_GetItem(seq, i);
2699 if (item == NULL)
2700 goto onError;
2701 if (!PyUnicode_Check(item)) {
2702 PyObject *v;
2703 v = PyUnicode_FromObject(item);
2704 Py_DECREF(item);
2705 item = v;
2706 if (item == NULL)
2707 goto onError;
2708 }
2709 itemlen = PyUnicode_GET_SIZE(item);
2710 while (reslen + itemlen + seplen >= sz) {
2711 if (_PyUnicode_Resize(res, sz*2))
2712 goto onError;
2713 sz *= 2;
2714 p = PyUnicode_AS_UNICODE(res) + reslen;
2715 }
2716 if (i > 0) {
2717 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2718 p += seplen;
2719 reslen += seplen;
2720 }
2721 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2722 p += itemlen;
2723 reslen += itemlen;
2724 Py_DECREF(item);
2725 }
2726 if (_PyUnicode_Resize(res, reslen))
2727 goto onError;
2728
2729 Py_XDECREF(separator);
2730 return (PyObject *)res;
2731
2732 onError:
2733 Py_XDECREF(separator);
2734 Py_DECREF(res);
2735 return NULL;
2736}
2737
2738static
2739PyUnicodeObject *pad(PyUnicodeObject *self,
2740 int left,
2741 int right,
2742 Py_UNICODE fill)
2743{
2744 PyUnicodeObject *u;
2745
2746 if (left < 0)
2747 left = 0;
2748 if (right < 0)
2749 right = 0;
2750
2751 if (left == 0 && right == 0) {
2752 Py_INCREF(self);
2753 return self;
2754 }
2755
2756 u = _PyUnicode_New(left + self->length + right);
2757 if (u) {
2758 if (left)
2759 Py_UNICODE_FILL(u->str, fill, left);
2760 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2761 if (right)
2762 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2763 }
2764
2765 return u;
2766}
2767
/* Append the slice data[left:right] to the list being built.

   Expects the expansion site to provide a PyObject *str scratch
   variable, a PyObject *list and an onError label; jumps to onError if
   the slice cannot be created or appended.  Wrapped in do/while(0) so
   that it behaves as a single statement; invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2778
2779static
2780PyObject *split_whitespace(PyUnicodeObject *self,
2781 PyObject *list,
2782 int maxcount)
2783{
2784 register int i;
2785 register int j;
2786 int len = self->length;
2787 PyObject *str;
2788
2789 for (i = j = 0; i < len; ) {
2790 /* find a token */
2791 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2792 i++;
2793 j = i;
2794 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2795 i++;
2796 if (j < i) {
2797 if (maxcount-- <= 0)
2798 break;
2799 SPLIT_APPEND(self->str, j, i);
2800 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2801 i++;
2802 j = i;
2803 }
2804 }
2805 if (j < len) {
2806 SPLIT_APPEND(self->str, j, len);
2807 }
2808 return list;
2809
2810 onError:
2811 Py_DECREF(list);
2812 return NULL;
2813}
2814
2815PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002816 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817{
2818 register int i;
2819 register int j;
2820 int len;
2821 PyObject *list;
2822 PyObject *str;
2823 Py_UNICODE *data;
2824
2825 string = PyUnicode_FromObject(string);
2826 if (string == NULL)
2827 return NULL;
2828 data = PyUnicode_AS_UNICODE(string);
2829 len = PyUnicode_GET_SIZE(string);
2830
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 list = PyList_New(0);
2832 if (!list)
2833 goto onError;
2834
2835 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002836 int eol;
2837
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 /* Find a line and append it */
2839 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2840 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841
2842 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002843 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 if (i < len) {
2845 if (data[i] == '\r' && i + 1 < len &&
2846 data[i+1] == '\n')
2847 i += 2;
2848 else
2849 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002850 if (keepends)
2851 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 }
Guido van Rossum86662912000-04-11 15:38:46 +00002853 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 j = i;
2855 }
2856 if (j < len) {
2857 SPLIT_APPEND(data, j, len);
2858 }
2859
2860 Py_DECREF(string);
2861 return list;
2862
2863 onError:
2864 Py_DECREF(list);
2865 Py_DECREF(string);
2866 return NULL;
2867}
2868
2869static
2870PyObject *split_char(PyUnicodeObject *self,
2871 PyObject *list,
2872 Py_UNICODE ch,
2873 int maxcount)
2874{
2875 register int i;
2876 register int j;
2877 int len = self->length;
2878 PyObject *str;
2879
2880 for (i = j = 0; i < len; ) {
2881 if (self->str[i] == ch) {
2882 if (maxcount-- <= 0)
2883 break;
2884 SPLIT_APPEND(self->str, j, i);
2885 i = j = i + 1;
2886 } else
2887 i++;
2888 }
2889 if (j <= len) {
2890 SPLIT_APPEND(self->str, j, len);
2891 }
2892 return list;
2893
2894 onError:
2895 Py_DECREF(list);
2896 return NULL;
2897}
2898
2899static
2900PyObject *split_substring(PyUnicodeObject *self,
2901 PyObject *list,
2902 PyUnicodeObject *substring,
2903 int maxcount)
2904{
2905 register int i;
2906 register int j;
2907 int len = self->length;
2908 int sublen = substring->length;
2909 PyObject *str;
2910
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002911 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 if (Py_UNICODE_MATCH(self, i, substring)) {
2913 if (maxcount-- <= 0)
2914 break;
2915 SPLIT_APPEND(self->str, j, i);
2916 i = j = i + sublen;
2917 } else
2918 i++;
2919 }
2920 if (j <= len) {
2921 SPLIT_APPEND(self->str, j, len);
2922 }
2923 return list;
2924
2925 onError:
2926 Py_DECREF(list);
2927 return NULL;
2928}
2929
2930#undef SPLIT_APPEND
2931
2932static
2933PyObject *split(PyUnicodeObject *self,
2934 PyUnicodeObject *substring,
2935 int maxcount)
2936{
2937 PyObject *list;
2938
2939 if (maxcount < 0)
2940 maxcount = INT_MAX;
2941
2942 list = PyList_New(0);
2943 if (!list)
2944 return NULL;
2945
2946 if (substring == NULL)
2947 return split_whitespace(self,list,maxcount);
2948
2949 else if (substring->length == 1)
2950 return split_char(self,list,substring->str[0],maxcount);
2951
2952 else if (substring->length == 0) {
2953 Py_DECREF(list);
2954 PyErr_SetString(PyExc_ValueError, "empty separator");
2955 return NULL;
2956 }
2957 else
2958 return split_substring(self,list,substring,maxcount);
2959}
2960
2961static
2962PyObject *strip(PyUnicodeObject *self,
2963 int left,
2964 int right)
2965{
2966 Py_UNICODE *p = self->str;
2967 int start = 0;
2968 int end = self->length;
2969
2970 if (left)
2971 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2972 start++;
2973
2974 if (right)
2975 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2976 end--;
2977
2978 if (start == 0 && end == self->length) {
2979 /* couldn't strip anything off, return original string */
2980 Py_INCREF(self);
2981 return (PyObject*) self;
2982 }
2983
2984 return (PyObject*) PyUnicode_FromUnicode(
2985 self->str + start,
2986 end - start
2987 );
2988}
2989
2990static
2991PyObject *replace(PyUnicodeObject *self,
2992 PyUnicodeObject *str1,
2993 PyUnicodeObject *str2,
2994 int maxcount)
2995{
2996 PyUnicodeObject *u;
2997
2998 if (maxcount < 0)
2999 maxcount = INT_MAX;
3000
3001 if (str1->length == 1 && str2->length == 1) {
3002 int i;
3003
3004 /* replace characters */
3005 if (!findchar(self->str, self->length, str1->str[0])) {
3006 /* nothing to replace, return original string */
3007 Py_INCREF(self);
3008 u = self;
3009 } else {
3010 Py_UNICODE u1 = str1->str[0];
3011 Py_UNICODE u2 = str2->str[0];
3012
3013 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3014 self->str,
3015 self->length
3016 );
3017 if (u)
3018 for (i = 0; i < u->length; i++)
3019 if (u->str[i] == u1) {
3020 if (--maxcount < 0)
3021 break;
3022 u->str[i] = u2;
3023 }
3024 }
3025
3026 } else {
3027 int n, i;
3028 Py_UNICODE *p;
3029
3030 /* replace strings */
3031 n = count(self, 0, self->length, str1);
3032 if (n > maxcount)
3033 n = maxcount;
3034 if (n == 0) {
3035 /* nothing to replace, return original string */
3036 Py_INCREF(self);
3037 u = self;
3038 } else {
3039 u = _PyUnicode_New(
3040 self->length + n * (str2->length - str1->length));
3041 if (u) {
3042 i = 0;
3043 p = u->str;
3044 while (i <= self->length - str1->length)
3045 if (Py_UNICODE_MATCH(self, i, str1)) {
3046 /* replace string segment */
3047 Py_UNICODE_COPY(p, str2->str, str2->length);
3048 p += str2->length;
3049 i += str1->length;
3050 if (--n <= 0) {
3051 /* copy remaining part */
3052 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3053 break;
3054 }
3055 } else
3056 *p++ = self->str[i++];
3057 }
3058 }
3059 }
3060
3061 return (PyObject *) u;
3062}
3063
3064/* --- Unicode Object Methods --------------------------------------------- */
3065
3066static char title__doc__[] =
3067"S.title() -> unicode\n\
3068\n\
3069Return a titlecased version of S, i.e. words start with title case\n\
3070characters, all remaining cased characters have lower case.";
3071
3072static PyObject*
3073unicode_title(PyUnicodeObject *self, PyObject *args)
3074{
3075 if (!PyArg_NoArgs(args))
3076 return NULL;
3077 return fixup(self, fixtitle);
3078}
3079
3080static char capitalize__doc__[] =
3081"S.capitalize() -> unicode\n\
3082\n\
3083Return a capitalized version of S, i.e. make the first character\n\
3084have upper case.";
3085
3086static PyObject*
3087unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3088{
3089 if (!PyArg_NoArgs(args))
3090 return NULL;
3091 return fixup(self, fixcapitalize);
3092}
3093
#if 0
/* Disabled: S.capwords() is compiled out. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *old = PyList_GET_ITEM(words, i);
        PyObject *cap = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (cap == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, i, cap);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3134
3135static char center__doc__[] =
3136"S.center(width) -> unicode\n\
3137\n\
3138Return S centered in a Unicode string of length width. Padding is done\n\
3139using spaces.";
3140
3141static PyObject *
3142unicode_center(PyUnicodeObject *self, PyObject *args)
3143{
3144 int marg, left;
3145 int width;
3146
3147 if (!PyArg_ParseTuple(args, "i:center", &width))
3148 return NULL;
3149
3150 if (self->length >= width) {
3151 Py_INCREF(self);
3152 return (PyObject*) self;
3153 }
3154
3155 marg = width - self->length;
3156 left = marg / 2 + (marg & width & 1);
3157
3158 return (PyObject*) pad(self, left, marg - left, ' ');
3159}
3160
Marc-André Lemburge5034372000-08-08 08:04:29 +00003161#if 0
3162
3163/* This code should go into some future Unicode collation support
3164 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003165 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003166
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003167/* speedy UTF-16 code point order comparison */
3168/* gleaned from: */
3169/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3170
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003171static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003172{
3173 0, 0, 0, 0, 0, 0, 0, 0,
3174 0, 0, 0, 0, 0, 0, 0, 0,
3175 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003176 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003177};
3178
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179static int
3180unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3181{
3182 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003183
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184 Py_UNICODE *s1 = str1->str;
3185 Py_UNICODE *s2 = str2->str;
3186
3187 len1 = str1->length;
3188 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003189
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003191 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003192 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003193
3194 c1 = *s1++;
3195 c2 = *s2++;
3196 if (c1 > (1<<11) * 26)
3197 c1 += utf16Fixup[c1>>11];
3198 if (c2 > (1<<11) * 26)
3199 c2 += utf16Fixup[c2>>11];
3200
3201 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003202 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003203 if (diff)
3204 return (diff < 0) ? -1 : (diff != 0);
3205 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 }
3207
3208 return (len1 < len2) ? -1 : (len1 != len2);
3209}
3210
Marc-André Lemburge5034372000-08-08 08:04:29 +00003211#else
3212
3213static int
3214unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3215{
3216 register int len1, len2;
3217
3218 Py_UNICODE *s1 = str1->str;
3219 Py_UNICODE *s2 = str2->str;
3220
3221 len1 = str1->length;
3222 len2 = str2->length;
3223
3224 while (len1 > 0 && len2 > 0) {
3225 register long diff;
3226
3227 diff = (long)*s1++ - (long)*s2++;
3228 if (diff)
3229 return (diff < 0) ? -1 : (diff != 0);
3230 len1--; len2--;
3231 }
3232
3233 return (len1 < len2) ? -1 : (len1 != len2);
3234}
3235
3236#endif
3237
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238int PyUnicode_Compare(PyObject *left,
3239 PyObject *right)
3240{
3241 PyUnicodeObject *u = NULL, *v = NULL;
3242 int result;
3243
3244 /* Coerce the two arguments */
3245 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3246 if (u == NULL)
3247 goto onError;
3248 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3249 if (v == NULL)
3250 goto onError;
3251
Thomas Wouters7e474022000-07-16 12:04:32 +00003252 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 if (v == u) {
3254 Py_DECREF(u);
3255 Py_DECREF(v);
3256 return 0;
3257 }
3258
3259 result = unicode_compare(u, v);
3260
3261 Py_DECREF(u);
3262 Py_DECREF(v);
3263 return result;
3264
3265onError:
3266 Py_XDECREF(u);
3267 Py_XDECREF(v);
3268 return -1;
3269}
3270
Guido van Rossum403d68b2000-03-13 15:55:09 +00003271int PyUnicode_Contains(PyObject *container,
3272 PyObject *element)
3273{
3274 PyUnicodeObject *u = NULL, *v = NULL;
3275 int result;
3276 register const Py_UNICODE *p, *e;
3277 register Py_UNICODE ch;
3278
3279 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003280 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003281 if (v == NULL) {
3282 PyErr_SetString(PyExc_TypeError,
3283 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003284 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003285 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003286 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3287 if (u == NULL) {
3288 Py_DECREF(v);
3289 goto onError;
3290 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003291
3292 /* Check v in u */
3293 if (PyUnicode_GET_SIZE(v) != 1) {
3294 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003295 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003296 goto onError;
3297 }
3298 ch = *PyUnicode_AS_UNICODE(v);
3299 p = PyUnicode_AS_UNICODE(u);
3300 e = p + PyUnicode_GET_SIZE(u);
3301 result = 0;
3302 while (p < e) {
3303 if (*p++ == ch) {
3304 result = 1;
3305 break;
3306 }
3307 }
3308
3309 Py_DECREF(u);
3310 Py_DECREF(v);
3311 return result;
3312
3313onError:
3314 Py_XDECREF(u);
3315 Py_XDECREF(v);
3316 return -1;
3317}
3318
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319/* Concat to string or Unicode object giving a new Unicode object. */
3320
3321PyObject *PyUnicode_Concat(PyObject *left,
3322 PyObject *right)
3323{
3324 PyUnicodeObject *u = NULL, *v = NULL, *w;
3325
3326 /* Coerce the two arguments */
3327 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3328 if (u == NULL)
3329 goto onError;
3330 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3331 if (v == NULL)
3332 goto onError;
3333
3334 /* Shortcuts */
3335 if (v == unicode_empty) {
3336 Py_DECREF(v);
3337 return (PyObject *)u;
3338 }
3339 if (u == unicode_empty) {
3340 Py_DECREF(u);
3341 return (PyObject *)v;
3342 }
3343
3344 /* Concat the two Unicode strings */
3345 w = _PyUnicode_New(u->length + v->length);
3346 if (w == NULL)
3347 goto onError;
3348 Py_UNICODE_COPY(w->str, u->str, u->length);
3349 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3350
3351 Py_DECREF(u);
3352 Py_DECREF(v);
3353 return (PyObject *)w;
3354
3355onError:
3356 Py_XDECREF(u);
3357 Py_XDECREF(v);
3358 return NULL;
3359}
3360
3361static char count__doc__[] =
3362"S.count(sub[, start[, end]]) -> int\n\
3363\n\
3364Return the number of occurrences of substring sub in Unicode string\n\
3365S[start:end]. Optional arguments start and end are\n\
3366interpreted as in slice notation.";
3367
3368static PyObject *
3369unicode_count(PyUnicodeObject *self, PyObject *args)
3370{
3371 PyUnicodeObject *substring;
3372 int start = 0;
3373 int end = INT_MAX;
3374 PyObject *result;
3375
Guido van Rossumb8872e62000-05-09 14:14:27 +00003376 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3377 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 return NULL;
3379
3380 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3381 (PyObject *)substring);
3382 if (substring == NULL)
3383 return NULL;
3384
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 if (start < 0)
3386 start += self->length;
3387 if (start < 0)
3388 start = 0;
3389 if (end > self->length)
3390 end = self->length;
3391 if (end < 0)
3392 end += self->length;
3393 if (end < 0)
3394 end = 0;
3395
3396 result = PyInt_FromLong((long) count(self, start, end, substring));
3397
3398 Py_DECREF(substring);
3399 return result;
3400}
3401
3402static char encode__doc__[] =
3403"S.encode([encoding[,errors]]) -> string\n\
3404\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003405Return an encoded string version of S. Default encoding is the current\n\
3406default string encoding. errors may be given to set a different error\n\
3407handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3408a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409
3410static PyObject *
3411unicode_encode(PyUnicodeObject *self, PyObject *args)
3412{
3413 char *encoding = NULL;
3414 char *errors = NULL;
3415 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3416 return NULL;
3417 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3418}
3419
3420static char expandtabs__doc__[] =
3421"S.expandtabs([tabsize]) -> unicode\n\
3422\n\
3423Return a copy of S where all tab characters are expanded using spaces.\n\
3424If tabsize is not given, a tab size of 8 characters is assumed.";
3425
3426static PyObject*
3427unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3428{
3429 Py_UNICODE *e;
3430 Py_UNICODE *p;
3431 Py_UNICODE *q;
3432 int i, j;
3433 PyUnicodeObject *u;
3434 int tabsize = 8;
3435
3436 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3437 return NULL;
3438
Thomas Wouters7e474022000-07-16 12:04:32 +00003439 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 i = j = 0;
3441 e = self->str + self->length;
3442 for (p = self->str; p < e; p++)
3443 if (*p == '\t') {
3444 if (tabsize > 0)
3445 j += tabsize - (j % tabsize);
3446 }
3447 else {
3448 j++;
3449 if (*p == '\n' || *p == '\r') {
3450 i += j;
3451 j = 0;
3452 }
3453 }
3454
3455 /* Second pass: create output string and fill it */
3456 u = _PyUnicode_New(i + j);
3457 if (!u)
3458 return NULL;
3459
3460 j = 0;
3461 q = u->str;
3462
3463 for (p = self->str; p < e; p++)
3464 if (*p == '\t') {
3465 if (tabsize > 0) {
3466 i = tabsize - (j % tabsize);
3467 j += i;
3468 while (i--)
3469 *q++ = ' ';
3470 }
3471 }
3472 else {
3473 j++;
3474 *q++ = *p;
3475 if (*p == '\n' || *p == '\r')
3476 j = 0;
3477 }
3478
3479 return (PyObject*) u;
3480}
3481
3482static char find__doc__[] =
3483"S.find(sub [,start [,end]]) -> int\n\
3484\n\
3485Return the lowest index in S where substring sub is found,\n\
3486such that sub is contained within s[start,end]. Optional\n\
3487arguments start and end are interpreted as in slice notation.\n\
3488\n\
3489Return -1 on failure.";
3490
3491static PyObject *
3492unicode_find(PyUnicodeObject *self, PyObject *args)
3493{
3494 PyUnicodeObject *substring;
3495 int start = 0;
3496 int end = INT_MAX;
3497 PyObject *result;
3498
Guido van Rossumb8872e62000-05-09 14:14:27 +00003499 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3500 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 return NULL;
3502 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3503 (PyObject *)substring);
3504 if (substring == NULL)
3505 return NULL;
3506
3507 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3508
3509 Py_DECREF(substring);
3510 return result;
3511}
3512
3513static PyObject *
3514unicode_getitem(PyUnicodeObject *self, int index)
3515{
3516 if (index < 0 || index >= self->length) {
3517 PyErr_SetString(PyExc_IndexError, "string index out of range");
3518 return NULL;
3519 }
3520
3521 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3522}
3523
3524static long
3525unicode_hash(PyUnicodeObject *self)
3526{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003527 /* Since Unicode objects compare equal to their ASCII string
3528 counterparts, they should use the individual character values
3529 as basis for their hash value. This is needed to assure that
3530 strings and Unicode objects behave in the same way as
3531 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532
Fredrik Lundhdde61642000-07-10 18:27:47 +00003533 register int len;
3534 register Py_UNICODE *p;
3535 register long x;
3536
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 if (self->hash != -1)
3538 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003539 len = PyUnicode_GET_SIZE(self);
3540 p = PyUnicode_AS_UNICODE(self);
3541 x = *p << 7;
3542 while (--len >= 0)
3543 x = (1000003*x) ^ *p++;
3544 x ^= PyUnicode_GET_SIZE(self);
3545 if (x == -1)
3546 x = -2;
3547 self->hash = x;
3548 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549}
3550
3551static char index__doc__[] =
3552"S.index(sub [,start [,end]]) -> int\n\
3553\n\
3554Like S.find() but raise ValueError when the substring is not found.";
3555
3556static PyObject *
3557unicode_index(PyUnicodeObject *self, PyObject *args)
3558{
3559 int result;
3560 PyUnicodeObject *substring;
3561 int start = 0;
3562 int end = INT_MAX;
3563
Guido van Rossumb8872e62000-05-09 14:14:27 +00003564 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3565 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 return NULL;
3567
3568 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3569 (PyObject *)substring);
3570 if (substring == NULL)
3571 return NULL;
3572
3573 result = findstring(self, substring, start, end, 1);
3574
3575 Py_DECREF(substring);
3576 if (result < 0) {
3577 PyErr_SetString(PyExc_ValueError, "substring not found");
3578 return NULL;
3579 }
3580 return PyInt_FromLong(result);
3581}
3582
3583static char islower__doc__[] =
3584"S.islower() -> int\n\
3585\n\
3586Return 1 if all cased characters in S are lowercase and there is\n\
3587at least one cased character in S, 0 otherwise.";
3588
3589static PyObject*
3590unicode_islower(PyUnicodeObject *self, PyObject *args)
3591{
3592 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3593 register const Py_UNICODE *e;
3594 int cased;
3595
3596 if (!PyArg_NoArgs(args))
3597 return NULL;
3598
3599 /* Shortcut for single character strings */
3600 if (PyUnicode_GET_SIZE(self) == 1)
3601 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3602
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003603 /* Special case for empty strings */
3604 if (PyString_GET_SIZE(self) == 0)
3605 return PyInt_FromLong(0);
3606
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 e = p + PyUnicode_GET_SIZE(self);
3608 cased = 0;
3609 for (; p < e; p++) {
3610 register const Py_UNICODE ch = *p;
3611
3612 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3613 return PyInt_FromLong(0);
3614 else if (!cased && Py_UNICODE_ISLOWER(ch))
3615 cased = 1;
3616 }
3617 return PyInt_FromLong(cased);
3618}
3619
3620static char isupper__doc__[] =
3621"S.isupper() -> int\n\
3622\n\
3623Return 1 if all cased characters in S are uppercase and there is\n\
3624at least one cased character in S, 0 otherwise.";
3625
3626static PyObject*
3627unicode_isupper(PyUnicodeObject *self, PyObject *args)
3628{
3629 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3630 register const Py_UNICODE *e;
3631 int cased;
3632
3633 if (!PyArg_NoArgs(args))
3634 return NULL;
3635
3636 /* Shortcut for single character strings */
3637 if (PyUnicode_GET_SIZE(self) == 1)
3638 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3639
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003640 /* Special case for empty strings */
3641 if (PyString_GET_SIZE(self) == 0)
3642 return PyInt_FromLong(0);
3643
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 e = p + PyUnicode_GET_SIZE(self);
3645 cased = 0;
3646 for (; p < e; p++) {
3647 register const Py_UNICODE ch = *p;
3648
3649 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3650 return PyInt_FromLong(0);
3651 else if (!cased && Py_UNICODE_ISUPPER(ch))
3652 cased = 1;
3653 }
3654 return PyInt_FromLong(cased);
3655}
3656
3657static char istitle__doc__[] =
3658"S.istitle() -> int\n\
3659\n\
3660Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3661may only follow uncased characters and lowercase characters only cased\n\
3662ones. Return 0 otherwise.";
3663
3664static PyObject*
3665unicode_istitle(PyUnicodeObject *self, PyObject *args)
3666{
3667 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3668 register const Py_UNICODE *e;
3669 int cased, previous_is_cased;
3670
3671 if (!PyArg_NoArgs(args))
3672 return NULL;
3673
3674 /* Shortcut for single character strings */
3675 if (PyUnicode_GET_SIZE(self) == 1)
3676 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3677 (Py_UNICODE_ISUPPER(*p) != 0));
3678
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003679 /* Special case for empty strings */
3680 if (PyString_GET_SIZE(self) == 0)
3681 return PyInt_FromLong(0);
3682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 e = p + PyUnicode_GET_SIZE(self);
3684 cased = 0;
3685 previous_is_cased = 0;
3686 for (; p < e; p++) {
3687 register const Py_UNICODE ch = *p;
3688
3689 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3690 if (previous_is_cased)
3691 return PyInt_FromLong(0);
3692 previous_is_cased = 1;
3693 cased = 1;
3694 }
3695 else if (Py_UNICODE_ISLOWER(ch)) {
3696 if (!previous_is_cased)
3697 return PyInt_FromLong(0);
3698 previous_is_cased = 1;
3699 cased = 1;
3700 }
3701 else
3702 previous_is_cased = 0;
3703 }
3704 return PyInt_FromLong(cased);
3705}
3706
3707static char isspace__doc__[] =
3708"S.isspace() -> int\n\
3709\n\
3710Return 1 if there are only whitespace characters in S,\n\
37110 otherwise.";
3712
3713static PyObject*
3714unicode_isspace(PyUnicodeObject *self, PyObject *args)
3715{
3716 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3717 register const Py_UNICODE *e;
3718
3719 if (!PyArg_NoArgs(args))
3720 return NULL;
3721
3722 /* Shortcut for single character strings */
3723 if (PyUnicode_GET_SIZE(self) == 1 &&
3724 Py_UNICODE_ISSPACE(*p))
3725 return PyInt_FromLong(1);
3726
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003727 /* Special case for empty strings */
3728 if (PyString_GET_SIZE(self) == 0)
3729 return PyInt_FromLong(0);
3730
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 e = p + PyUnicode_GET_SIZE(self);
3732 for (; p < e; p++) {
3733 if (!Py_UNICODE_ISSPACE(*p))
3734 return PyInt_FromLong(0);
3735 }
3736 return PyInt_FromLong(1);
3737}
3738
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003739static char isalpha__doc__[] =
3740"S.isalpha() -> int\n\
3741\n\
3742Return 1 if all characters in S are alphabetic\n\
3743and there is at least one character in S, 0 otherwise.";
3744
3745static PyObject*
3746unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3747{
3748 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3749 register const Py_UNICODE *e;
3750
3751 if (!PyArg_NoArgs(args))
3752 return NULL;
3753
3754 /* Shortcut for single character strings */
3755 if (PyUnicode_GET_SIZE(self) == 1 &&
3756 Py_UNICODE_ISALPHA(*p))
3757 return PyInt_FromLong(1);
3758
3759 /* Special case for empty strings */
3760 if (PyString_GET_SIZE(self) == 0)
3761 return PyInt_FromLong(0);
3762
3763 e = p + PyUnicode_GET_SIZE(self);
3764 for (; p < e; p++) {
3765 if (!Py_UNICODE_ISALPHA(*p))
3766 return PyInt_FromLong(0);
3767 }
3768 return PyInt_FromLong(1);
3769}
3770
3771static char isalnum__doc__[] =
3772"S.isalnum() -> int\n\
3773\n\
3774Return 1 if all characters in S are alphanumeric\n\
3775and there is at least one character in S, 0 otherwise.";
3776
3777static PyObject*
3778unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3779{
3780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3781 register const Py_UNICODE *e;
3782
3783 if (!PyArg_NoArgs(args))
3784 return NULL;
3785
3786 /* Shortcut for single character strings */
3787 if (PyUnicode_GET_SIZE(self) == 1 &&
3788 Py_UNICODE_ISALNUM(*p))
3789 return PyInt_FromLong(1);
3790
3791 /* Special case for empty strings */
3792 if (PyString_GET_SIZE(self) == 0)
3793 return PyInt_FromLong(0);
3794
3795 e = p + PyUnicode_GET_SIZE(self);
3796 for (; p < e; p++) {
3797 if (!Py_UNICODE_ISALNUM(*p))
3798 return PyInt_FromLong(0);
3799 }
3800 return PyInt_FromLong(1);
3801}
3802
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803static char isdecimal__doc__[] =
3804"S.isdecimal() -> int\n\
3805\n\
3806Return 1 if there are only decimal characters in S,\n\
38070 otherwise.";
3808
3809static PyObject*
3810unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3811{
3812 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3813 register const Py_UNICODE *e;
3814
3815 if (!PyArg_NoArgs(args))
3816 return NULL;
3817
3818 /* Shortcut for single character strings */
3819 if (PyUnicode_GET_SIZE(self) == 1 &&
3820 Py_UNICODE_ISDECIMAL(*p))
3821 return PyInt_FromLong(1);
3822
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003823 /* Special case for empty strings */
3824 if (PyString_GET_SIZE(self) == 0)
3825 return PyInt_FromLong(0);
3826
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 e = p + PyUnicode_GET_SIZE(self);
3828 for (; p < e; p++) {
3829 if (!Py_UNICODE_ISDECIMAL(*p))
3830 return PyInt_FromLong(0);
3831 }
3832 return PyInt_FromLong(1);
3833}
3834
3835static char isdigit__doc__[] =
3836"S.isdigit() -> int\n\
3837\n\
3838Return 1 if there are only digit characters in S,\n\
38390 otherwise.";
3840
3841static PyObject*
3842unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3843{
3844 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3845 register const Py_UNICODE *e;
3846
3847 if (!PyArg_NoArgs(args))
3848 return NULL;
3849
3850 /* Shortcut for single character strings */
3851 if (PyUnicode_GET_SIZE(self) == 1 &&
3852 Py_UNICODE_ISDIGIT(*p))
3853 return PyInt_FromLong(1);
3854
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003855 /* Special case for empty strings */
3856 if (PyString_GET_SIZE(self) == 0)
3857 return PyInt_FromLong(0);
3858
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 e = p + PyUnicode_GET_SIZE(self);
3860 for (; p < e; p++) {
3861 if (!Py_UNICODE_ISDIGIT(*p))
3862 return PyInt_FromLong(0);
3863 }
3864 return PyInt_FromLong(1);
3865}
3866
3867static char isnumeric__doc__[] =
3868"S.isnumeric() -> int\n\
3869\n\
3870Return 1 if there are only numeric characters in S,\n\
38710 otherwise.";
3872
3873static PyObject*
3874unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3875{
3876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3877 register const Py_UNICODE *e;
3878
3879 if (!PyArg_NoArgs(args))
3880 return NULL;
3881
3882 /* Shortcut for single character strings */
3883 if (PyUnicode_GET_SIZE(self) == 1 &&
3884 Py_UNICODE_ISNUMERIC(*p))
3885 return PyInt_FromLong(1);
3886
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003887 /* Special case for empty strings */
3888 if (PyString_GET_SIZE(self) == 0)
3889 return PyInt_FromLong(0);
3890
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 e = p + PyUnicode_GET_SIZE(self);
3892 for (; p < e; p++) {
3893 if (!Py_UNICODE_ISNUMERIC(*p))
3894 return PyInt_FromLong(0);
3895 }
3896 return PyInt_FromLong(1);
3897}
3898
3899static char join__doc__[] =
3900"S.join(sequence) -> unicode\n\
3901\n\
3902Return a string which is the concatenation of the strings in the\n\
3903sequence. The separator between elements is S.";
3904
3905static PyObject*
3906unicode_join(PyUnicodeObject *self, PyObject *args)
3907{
3908 PyObject *data;
3909 if (!PyArg_ParseTuple(args, "O:join", &data))
3910 return NULL;
3911
3912 return PyUnicode_Join((PyObject *)self, data);
3913}
3914
3915static int
3916unicode_length(PyUnicodeObject *self)
3917{
3918 return self->length;
3919}
3920
3921static char ljust__doc__[] =
3922"S.ljust(width) -> unicode\n\
3923\n\
3924Return S left justified in a Unicode string of length width. Padding is\n\
3925done using spaces.";
3926
3927static PyObject *
3928unicode_ljust(PyUnicodeObject *self, PyObject *args)
3929{
3930 int width;
3931 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3932 return NULL;
3933
3934 if (self->length >= width) {
3935 Py_INCREF(self);
3936 return (PyObject*) self;
3937 }
3938
3939 return (PyObject*) pad(self, 0, width - self->length, ' ');
3940}
3941
3942static char lower__doc__[] =
3943"S.lower() -> unicode\n\
3944\n\
3945Return a copy of the string S converted to lowercase.";
3946
3947static PyObject*
3948unicode_lower(PyUnicodeObject *self, PyObject *args)
3949{
3950 if (!PyArg_NoArgs(args))
3951 return NULL;
3952 return fixup(self, fixlower);
3953}
3954
3955static char lstrip__doc__[] =
3956"S.lstrip() -> unicode\n\
3957\n\
3958Return a copy of the string S with leading whitespace removed.";
3959
3960static PyObject *
3961unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3962{
3963 if (!PyArg_NoArgs(args))
3964 return NULL;
3965 return strip(self, 1, 0);
3966}
3967
3968static PyObject*
3969unicode_repeat(PyUnicodeObject *str, int len)
3970{
3971 PyUnicodeObject *u;
3972 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003973 int nchars;
3974 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975
3976 if (len < 0)
3977 len = 0;
3978
3979 if (len == 1) {
3980 /* no repeat, return original string */
3981 Py_INCREF(str);
3982 return (PyObject*) str;
3983 }
Tim Peters8f422462000-09-09 06:13:41 +00003984
3985 /* ensure # of chars needed doesn't overflow int and # of bytes
3986 * needed doesn't overflow size_t
3987 */
3988 nchars = len * str->length;
3989 if (len && nchars / len != str->length) {
3990 PyErr_SetString(PyExc_OverflowError,
3991 "repeated string is too long");
3992 return NULL;
3993 }
3994 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
3995 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
3996 PyErr_SetString(PyExc_OverflowError,
3997 "repeated string is too long");
3998 return NULL;
3999 }
4000 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 if (!u)
4002 return NULL;
4003
4004 p = u->str;
4005
4006 while (len-- > 0) {
4007 Py_UNICODE_COPY(p, str->str, str->length);
4008 p += str->length;
4009 }
4010
4011 return (PyObject*) u;
4012}
4013
4014PyObject *PyUnicode_Replace(PyObject *obj,
4015 PyObject *subobj,
4016 PyObject *replobj,
4017 int maxcount)
4018{
4019 PyObject *self;
4020 PyObject *str1;
4021 PyObject *str2;
4022 PyObject *result;
4023
4024 self = PyUnicode_FromObject(obj);
4025 if (self == NULL)
4026 return NULL;
4027 str1 = PyUnicode_FromObject(subobj);
4028 if (str1 == NULL) {
4029 Py_DECREF(self);
4030 return NULL;
4031 }
4032 str2 = PyUnicode_FromObject(replobj);
4033 if (str2 == NULL) {
4034 Py_DECREF(self);
4035 Py_DECREF(str1);
4036 return NULL;
4037 }
4038 result = replace((PyUnicodeObject *)self,
4039 (PyUnicodeObject *)str1,
4040 (PyUnicodeObject *)str2,
4041 maxcount);
4042 Py_DECREF(self);
4043 Py_DECREF(str1);
4044 Py_DECREF(str2);
4045 return result;
4046}
4047
4048static char replace__doc__[] =
4049"S.replace (old, new[, maxsplit]) -> unicode\n\
4050\n\
4051Return a copy of S with all occurrences of substring\n\
4052old replaced by new. If the optional argument maxsplit is\n\
4053given, only the first maxsplit occurrences are replaced.";
4054
4055static PyObject*
4056unicode_replace(PyUnicodeObject *self, PyObject *args)
4057{
4058 PyUnicodeObject *str1;
4059 PyUnicodeObject *str2;
4060 int maxcount = -1;
4061 PyObject *result;
4062
4063 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4064 return NULL;
4065 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4066 if (str1 == NULL)
4067 return NULL;
4068 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4069 if (str2 == NULL)
4070 return NULL;
4071
4072 result = replace(self, str1, str2, maxcount);
4073
4074 Py_DECREF(str1);
4075 Py_DECREF(str2);
4076 return result;
4077}
4078
4079static
4080PyObject *unicode_repr(PyObject *unicode)
4081{
4082 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4083 PyUnicode_GET_SIZE(unicode),
4084 1);
4085}
4086
4087static char rfind__doc__[] =
4088"S.rfind(sub [,start [,end]]) -> int\n\
4089\n\
4090Return the highest index in S where substring sub is found,\n\
4091such that sub is contained within s[start,end]. Optional\n\
4092arguments start and end are interpreted as in slice notation.\n\
4093\n\
4094Return -1 on failure.";
4095
4096static PyObject *
4097unicode_rfind(PyUnicodeObject *self, PyObject *args)
4098{
4099 PyUnicodeObject *substring;
4100 int start = 0;
4101 int end = INT_MAX;
4102 PyObject *result;
4103
Guido van Rossumb8872e62000-05-09 14:14:27 +00004104 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4105 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 return NULL;
4107 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4108 (PyObject *)substring);
4109 if (substring == NULL)
4110 return NULL;
4111
4112 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4113
4114 Py_DECREF(substring);
4115 return result;
4116}
4117
4118static char rindex__doc__[] =
4119"S.rindex(sub [,start [,end]]) -> int\n\
4120\n\
4121Like S.rfind() but raise ValueError when the substring is not found.";
4122
4123static PyObject *
4124unicode_rindex(PyUnicodeObject *self, PyObject *args)
4125{
4126 int result;
4127 PyUnicodeObject *substring;
4128 int start = 0;
4129 int end = INT_MAX;
4130
Guido van Rossumb8872e62000-05-09 14:14:27 +00004131 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4132 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 return NULL;
4134 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4135 (PyObject *)substring);
4136 if (substring == NULL)
4137 return NULL;
4138
4139 result = findstring(self, substring, start, end, -1);
4140
4141 Py_DECREF(substring);
4142 if (result < 0) {
4143 PyErr_SetString(PyExc_ValueError, "substring not found");
4144 return NULL;
4145 }
4146 return PyInt_FromLong(result);
4147}
4148
4149static char rjust__doc__[] =
4150"S.rjust(width) -> unicode\n\
4151\n\
4152Return S right justified in a Unicode string of length width. Padding is\n\
4153done using spaces.";
4154
4155static PyObject *
4156unicode_rjust(PyUnicodeObject *self, PyObject *args)
4157{
4158 int width;
4159 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4160 return NULL;
4161
4162 if (self->length >= width) {
4163 Py_INCREF(self);
4164 return (PyObject*) self;
4165 }
4166
4167 return (PyObject*) pad(self, width - self->length, 0, ' ');
4168}
4169
4170static char rstrip__doc__[] =
4171"S.rstrip() -> unicode\n\
4172\n\
4173Return a copy of the string S with trailing whitespace removed.";
4174
4175static PyObject *
4176unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4177{
4178 if (!PyArg_NoArgs(args))
4179 return NULL;
4180 return strip(self, 0, 1);
4181}
4182
4183static PyObject*
4184unicode_slice(PyUnicodeObject *self, int start, int end)
4185{
4186 /* standard clamping */
4187 if (start < 0)
4188 start = 0;
4189 if (end < 0)
4190 end = 0;
4191 if (end > self->length)
4192 end = self->length;
4193 if (start == 0 && end == self->length) {
4194 /* full slice, return original string */
4195 Py_INCREF(self);
4196 return (PyObject*) self;
4197 }
4198 if (start > end)
4199 start = end;
4200 /* copy slice */
4201 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4202 end - start);
4203}
4204
4205PyObject *PyUnicode_Split(PyObject *s,
4206 PyObject *sep,
4207 int maxsplit)
4208{
4209 PyObject *result;
4210
4211 s = PyUnicode_FromObject(s);
4212 if (s == NULL)
4213 return NULL;
4214 if (sep != NULL) {
4215 sep = PyUnicode_FromObject(sep);
4216 if (sep == NULL) {
4217 Py_DECREF(s);
4218 return NULL;
4219 }
4220 }
4221
4222 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4223
4224 Py_DECREF(s);
4225 Py_XDECREF(sep);
4226 return result;
4227}
4228
4229static char split__doc__[] =
4230"S.split([sep [,maxsplit]]) -> list of strings\n\
4231\n\
4232Return a list of the words in S, using sep as the\n\
4233delimiter string. If maxsplit is given, at most maxsplit\n\
4234splits are done. If sep is not specified, any whitespace string\n\
4235is a separator.";
4236
4237static PyObject*
4238unicode_split(PyUnicodeObject *self, PyObject *args)
4239{
4240 PyObject *substring = Py_None;
4241 int maxcount = -1;
4242
4243 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4244 return NULL;
4245
4246 if (substring == Py_None)
4247 return split(self, NULL, maxcount);
4248 else if (PyUnicode_Check(substring))
4249 return split(self, (PyUnicodeObject *)substring, maxcount);
4250 else
4251 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4252}
4253
4254static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004255"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256\n\
4257Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004258Line breaks are not included in the resulting list unless keepends\n\
4259is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260
4261static PyObject*
4262unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4263{
Guido van Rossum86662912000-04-11 15:38:46 +00004264 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265
Guido van Rossum86662912000-04-11 15:38:46 +00004266 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 return NULL;
4268
Guido van Rossum86662912000-04-11 15:38:46 +00004269 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004270}
4271
4272static
4273PyObject *unicode_str(PyUnicodeObject *self)
4274{
Fred Drakee4315f52000-05-09 19:53:39 +00004275 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276}
4277
4278static char strip__doc__[] =
4279"S.strip() -> unicode\n\
4280\n\
4281Return a copy of S with leading and trailing whitespace removed.";
4282
4283static PyObject *
4284unicode_strip(PyUnicodeObject *self, PyObject *args)
4285{
4286 if (!PyArg_NoArgs(args))
4287 return NULL;
4288 return strip(self, 1, 1);
4289}
4290
4291static char swapcase__doc__[] =
4292"S.swapcase() -> unicode\n\
4293\n\
4294Return a copy of S with uppercase characters converted to lowercase\n\
4295and vice versa.";
4296
4297static PyObject*
4298unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4299{
4300 if (!PyArg_NoArgs(args))
4301 return NULL;
4302 return fixup(self, fixswapcase);
4303}
4304
4305static char translate__doc__[] =
4306"S.translate(table) -> unicode\n\
4307\n\
4308Return a copy of the string S, where all characters have been mapped\n\
4309through the given translation table, which must be a mapping of\n\
4310Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4311are left untouched. Characters mapped to None are deleted.";
4312
4313static PyObject*
4314unicode_translate(PyUnicodeObject *self, PyObject *args)
4315{
4316 PyObject *table;
4317
4318 if (!PyArg_ParseTuple(args, "O:translate", &table))
4319 return NULL;
4320 return PyUnicode_TranslateCharmap(self->str,
4321 self->length,
4322 table,
4323 "ignore");
4324}
4325
4326static char upper__doc__[] =
4327"S.upper() -> unicode\n\
4328\n\
4329Return a copy of S converted to uppercase.";
4330
4331static PyObject*
4332unicode_upper(PyUnicodeObject *self, PyObject *args)
4333{
4334 if (!PyArg_NoArgs(args))
4335 return NULL;
4336 return fixup(self, fixupper);
4337}
4338
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   If self is already at least width characters long, a new reference
   to self is returned unchanged.  Otherwise the string is left-padded
   with '0' via pad(); if the first original character is a sign, it is
   moved in front of the padding.  Returns NULL on argument errors or
   when pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* u is dereferenced right below, so a failed pad() must stop here */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4374
#if 0
/* Debugging helper (compiled out): takes no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4384
4385static char startswith__doc__[] =
4386"S.startswith(prefix[, start[, end]]) -> int\n\
4387\n\
4388Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4389optional start, test S beginning at that position. With optional end, stop\n\
4390comparing S at that position.";
4391
4392static PyObject *
4393unicode_startswith(PyUnicodeObject *self,
4394 PyObject *args)
4395{
4396 PyUnicodeObject *substring;
4397 int start = 0;
4398 int end = INT_MAX;
4399 PyObject *result;
4400
Guido van Rossumb8872e62000-05-09 14:14:27 +00004401 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4402 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 return NULL;
4404 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4405 (PyObject *)substring);
4406 if (substring == NULL)
4407 return NULL;
4408
4409 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4410
4411 Py_DECREF(substring);
4412 return result;
4413}
4414
4415
4416static char endswith__doc__[] =
4417"S.endswith(suffix[, start[, end]]) -> int\n\
4418\n\
4419Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4420optional start, test S beginning at that position. With optional end, stop\n\
4421comparing S at that position.";
4422
4423static PyObject *
4424unicode_endswith(PyUnicodeObject *self,
4425 PyObject *args)
4426{
4427 PyUnicodeObject *substring;
4428 int start = 0;
4429 int end = INT_MAX;
4430 PyObject *result;
4431
Guido van Rossumb8872e62000-05-09 14:14:27 +00004432 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4433 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 return NULL;
4435 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4436 (PyObject *)substring);
4437 if (substring == NULL)
4438 return NULL;
4439
4440 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4441
4442 Py_DECREF(substring);
4443 return result;
4444}
4445
4446
/* Method table for unicode objects, consulted by unicode_getattr().

   Each entry is {name, C function, flags, docstring}.  In the functions
   visible in this file, entries flagged 1 parse their arguments with
   PyArg_ParseTuple() and entries flagged 0 use PyArg_NoArgs();
   presumably the flag selects the matching calling convention
   (TODO confirm against the method-object headers).  The table is
   terminated by a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
    /* Disabled entries: unicode_zfill is itself compiled out in this
       file. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4499
4500static PyObject *
4501unicode_getattr(PyUnicodeObject *self, char *name)
4502{
4503 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4504}
4505
/* Sequence protocol slots for unicode objects (referenced from
   PyUnicode_Type.tp_as_sequence).  The two assignment slots are 0, so
   no item or slice assignment handler is installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
4516
4517static int
4518unicode_buffer_getreadbuf(PyUnicodeObject *self,
4519 int index,
4520 const void **ptr)
4521{
4522 if (index != 0) {
4523 PyErr_SetString(PyExc_SystemError,
4524 "accessing non-existent unicode segment");
4525 return -1;
4526 }
4527 *ptr = (void *) self->str;
4528 return PyUnicode_GET_DATA_SIZE(self);
4529}
4530
4531static int
4532unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4533 const void **ptr)
4534{
4535 PyErr_SetString(PyExc_TypeError,
4536 "cannot use unicode as modifyable buffer");
4537 return -1;
4538}
4539
4540static int
4541unicode_buffer_getsegcount(PyUnicodeObject *self,
4542 int *lenp)
4543{
4544 if (lenp)
4545 *lenp = PyUnicode_GET_DATA_SIZE(self);
4546 return 1;
4547}
4548
4549static int
4550unicode_buffer_getcharbuf(PyUnicodeObject *self,
4551 int index,
4552 const void **ptr)
4553{
4554 PyObject *str;
4555
4556 if (index != 0) {
4557 PyErr_SetString(PyExc_SystemError,
4558 "accessing non-existent unicode segment");
4559 return -1;
4560 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004561 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 if (str == NULL)
4563 return -1;
4564 *ptr = (void *) PyString_AS_STRING(str);
4565 return PyString_GET_SIZE(str);
4566}
4567
4568/* Helpers for PyUnicode_Format() */
4569
4570static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004571getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572{
4573 int argidx = *p_argidx;
4574 if (argidx < arglen) {
4575 (*p_argidx)++;
4576 if (arglen < 0)
4577 return args;
4578 else
4579 return PyTuple_GetItem(args, argidx);
4580 }
4581 PyErr_SetString(PyExc_TypeError,
4582 "not enough arguments for format string");
4583 return NULL;
4584}
4585
/* Bit flags collected while parsing a format specifier in
   PyUnicode_Format(); each one is set by a single flag character. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4591
4592static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594{
4595 register int i;
4596 int len;
4597 va_list va;
4598 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600
4601 /* First, format the string as char array, then expand to Py_UNICODE
4602 array. */
4603 charbuffer = (char *)buffer;
4604 len = vsprintf(charbuffer, format, va);
4605 for (i = len - 1; i >= 0; i--)
4606 buffer[i] = (Py_UNICODE) charbuffer[i];
4607
4608 va_end(va);
4609 return len;
4610}
4611
4612static int
4613formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004614 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615 int flags,
4616 int prec,
4617 int type,
4618 PyObject *v)
4619{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004620 /* fmt = '%#.' + `prec` + `type`
4621 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 char fmt[20];
4623 double x;
4624
4625 x = PyFloat_AsDouble(v);
4626 if (x == -1.0 && PyErr_Occurred())
4627 return -1;
4628 if (prec < 0)
4629 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4631 type = 'g';
4632 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004633 /* worst case length calc to ensure no buffer overrun:
4634 fmt = %#.<prec>g
4635 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4636 for any double rep.)
4637 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4638 If prec=0 the effective precision is 1 (the leading digit is
4639 always given), therefore increase by one to 10+prec. */
4640 if (buflen <= (size_t)10 + (size_t)prec) {
4641 PyErr_SetString(PyExc_OverflowError,
4642 "formatted float is too long (precision too long?)");
4643 return -1;
4644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 return usprintf(buf, fmt, x);
4646}
4647
Tim Peters38fd5b62000-09-21 05:43:11 +00004648static PyObject*
4649formatlong(PyObject *val, int flags, int prec, int type)
4650{
4651 char *buf;
4652 int i, len;
4653 PyObject *str; /* temporary string object. */
4654 PyUnicodeObject *result;
4655
4656 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4657 if (!str)
4658 return NULL;
4659 result = _PyUnicode_New(len);
4660 for (i = 0; i < len; i++)
4661 result->str[i] = buf[i];
4662 result->str[len] = 0;
4663 Py_DECREF(str);
4664 return (PyObject*)result;
4665}
4666
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667static int
4668formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004669 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670 int flags,
4671 int prec,
4672 int type,
4673 PyObject *v)
4674{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004675 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004676 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4677 + 1 + 1 = 24*/
4678 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 long x;
4680
4681 x = PyInt_AsLong(v);
4682 if (x == -1 && PyErr_Occurred())
4683 return -1;
4684 if (prec < 0)
4685 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004686 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4687 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4688 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4689 PyErr_SetString(PyExc_OverflowError,
4690 "formatted integer is too long (precision too long?)");
4691 return -1;
4692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4694 return usprintf(buf, fmt, x);
4695}
4696
4697static int
4698formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004699 size_t buflen,
4700 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004702 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004703 if (PyUnicode_Check(v)) {
4704 if (PyUnicode_GET_SIZE(v) != 1)
4705 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004709 else if (PyString_Check(v)) {
4710 if (PyString_GET_SIZE(v) != 1)
4711 goto onError;
4712 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714
4715 else {
4716 /* Integer input truncated to a character */
4717 long x;
4718 x = PyInt_AsLong(v);
4719 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004720 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 buf[0] = (char) x;
4722 }
4723 buf[1] = '\0';
4724 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004725
4726 onError:
4727 PyErr_SetString(PyExc_TypeError,
4728 "%c requires int or char");
4729 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730}
4731
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand in any expression (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
4741
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742PyObject *PyUnicode_Format(PyObject *format,
4743 PyObject *args)
4744{
4745 Py_UNICODE *fmt, *res;
4746 int fmtcnt, rescnt, reslen, arglen, argidx;
4747 int args_owned = 0;
4748 PyUnicodeObject *result = NULL;
4749 PyObject *dict = NULL;
4750 PyObject *uformat;
4751
4752 if (format == NULL || args == NULL) {
4753 PyErr_BadInternalCall();
4754 return NULL;
4755 }
4756 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004757 if (uformat == NULL)
4758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 fmt = PyUnicode_AS_UNICODE(uformat);
4760 fmtcnt = PyUnicode_GET_SIZE(uformat);
4761
4762 reslen = rescnt = fmtcnt + 100;
4763 result = _PyUnicode_New(reslen);
4764 if (result == NULL)
4765 goto onError;
4766 res = PyUnicode_AS_UNICODE(result);
4767
4768 if (PyTuple_Check(args)) {
4769 arglen = PyTuple_Size(args);
4770 argidx = 0;
4771 }
4772 else {
4773 arglen = -1;
4774 argidx = -2;
4775 }
4776 if (args->ob_type->tp_as_mapping)
4777 dict = args;
4778
4779 while (--fmtcnt >= 0) {
4780 if (*fmt != '%') {
4781 if (--rescnt < 0) {
4782 rescnt = fmtcnt + 100;
4783 reslen += rescnt;
4784 if (_PyUnicode_Resize(result, reslen) < 0)
4785 return NULL;
4786 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4787 --rescnt;
4788 }
4789 *res++ = *fmt++;
4790 }
4791 else {
4792 /* Got a format specifier */
4793 int flags = 0;
4794 int width = -1;
4795 int prec = -1;
4796 int size = 0;
4797 Py_UNICODE c = '\0';
4798 Py_UNICODE fill;
4799 PyObject *v = NULL;
4800 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004801 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 Py_UNICODE sign;
4803 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004804 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805
4806 fmt++;
4807 if (*fmt == '(') {
4808 Py_UNICODE *keystart;
4809 int keylen;
4810 PyObject *key;
4811 int pcount = 1;
4812
4813 if (dict == NULL) {
4814 PyErr_SetString(PyExc_TypeError,
4815 "format requires a mapping");
4816 goto onError;
4817 }
4818 ++fmt;
4819 --fmtcnt;
4820 keystart = fmt;
4821 /* Skip over balanced parentheses */
4822 while (pcount > 0 && --fmtcnt >= 0) {
4823 if (*fmt == ')')
4824 --pcount;
4825 else if (*fmt == '(')
4826 ++pcount;
4827 fmt++;
4828 }
4829 keylen = fmt - keystart - 1;
4830 if (fmtcnt < 0 || pcount > 0) {
4831 PyErr_SetString(PyExc_ValueError,
4832 "incomplete format key");
4833 goto onError;
4834 }
Fred Drakee4315f52000-05-09 19:53:39 +00004835 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 then looked up since Python uses strings to hold
4837 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004838 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 key = PyUnicode_EncodeUTF8(keystart,
4840 keylen,
4841 NULL);
4842 if (key == NULL)
4843 goto onError;
4844 if (args_owned) {
4845 Py_DECREF(args);
4846 args_owned = 0;
4847 }
4848 args = PyObject_GetItem(dict, key);
4849 Py_DECREF(key);
4850 if (args == NULL) {
4851 goto onError;
4852 }
4853 args_owned = 1;
4854 arglen = -1;
4855 argidx = -2;
4856 }
4857 while (--fmtcnt >= 0) {
4858 switch (c = *fmt++) {
4859 case '-': flags |= F_LJUST; continue;
4860 case '+': flags |= F_SIGN; continue;
4861 case ' ': flags |= F_BLANK; continue;
4862 case '#': flags |= F_ALT; continue;
4863 case '0': flags |= F_ZERO; continue;
4864 }
4865 break;
4866 }
4867 if (c == '*') {
4868 v = getnextarg(args, arglen, &argidx);
4869 if (v == NULL)
4870 goto onError;
4871 if (!PyInt_Check(v)) {
4872 PyErr_SetString(PyExc_TypeError,
4873 "* wants int");
4874 goto onError;
4875 }
4876 width = PyInt_AsLong(v);
4877 if (width < 0) {
4878 flags |= F_LJUST;
4879 width = -width;
4880 }
4881 if (--fmtcnt >= 0)
4882 c = *fmt++;
4883 }
4884 else if (c >= '0' && c <= '9') {
4885 width = c - '0';
4886 while (--fmtcnt >= 0) {
4887 c = *fmt++;
4888 if (c < '0' || c > '9')
4889 break;
4890 if ((width*10) / 10 != width) {
4891 PyErr_SetString(PyExc_ValueError,
4892 "width too big");
4893 goto onError;
4894 }
4895 width = width*10 + (c - '0');
4896 }
4897 }
4898 if (c == '.') {
4899 prec = 0;
4900 if (--fmtcnt >= 0)
4901 c = *fmt++;
4902 if (c == '*') {
4903 v = getnextarg(args, arglen, &argidx);
4904 if (v == NULL)
4905 goto onError;
4906 if (!PyInt_Check(v)) {
4907 PyErr_SetString(PyExc_TypeError,
4908 "* wants int");
4909 goto onError;
4910 }
4911 prec = PyInt_AsLong(v);
4912 if (prec < 0)
4913 prec = 0;
4914 if (--fmtcnt >= 0)
4915 c = *fmt++;
4916 }
4917 else if (c >= '0' && c <= '9') {
4918 prec = c - '0';
4919 while (--fmtcnt >= 0) {
4920 c = Py_CHARMASK(*fmt++);
4921 if (c < '0' || c > '9')
4922 break;
4923 if ((prec*10) / 10 != prec) {
4924 PyErr_SetString(PyExc_ValueError,
4925 "prec too big");
4926 goto onError;
4927 }
4928 prec = prec*10 + (c - '0');
4929 }
4930 }
4931 } /* prec */
4932 if (fmtcnt >= 0) {
4933 if (c == 'h' || c == 'l' || c == 'L') {
4934 size = c;
4935 if (--fmtcnt >= 0)
4936 c = *fmt++;
4937 }
4938 }
4939 if (fmtcnt < 0) {
4940 PyErr_SetString(PyExc_ValueError,
4941 "incomplete format");
4942 goto onError;
4943 }
4944 if (c != '%') {
4945 v = getnextarg(args, arglen, &argidx);
4946 if (v == NULL)
4947 goto onError;
4948 }
4949 sign = 0;
4950 fill = ' ';
4951 switch (c) {
4952
4953 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004954 pbuf = formatbuf;
4955 /* presume that buffer length is at least 1 */
4956 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957 len = 1;
4958 break;
4959
4960 case 's':
4961 case 'r':
4962 if (PyUnicode_Check(v) && c == 's') {
4963 temp = v;
4964 Py_INCREF(temp);
4965 }
4966 else {
4967 PyObject *unicode;
4968 if (c == 's')
4969 temp = PyObject_Str(v);
4970 else
4971 temp = PyObject_Repr(v);
4972 if (temp == NULL)
4973 goto onError;
4974 if (!PyString_Check(temp)) {
4975 /* XXX Note: this should never happen, since
4976 PyObject_Repr() and PyObject_Str() assure
4977 this */
4978 Py_DECREF(temp);
4979 PyErr_SetString(PyExc_TypeError,
4980 "%s argument has non-string str()");
4981 goto onError;
4982 }
Fred Drakee4315f52000-05-09 19:53:39 +00004983 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004985 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 "strict");
4987 Py_DECREF(temp);
4988 temp = unicode;
4989 if (temp == NULL)
4990 goto onError;
4991 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004992 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 len = PyUnicode_GET_SIZE(temp);
4994 if (prec >= 0 && len > prec)
4995 len = prec;
4996 break;
4997
4998 case 'i':
4999 case 'd':
5000 case 'u':
5001 case 'o':
5002 case 'x':
5003 case 'X':
5004 if (c == 'i')
5005 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005006 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005007 temp = formatlong(v, flags, prec, c);
5008 if (!temp)
5009 goto onError;
5010 pbuf = PyUnicode_AS_UNICODE(temp);
5011 len = PyUnicode_GET_SIZE(temp);
5012 /* unbounded ints can always produce
5013 a sign character! */
5014 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005016 else {
5017 pbuf = formatbuf;
5018 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5019 flags, prec, c, v);
5020 if (len < 0)
5021 goto onError;
5022 /* only d conversion is signed */
5023 sign = c == 'd';
5024 }
5025 if (flags & F_ZERO)
5026 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 break;
5028
5029 case 'e':
5030 case 'E':
5031 case 'f':
5032 case 'g':
5033 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005034 pbuf = formatbuf;
5035 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5036 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 if (len < 0)
5038 goto onError;
5039 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005040 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 fill = '0';
5042 break;
5043
5044 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005045 pbuf = formatbuf;
5046 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 if (len < 0)
5048 goto onError;
5049 break;
5050
5051 default:
5052 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005053 "unsupported format character '%c' (0x%x) "
5054 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005055 (31<=c && c<=126) ? c : '?',
5056 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 goto onError;
5058 }
5059 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005060 if (*pbuf == '-' || *pbuf == '+') {
5061 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 len--;
5063 }
5064 else if (flags & F_SIGN)
5065 sign = '+';
5066 else if (flags & F_BLANK)
5067 sign = ' ';
5068 else
5069 sign = 0;
5070 }
5071 if (width < len)
5072 width = len;
5073 if (rescnt < width + (sign != 0)) {
5074 reslen -= rescnt;
5075 rescnt = width + fmtcnt + 100;
5076 reslen += rescnt;
5077 if (_PyUnicode_Resize(result, reslen) < 0)
5078 return NULL;
5079 res = PyUnicode_AS_UNICODE(result)
5080 + reslen - rescnt;
5081 }
5082 if (sign) {
5083 if (fill != ' ')
5084 *res++ = sign;
5085 rescnt--;
5086 if (width > len)
5087 width--;
5088 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005089 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5090 assert(pbuf[0] == '0');
5091 assert(pbuf[1] == c);
5092 if (fill != ' ') {
5093 *res++ = *pbuf++;
5094 *res++ = *pbuf++;
5095 }
5096 rescnt -= 2;
5097 width -= 2;
5098 if (width < 0)
5099 width = 0;
5100 len -= 2;
5101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 if (width > len && !(flags & F_LJUST)) {
5103 do {
5104 --rescnt;
5105 *res++ = fill;
5106 } while (--width > len);
5107 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005108 if (fill == ' ') {
5109 if (sign)
5110 *res++ = sign;
5111 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5112 assert(pbuf[0] == '0');
5113 assert(pbuf[1] == c);
5114 *res++ = *pbuf++;
5115 *res++ = *pbuf++;
5116 }
5117 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005118 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 res += len;
5120 rescnt -= len;
5121 while (--width >= len) {
5122 --rescnt;
5123 *res++ = ' ';
5124 }
5125 if (dict && (argidx < arglen) && c != '%') {
5126 PyErr_SetString(PyExc_TypeError,
5127 "not all arguments converted");
5128 goto onError;
5129 }
5130 Py_XDECREF(temp);
5131 } /* '%' */
5132 } /* until end */
5133 if (argidx < arglen && !dict) {
5134 PyErr_SetString(PyExc_TypeError,
5135 "not all arguments converted");
5136 goto onError;
5137 }
5138
5139 if (args_owned) {
5140 Py_DECREF(args);
5141 }
5142 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005143 if (_PyUnicode_Resize(result, reslen - rescnt))
5144 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 return (PyObject *)result;
5146
5147 onError:
5148 Py_XDECREF(result);
5149 Py_DECREF(uformat);
5150 if (args_owned) {
5151 Py_DECREF(args);
5152 }
5153 return NULL;
5154}
5155
/* Buffer protocol slots for unicode objects (referenced from
   PyUnicode_Type.tp_as_buffer).  The read and char buffers expose a
   single segment; the write-buffer handler always fails with
   TypeError. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5162
/* Type object for unicode strings.  Slots set to 0 or NULL are not
   provided; method lookup goes through unicode_getattr (tp_getattr),
   and the sequence and buffer protocols are supplied by
   unicode_as_sequence and unicode_as_buffer. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
5187
5188/* Initialize the Unicode implementation */
5189
Thomas Wouters78890102000-07-22 19:25:51 +00005190void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191{
5192 /* Doublecheck the configuration... */
5193 if (sizeof(Py_UNICODE) != 2)
5194 Py_FatalError("Unicode configuration error: "
5195 "sizeof(Py_UNICODE) != 2 bytes");
5196
Fred Drakee4315f52000-05-09 19:53:39 +00005197 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005198 unicode_freelist = NULL;
5199 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005201 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202}
5203
5204/* Finalize the Unicode implementation */
5205
5206void
Thomas Wouters78890102000-07-22 19:25:51 +00005207_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005209 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005211 Py_XDECREF(unicode_empty);
5212 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005213
5214 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 PyUnicodeObject *v = u;
5216 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005217 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005218 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005219 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005220 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005222 unicode_freelist = NULL;
5223 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224}