blob: a3678d52e8c52e69cb55ba6cebf5c4baf1952b95 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.


 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000070#ifdef MS_WIN32
71#include <windows.h>
72#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073
/* Upper bound on the number of Unicode objects parked on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   An object put on the free list keeps its character buffer allocated
   as long as its length is below this limit; buffers of longer
   objects are released right away.  This saves malloc() calls for
   small Unicode objects.

   Worst case, MAX_UNICODE_FREELIST_SIZE * (sizeof(PyUnicodeObject) +
   KEEPALIVE_SIZE_LIMIT + malloc()-overhead) bytes sit around unused.

   A limit of 0 effectively disables the feature.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
104
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105/* --- Globals ------------------------------------------------------------
106
107 The globals are initialized by the _PyUnicode_Init() API and should
108 not be used before calling that API.
109
110*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
112/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000113static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000116static PyUnicodeObject *unicode_freelist;
117static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125*/
126
127static char unicode_default_encoding[100];
128
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129/* --- Unicode Object ----------------------------------------------------- */
130
131static
132int _PyUnicode_Resize(register PyUnicodeObject *unicode,
133 int length)
134{
135 void *oldstr;
136
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000137 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000138 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000139 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000140
141 /* Resizing unicode_empty is not allowed. */
142 if (unicode == unicode_empty) {
143 PyErr_SetString(PyExc_SystemError,
144 "can't resize empty unicode object");
145 return -1;
146 }
147
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
153 unicode->str = oldstr;
154 PyErr_NoMemory();
155 return -1;
156 }
157 unicode->str[length] = 0;
158 unicode->length = length;
159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
167
168 return 0;
169}
170
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000171int PyUnicode_Resize(PyObject **unicode,
172 int length)
173{
174 PyUnicodeObject *v;
175
176 if (unicode == NULL) {
177 PyErr_BadInternalCall();
178 return -1;
179 }
180 v = (PyUnicodeObject *)*unicode;
181 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
182 PyErr_BadInternalCall();
183 return -1;
184 }
185 return _PyUnicode_Resize(v, length);
186}
187
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188/* We allocate one more byte to make sure the string is
189 Ux0000 terminated -- XXX is this needed ?
190
191 XXX This allocator could further be enhanced by assuring that the
192 free list never reduces its size below 1.
193
194*/
195
196static
197PyUnicodeObject *_PyUnicode_New(int length)
198{
199 register PyUnicodeObject *unicode;
200
201 /* Optimization for empty strings */
202 if (length == 0 && unicode_empty != NULL) {
203 Py_INCREF(unicode_empty);
204 return unicode_empty;
205 }
206
207 /* Unicode freelist & memory allocation */
208 if (unicode_freelist) {
209 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000210 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 /* Keep-Alive optimization: we only upsize the buffer,
214 never downsize it. */
215 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000217 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 }
220 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000221 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000223 }
224 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 else {
227 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
228 if (unicode == NULL)
229 return NULL;
230 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
231 }
232
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000233 if (!unicode->str) {
234 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000235 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str[length] = 0;
238 unicode->length = length;
239 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000240 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242
243 onError:
244 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247}
248
249static
250void _PyUnicode_Free(register PyUnicodeObject *unicode)
251{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization */
254 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000255 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 unicode->str = NULL;
257 unicode->length = 0;
258 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000259 if (unicode->defenc) {
260 Py_DECREF(unicode->defenc);
261 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 }
263 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 *(PyUnicodeObject **)unicode = unicode_freelist;
265 unicode_freelist = unicode;
266 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000269 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000270 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000271 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273}
274
275PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
276 int size)
277{
278 PyUnicodeObject *unicode;
279
280 unicode = _PyUnicode_New(size);
281 if (!unicode)
282 return NULL;
283
284 /* Copy the Unicode data into the new object */
285 if (u != NULL)
286 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
287
288 return (PyObject *)unicode;
289}
290
291#ifdef HAVE_WCHAR_H
292
293PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
298 if (w == NULL) {
299 PyErr_BadInternalCall();
300 return NULL;
301 }
302
303 unicode = _PyUnicode_New(size);
304 if (!unicode)
305 return NULL;
306
307 /* Copy the wchar_t data into the new object */
308#ifdef HAVE_USABLE_WCHAR_T
309 memcpy(unicode->str, w, size * sizeof(wchar_t));
310#else
311 {
312 register Py_UNICODE *u;
313 register int i;
314 u = PyUnicode_AS_UNICODE(unicode);
315 for (i = size; i >= 0; i--)
316 *u++ = *w++;
317 }
318#endif
319
320 return (PyObject *)unicode;
321}
322
323int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
324 register wchar_t *w,
325 int size)
326{
327 if (unicode == NULL) {
328 PyErr_BadInternalCall();
329 return -1;
330 }
331 if (size > PyUnicode_GET_SIZE(unicode))
332 size = PyUnicode_GET_SIZE(unicode);
333#ifdef HAVE_USABLE_WCHAR_T
334 memcpy(w, unicode->str, size * sizeof(wchar_t));
335#else
336 {
337 register Py_UNICODE *u;
338 register int i;
339 u = PyUnicode_AS_UNICODE(unicode);
340 for (i = size; i >= 0; i--)
341 *w++ = *u++;
342 }
343#endif
344
345 return size;
346}
347
348#endif
349
350PyObject *PyUnicode_FromObject(register PyObject *obj)
351{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000352 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
353}
354
355PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
356 const char *encoding,
357 const char *errors)
358{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 const char *s;
360 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000361 int owned = 0;
362 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363
364 if (obj == NULL) {
365 PyErr_BadInternalCall();
366 return NULL;
367 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368
369 /* Coerce object */
370 if (PyInstance_Check(obj)) {
371 PyObject *func;
372 func = PyObject_GetAttrString(obj, "__str__");
373 if (func == NULL) {
374 PyErr_SetString(PyExc_TypeError,
375 "coercing to Unicode: instance doesn't define __str__");
376 return NULL;
377 }
378 obj = PyEval_CallObject(func, NULL);
379 Py_DECREF(func);
380 if (obj == NULL)
381 return NULL;
382 owned = 1;
383 }
384 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = obj;
387 if (encoding) {
388 PyErr_SetString(PyExc_TypeError,
389 "decoding Unicode is not supported");
390 return NULL;
391 }
392 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 }
394 else if (PyString_Check(obj)) {
395 s = PyString_AS_STRING(obj);
396 len = PyString_GET_SIZE(obj);
397 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000398 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
399 /* Overwrite the error message with something more useful in
400 case of a TypeError. */
401 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000402 PyErr_Format(PyExc_TypeError,
403 "coercing to Unicode: need string or buffer, "
404 "%.80s found",
405 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000406 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000407 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000408
409 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 if (len == 0) {
411 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414 else
415 v = PyUnicode_Decode(s, len, encoding, errors);
416 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000417 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 return v;
421
422 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427}
428
429PyObject *PyUnicode_Decode(const char *s,
430 int size,
431 const char *encoding,
432 const char *errors)
433{
434 PyObject *buffer = NULL, *unicode;
435
Fred Drakee4315f52000-05-09 19:53:39 +0000436 if (encoding == NULL)
437 encoding = PyUnicode_GetDefaultEncoding();
438
439 /* Shortcuts for common default encodings */
440 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000442 else if (strcmp(encoding, "latin-1") == 0)
443 return PyUnicode_DecodeLatin1(s, size, errors);
444 else if (strcmp(encoding, "ascii") == 0)
445 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446
447 /* Decode via the codec registry */
448 buffer = PyBuffer_FromMemory((void *)s, size);
449 if (buffer == NULL)
450 goto onError;
451 unicode = PyCodec_Decode(buffer, encoding, errors);
452 if (unicode == NULL)
453 goto onError;
454 if (!PyUnicode_Check(unicode)) {
455 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000456 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457 unicode->ob_type->tp_name);
458 Py_DECREF(unicode);
459 goto onError;
460 }
461 Py_DECREF(buffer);
462 return unicode;
463
464 onError:
465 Py_XDECREF(buffer);
466 return NULL;
467}
468
469PyObject *PyUnicode_Encode(const Py_UNICODE *s,
470 int size,
471 const char *encoding,
472 const char *errors)
473{
474 PyObject *v, *unicode;
475
476 unicode = PyUnicode_FromUnicode(s, size);
477 if (unicode == NULL)
478 return NULL;
479 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
480 Py_DECREF(unicode);
481 return v;
482}
483
484PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
485 const char *encoding,
486 const char *errors)
487{
488 PyObject *v;
489
490 if (!PyUnicode_Check(unicode)) {
491 PyErr_BadArgument();
492 goto onError;
493 }
Fred Drakee4315f52000-05-09 19:53:39 +0000494
495 if (encoding == NULL)
496 encoding = PyUnicode_GetDefaultEncoding();
497
498 /* Shortcuts for common default encodings */
499 if (errors == NULL) {
500 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000502 else if (strcmp(encoding, "latin-1") == 0)
503 return PyUnicode_AsLatin1String(unicode);
504 else if (strcmp(encoding, "ascii") == 0)
505 return PyUnicode_AsASCIIString(unicode);
506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507
508 /* Encode via the codec registry */
509 v = PyCodec_Encode(unicode, encoding, errors);
510 if (v == NULL)
511 goto onError;
512 /* XXX Should we really enforce this ? */
513 if (!PyString_Check(v)) {
514 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000515 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516 v->ob_type->tp_name);
517 Py_DECREF(v);
518 goto onError;
519 }
520 return v;
521
522 onError:
523 return NULL;
524}
525
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000526/* Return a Python string holding the default encoded value of the
527 Unicode object.
528
529 The resulting string is cached in the Unicode object for subsequent
530 usage by this function. The cached version is needed to implement
531 the character buffer interface and will live (at least) as long as
532 the Unicode object itself.
533
534 The refcount of the string is *not* incremented.
535
536 *** Exported for internal use by the interpreter only !!! ***
537
538*/
539
540PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
541 const char *errors)
542{
543 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
544
545 if (v)
546 return v;
547 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
548 if (v && errors == NULL)
549 ((PyUnicodeObject *)unicode)->defenc = v;
550 return v;
551}
552
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
554{
555 if (!PyUnicode_Check(unicode)) {
556 PyErr_BadArgument();
557 goto onError;
558 }
559 return PyUnicode_AS_UNICODE(unicode);
560
561 onError:
562 return NULL;
563}
564
565int PyUnicode_GetSize(PyObject *unicode)
566{
567 if (!PyUnicode_Check(unicode)) {
568 PyErr_BadArgument();
569 goto onError;
570 }
571 return PyUnicode_GET_SIZE(unicode);
572
573 onError:
574 return -1;
575}
576
Thomas Wouters78890102000-07-22 19:25:51 +0000577const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000578{
579 return unicode_default_encoding;
580}
581
582int PyUnicode_SetDefaultEncoding(const char *encoding)
583{
584 PyObject *v;
585
586 /* Make sure the encoding is valid. As side effect, this also
587 loads the encoding into the codec registry cache. */
588 v = _PyCodec_Lookup(encoding);
589 if (v == NULL)
590 goto onError;
591 Py_DECREF(v);
592 strncpy(unicode_default_encoding,
593 encoding,
594 sizeof(unicode_default_encoding));
595 return 0;
596
597 onError:
598 return -1;
599}
600
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601/* --- UTF-8 Codec -------------------------------------------------------- */
602
/* Map a UTF-8 prefix (first) byte to the length of the sequence it
   starts; zero marks a byte that is illegal as prefix. See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 - 0xBF: not valid as prefix */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: two byte sequences */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0 - 0xEF: three byte sequences */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are not valid */
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
};
624
625static
626int utf8_decoding_error(const char **source,
627 Py_UNICODE **dest,
628 const char *errors,
629 const char *details)
630{
631 if ((errors == NULL) ||
632 (strcmp(errors,"strict") == 0)) {
633 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000634 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 details);
636 return -1;
637 }
638 else if (strcmp(errors,"ignore") == 0) {
639 (*source)++;
640 return 0;
641 }
642 else if (strcmp(errors,"replace") == 0) {
643 (*source)++;
644 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
645 (*dest)++;
646 return 0;
647 }
648 else {
649 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 errors);
652 return -1;
653 }
654}
655
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656PyObject *PyUnicode_DecodeUTF8(const char *s,
657 int size,
658 const char *errors)
659{
660 int n;
661 const char *e;
662 PyUnicodeObject *unicode;
663 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665
666 /* Note: size will always be longer than the resulting Unicode
667 character count */
668 unicode = _PyUnicode_New(size);
669 if (!unicode)
670 return NULL;
671 if (size == 0)
672 return (PyObject *)unicode;
673
674 /* Unpack UTF-8 encoded data */
675 p = unicode->str;
676 e = s + size;
677
678 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000679 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000680
681 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000682 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683 s++;
684 continue;
685 }
686
687 n = utf8_code_length[ch];
688
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000689 if (s + n > e) {
690 errmsg = "unexpected end of data";
691 goto utf8Error;
692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693
694 switch (n) {
695
696 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 errmsg = "unexpected code byte";
698 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699 break;
700
701 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 errmsg = "internal error";
703 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000704 break;
705
706 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000707 if ((s[1] & 0xc0) != 0x80) {
708 errmsg = "invalid data";
709 goto utf8Error;
710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000712 if (ch < 0x80) {
713 errmsg = "illegal encoding";
714 goto utf8Error;
715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000716 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000717 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 break;
719
720 case 3:
721 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000722 (s[2] & 0xc0) != 0x80) {
723 errmsg = "invalid data";
724 goto utf8Error;
725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000727 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
728 errmsg = "illegal encoding";
729 goto utf8Error;
730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000732 *p++ = (Py_UNICODE)ch;
733 break;
734
735 case 4:
736 if ((s[1] & 0xc0) != 0x80 ||
737 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000738 (s[3] & 0xc0) != 0x80) {
739 errmsg = "invalid data";
740 goto utf8Error;
741 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000742 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
743 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
744 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000745 if ((ch < 0x10000) || /* minimum value allowed for 4
746 byte encoding */
747 (ch > 0x10ffff)) { /* maximum value allowed for
748 UTF-16 */
749 errmsg = "illegal encoding";
750 goto utf8Error;
751 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000752 /* compute and append the two surrogates: */
753
754 /* translate from 10000..10FFFF to 0..FFFF */
755 ch -= 0x10000;
756
757 /* high surrogate = top 10 bits added to D800 */
758 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
759
760 /* low surrogate = bottom 10 bits added to DC00 */
761 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 break;
763
764 default:
765 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000766 errmsg = "unsupported Unicode code range";
767 goto utf8Error;
768 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000769 }
770 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000771 continue;
772
773 utf8Error:
774 if (utf8_decoding_error(&s, &p, errors, errmsg))
775 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 }
777
778 /* Adjust length */
779 if (_PyUnicode_Resize(unicode, p - unicode->str))
780 goto onError;
781
782 return (PyObject *)unicode;
783
784onError:
785 Py_DECREF(unicode);
786 return NULL;
787}
788
/* Not used anymore, now that the encoder supports UTF-16
   surrogates; kept disabled for reference. */
#if 0
/* Apply the error handling mode `errors` to a UTF-8 encoding problem
   described by `details`.  Returns -1 with an exception set for strict
   and unknown modes; returns 0 for "ignore" and for "replace", which
   appends a '?' to the output. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    int strict = (errors == NULL) || (strcmp(errors,"strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors,"ignore") == 0)
        return 0;

    if (strcmp(errors,"replace") == 0) {
        *(*dest) = '?';
        *dest += 1;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822
823PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
824 int size,
825 const char *errors)
826{
827 PyObject *v;
828 char *p;
829 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000830 Py_UCS4 ch2;
831 unsigned int cbAllocated = 3 * size;
832 unsigned int cbWritten = 0;
833 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000835 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 if (v == NULL)
837 return NULL;
838 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000839 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
841 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000842 while (i < size) {
843 Py_UCS4 ch = s[i++];
844 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000846 cbWritten++;
847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 else if (ch < 0x0800) {
849 *p++ = 0xc0 | (ch >> 6);
850 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000851 cbWritten += 2;
852 }
853 else {
854 /* Check for high surrogate */
855 if (0xD800 <= ch && ch <= 0xDBFF) {
856 if (i != size) {
857 ch2 = s[i];
858 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
859
860 if (cbWritten >= (cbAllocated - 4)) {
861 /* Provide enough room for some more
862 surrogates */
863 cbAllocated += 4*10;
864 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000865 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000866 }
867
868 /* combine the two values */
869 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
870
871 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000872 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000873 i++;
874 cbWritten += 4;
875 }
876 }
877 }
878 else {
879 *p++ = (char)(0xe0 | (ch >> 12));
880 cbWritten += 3;
881 }
882 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
883 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 }
885 }
886 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000887 if (_PyString_Resize(&v, p - q))
888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 return v;
890
891 onError:
892 Py_DECREF(v);
893 return NULL;
894}
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
897{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 if (!PyUnicode_Check(unicode)) {
899 PyErr_BadArgument();
900 return NULL;
901 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000902 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
903 PyUnicode_GET_SIZE(unicode),
904 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000905}
906
907/* --- UTF-16 Codec ------------------------------------------------------- */
908
909static
910int utf16_decoding_error(const Py_UNICODE **source,
911 Py_UNICODE **dest,
912 const char *errors,
913 const char *details)
914{
915 if ((errors == NULL) ||
916 (strcmp(errors,"strict") == 0)) {
917 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000918 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000919 details);
920 return -1;
921 }
922 else if (strcmp(errors,"ignore") == 0) {
923 return 0;
924 }
925 else if (strcmp(errors,"replace") == 0) {
926 if (dest) {
927 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
928 (*dest)++;
929 }
930 return 0;
931 }
932 else {
933 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000934 "UTF-16 decoding error; "
935 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 errors);
937 return -1;
938 }
939}
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941PyObject *PyUnicode_DecodeUTF16(const char *s,
942 int size,
943 const char *errors,
944 int *byteorder)
945{
946 PyUnicodeObject *unicode;
947 Py_UNICODE *p;
948 const Py_UNICODE *q, *e;
949 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000950 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951
952 /* size should be an even number */
953 if (size % sizeof(Py_UNICODE) != 0) {
954 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
955 return NULL;
956 /* The remaining input chars are ignored if we fall through
957 here... */
958 }
959
960 /* Note: size will always be longer than the resulting Unicode
961 character count */
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
967
968 /* Unpack UTF-16 encoded data */
969 p = unicode->str;
970 q = (Py_UNICODE *)s;
971 e = q + (size / sizeof(Py_UNICODE));
972
973 if (byteorder)
974 bo = *byteorder;
975
976 while (q < e) {
977 register Py_UNICODE ch = *q++;
978
979 /* Check for BOM marks (U+FEFF) in the input and adjust
980 current byte order setting accordingly. Swap input
981 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
982 !) */
983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
984 if (ch == 0xFEFF) {
985 bo = -1;
986 continue;
987 } else if (ch == 0xFFFE) {
988 bo = 1;
989 continue;
990 }
991 if (bo == 1)
992 ch = (ch >> 8) | (ch << 8);
993#else
994 if (ch == 0xFEFF) {
995 bo = 1;
996 continue;
997 } else if (ch == 0xFFFE) {
998 bo = -1;
999 continue;
1000 }
1001 if (bo == -1)
1002 ch = (ch >> 8) | (ch << 8);
1003#endif
1004 if (ch < 0xD800 || ch > 0xDFFF) {
1005 *p++ = ch;
1006 continue;
1007 }
1008
1009 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001010 if (q >= e) {
1011 errmsg = "unexpected end of data";
1012 goto utf16Error;
1013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 if (0xDC00 <= *q && *q <= 0xDFFF) {
1015 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001016 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017 /* This is valid data (a UTF-16 surrogate pair), but
1018 we are not able to store this information since our
1019 Py_UNICODE type only has 16 bits... this might
1020 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001021 errmsg = "code pairs are not supported";
1022 goto utf16Error;
1023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 else
1025 continue;
1026 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001027 errmsg = "illegal encoding";
1028 /* Fall through to report the error */
1029
1030 utf16Error:
1031 if (utf16_decoding_error(&q, &p, errors, errmsg))
1032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 }
1034
1035 if (byteorder)
1036 *byteorder = bo;
1037
1038 /* Adjust length */
1039 if (_PyUnicode_Resize(unicode, p - unicode->str))
1040 goto onError;
1041
1042 return (PyObject *)unicode;
1043
1044onError:
1045 Py_DECREF(unicode);
1046 return NULL;
1047}
1048
1049#undef UTF16_ERROR
1050
1051PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1052 int size,
1053 const char *errors,
1054 int byteorder)
1055{
1056 PyObject *v;
1057 Py_UNICODE *p;
1058 char *q;
1059
1060 /* We don't create UTF-16 pairs... */
1061 v = PyString_FromStringAndSize(NULL,
1062 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1063 if (v == NULL)
1064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065
1066 q = PyString_AS_STRING(v);
1067 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 if (byteorder == 0)
1069 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001070 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001071 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (byteorder == 0 ||
1073#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1074 byteorder == -1
1075#else
1076 byteorder == 1
1077#endif
1078 )
1079 memcpy(p, s, size * sizeof(Py_UNICODE));
1080 else
1081 while (size-- > 0) {
1082 Py_UNICODE ch = *s++;
1083 *p++ = (ch >> 8) | (ch << 8);
1084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 return v;
1086}
1087
1088PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1089{
1090 if (!PyUnicode_Check(unicode)) {
1091 PyErr_BadArgument();
1092 return NULL;
1093 }
1094 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1095 PyUnicode_GET_SIZE(unicode),
1096 NULL,
1097 0);
1098}
1099
1100/* --- Unicode Escape Codec ----------------------------------------------- */
1101
1102static
1103int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001104 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 const char *errors,
1106 const char *details)
1107{
1108 if ((errors == NULL) ||
1109 (strcmp(errors,"strict") == 0)) {
1110 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001111 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 details);
1113 return -1;
1114 }
1115 else if (strcmp(errors,"ignore") == 0) {
1116 return 0;
1117 }
1118 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001119 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 return 0;
1121 }
1122 else {
1123 PyErr_Format(PyExc_ValueError,
1124 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001125 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126 errors);
1127 return -1;
1128 }
1129}
1130
/* Cached pointer to the C API table exported by the "ucnhash" module as
   its "ucnhashAPI" attribute.  It stays NULL until the first \N{...}
   escape is decoded, at which point PyUnicode_DecodeUnicodeEscape()
   imports the module and stores the pointer here. */
static _Py_UCNHashAPI *pucnHash = NULL;
1132
/* Case-insensitively compare at most `count` characters of s1 and s2.

   Returns 0 when the compared characters are equal ignoring case (or
   when count is 0); otherwise the difference between the first pair of
   lower-cased characters that differ, taken as unsigned char values.

   Comparison stops at the first difference, after `count` characters,
   or when both strings reach their NUL terminator, so neither string is
   read past its end. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    while (count-- > 0) {
        /* <ctype.h> functions require an unsigned char value (or EOF);
           passing a negative plain char is undefined behaviour. */
        int c1 = tolower((unsigned char)*s1++);
        int c2 = tolower((unsigned char)*s2++);

        if (c1 != c2)
            return c1 - c2;
        if (c1 == '\0')
            break;      /* both strings ended together */
    }
    return 0;
}
1152
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1154 int size,
1155 const char *errors)
1156{
1157 PyUnicodeObject *v;
1158 Py_UNICODE *p = NULL, *buf = NULL;
1159 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001160 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161
1162 /* Escaped strings will always be longer than the resulting
1163 Unicode string, so we start with size here and then reduce the
1164 length after conversion to the true value. */
1165 v = _PyUnicode_New(size);
1166 if (v == NULL)
1167 goto onError;
1168 if (size == 0)
1169 return (PyObject *)v;
1170 p = buf = PyUnicode_AS_UNICODE(v);
1171 end = s + size;
1172 while (s < end) {
1173 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001174 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 int i;
1176
1177 /* Non-escape characters are interpreted as Unicode ordinals */
1178 if (*s != '\\') {
1179 *p++ = (unsigned char)*s++;
1180 continue;
1181 }
1182
1183 /* \ - Escapes */
1184 s++;
1185 switch (*s++) {
1186
1187 /* \x escapes */
1188 case '\n': break;
1189 case '\\': *p++ = '\\'; break;
1190 case '\'': *p++ = '\''; break;
1191 case '\"': *p++ = '\"'; break;
1192 case 'b': *p++ = '\b'; break;
1193 case 'f': *p++ = '\014'; break; /* FF */
1194 case 't': *p++ = '\t'; break;
1195 case 'n': *p++ = '\n'; break;
1196 case 'r': *p++ = '\r'; break;
1197 case 'v': *p++ = '\013'; break; /* VT */
1198 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1199
1200 /* \OOO (octal) escapes */
1201 case '0': case '1': case '2': case '3':
1202 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001203 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001205 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001207 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001209 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 break;
1211
Fredrik Lundhdf846752000-09-03 11:29:49 +00001212 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001214 for (x = 0, i = 0; i < 2; i++) {
1215 c = (unsigned char)s[i];
1216 if (!isxdigit(c)) {
1217 if (unicodeescape_decoding_error(&s, &x, errors,
1218 "truncated \\xXX"))
1219 goto onError;
1220 i++;
1221 break;
1222 }
1223 x = (x<<4) & ~0xF;
1224 if (c >= '0' && c <= '9')
1225 x += c - '0';
1226 else if (c >= 'a' && c <= 'f')
1227 x += 10 + c - 'a';
1228 else
1229 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001231 s += i;
1232 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 break;
1234
1235 /* \uXXXX with 4 hex digits */
1236 case 'u':
1237 for (x = 0, i = 0; i < 4; i++) {
1238 c = (unsigned char)s[i];
1239 if (!isxdigit(c)) {
1240 if (unicodeescape_decoding_error(&s, &x, errors,
1241 "truncated \\uXXXX"))
1242 goto onError;
1243 i++;
1244 break;
1245 }
1246 x = (x<<4) & ~0xF;
1247 if (c >= '0' && c <= '9')
1248 x += c - '0';
1249 else if (c >= 'a' && c <= 'f')
1250 x += 10 + c - 'a';
1251 else
1252 x += 10 + c - 'A';
1253 }
1254 s += i;
1255 *p++ = x;
1256 break;
1257
Fredrik Lundhdf846752000-09-03 11:29:49 +00001258 /* \UXXXXXXXX with 8 hex digits */
1259 case 'U':
1260 for (chr = 0, i = 0; i < 8; i++) {
1261 c = (unsigned char)s[i];
1262 if (!isxdigit(c)) {
1263 if (unicodeescape_decoding_error(&s, &x, errors,
1264 "truncated \\uXXXX"))
1265 goto onError;
1266 i++;
1267 break;
1268 }
1269 chr = (chr<<4) & ~0xF;
1270 if (c >= '0' && c <= '9')
1271 chr += c - '0';
1272 else if (c >= 'a' && c <= 'f')
1273 chr += 10 + c - 'a';
1274 else
1275 chr += 10 + c - 'A';
1276 }
1277 s += i;
1278 goto store;
1279
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001280 case 'N':
1281 /* Ok, we need to deal with Unicode Character Names now,
1282 * make sure we've imported the hash table data...
1283 */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001284 if (pucnHash == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001285 PyObject *mod = 0, *v = 0;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001286 mod = PyImport_ImportModule("ucnhash");
1287 if (mod == NULL)
1288 goto onError;
1289 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1290 Py_DECREF(mod);
1291 if (v == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001292 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001293 pucnHash = PyCObject_AsVoidPtr(v);
1294 Py_DECREF(v);
1295 if (pucnHash == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001296 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001297 }
1298
Fredrik Lundhdf846752000-09-03 11:29:49 +00001299 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001300 const char *start = s + 1;
1301 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001302 unsigned long j;
1303
1304 /* look for either the closing brace, or we
1305 * exceed the maximum length of the unicode character names
1306 */
1307 while (*endBrace != '}' &&
1308 (unsigned int)(endBrace - start) <=
1309 pucnHash->cchMax &&
1310 endBrace < end)
1311 {
1312 endBrace++;
1313 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001314 if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001315 j = pucnHash->hash(start, endBrace - start);
1316 if (j > pucnHash->cKeys ||
1317 mystrnicmp(
1318 start,
1319 ((_Py_UnicodeCharacterName *)
1320 (pucnHash->getValue(j)))->pszUCN,
1321 (int)(endBrace - start)) != 0)
1322 {
1323 if (unicodeescape_decoding_error(
1324 &s, &x, errors,
1325 "Invalid Unicode Character Name"))
1326 {
1327 goto onError;
1328 }
1329 goto ucnFallthrough;
1330 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001331 chr = ((_Py_UnicodeCharacterName *)
1332 (pucnHash->getValue(j)))->value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001333 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001334 goto store;
1335 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001336 if (unicodeescape_decoding_error(
1337 &s, &x, errors,
1338 "Unicode name missing closing brace"))
1339 goto onError;
1340 goto ucnFallthrough;
1341 }
1342 break;
1343 }
1344 if (unicodeescape_decoding_error(
1345 &s, &x, errors,
1346 "Missing opening brace for Unicode Character Name escape"))
1347 goto onError;
1348ucnFallthrough:
1349 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001350 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 *p++ = '\\';
1352 *p++ = (unsigned char)s[-1];
1353 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001354store:
1355 /* when we get here, chr is a 32-bit unicode character */
1356 if (chr <= 0xffff)
1357 /* UCS-2 character */
1358 *p++ = (Py_UNICODE) chr;
1359 else if (chr <= 0x10ffff) {
1360 /* UCS-4 character. store as two surrogate characters */
1361 chr -= 0x10000L;
1362 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1363 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1364 } else {
1365 if (unicodeescape_decoding_error(
1366 &s, &x, errors,
1367 "Illegal Unicode character")
1368 )
1369 goto onError;
1370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 }
1372 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001373 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return (PyObject *)v;
1376
1377 onError:
1378 Py_XDECREF(v);
1379 return NULL;
1380}
1381
1382/* Return a Unicode-Escape string version of the Unicode object.
1383
1384 If quotes is true, the string is enclosed in u"" or u'' quotes as
1385 appropriate.
1386
1387*/
1388
Barry Warsaw51ac5802000-03-20 16:36:48 +00001389static const Py_UNICODE *findchar(const Py_UNICODE *s,
1390 int size,
1391 Py_UNICODE ch);
1392
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393static
1394PyObject *unicodeescape_string(const Py_UNICODE *s,
1395 int size,
1396 int quotes)
1397{
1398 PyObject *repr;
1399 char *p;
1400 char *q;
1401
1402 static const char *hexdigit = "0123456789ABCDEF";
1403
1404 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1405 if (repr == NULL)
1406 return NULL;
1407
1408 p = q = PyString_AS_STRING(repr);
1409
1410 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411 *p++ = 'u';
1412 *p++ = (findchar(s, size, '\'') &&
1413 !findchar(s, size, '"')) ? '"' : '\'';
1414 }
1415 while (size-- > 0) {
1416 Py_UNICODE ch = *s++;
1417 /* Escape quotes */
1418 if (quotes && (ch == q[1] || ch == '\\')) {
1419 *p++ = '\\';
1420 *p++ = (char) ch;
1421 }
1422 /* Map 16-bit characters to '\uxxxx' */
1423 else if (ch >= 256) {
1424 *p++ = '\\';
1425 *p++ = 'u';
1426 *p++ = hexdigit[(ch >> 12) & 0xf];
1427 *p++ = hexdigit[(ch >> 8) & 0xf];
1428 *p++ = hexdigit[(ch >> 4) & 0xf];
1429 *p++ = hexdigit[ch & 15];
1430 }
1431 /* Map non-printable US ASCII to '\ooo' */
1432 else if (ch < ' ' || ch >= 128) {
1433 *p++ = '\\';
1434 *p++ = hexdigit[(ch >> 6) & 7];
1435 *p++ = hexdigit[(ch >> 3) & 7];
1436 *p++ = hexdigit[ch & 7];
1437 }
1438 /* Copy everything else as-is */
1439 else
1440 *p++ = (char) ch;
1441 }
1442 if (quotes)
1443 *p++ = q[1];
1444
1445 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001446 if (_PyString_Resize(&repr, p - q))
1447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
1449 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001450
1451 onError:
1452 Py_DECREF(repr);
1453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454}
1455
1456PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1457 int size)
1458{
1459 return unicodeescape_string(s, size, 0);
1460}
1461
1462PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1463{
1464 if (!PyUnicode_Check(unicode)) {
1465 PyErr_BadArgument();
1466 return NULL;
1467 }
1468 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1469 PyUnicode_GET_SIZE(unicode));
1470}
1471
1472/* --- Raw Unicode Escape Codec ------------------------------------------- */
1473
1474PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1475 int size,
1476 const char *errors)
1477{
1478 PyUnicodeObject *v;
1479 Py_UNICODE *p, *buf;
1480 const char *end;
1481 const char *bs;
1482
1483 /* Escaped strings will always be longer than the resulting
1484 Unicode string, so we start with size here and then reduce the
1485 length after conversion to the true value. */
1486 v = _PyUnicode_New(size);
1487 if (v == NULL)
1488 goto onError;
1489 if (size == 0)
1490 return (PyObject *)v;
1491 p = buf = PyUnicode_AS_UNICODE(v);
1492 end = s + size;
1493 while (s < end) {
1494 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001495 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 int i;
1497
1498 /* Non-escape characters are interpreted as Unicode ordinals */
1499 if (*s != '\\') {
1500 *p++ = (unsigned char)*s++;
1501 continue;
1502 }
1503
1504 /* \u-escapes are only interpreted iff the number of leading
1505 backslashes if odd */
1506 bs = s;
1507 for (;s < end;) {
1508 if (*s != '\\')
1509 break;
1510 *p++ = (unsigned char)*s++;
1511 }
1512 if (((s - bs) & 1) == 0 ||
1513 s >= end ||
1514 *s != 'u') {
1515 continue;
1516 }
1517 p--;
1518 s++;
1519
1520 /* \uXXXX with 4 hex digits */
1521 for (x = 0, i = 0; i < 4; i++) {
1522 c = (unsigned char)s[i];
1523 if (!isxdigit(c)) {
1524 if (unicodeescape_decoding_error(&s, &x, errors,
1525 "truncated \\uXXXX"))
1526 goto onError;
1527 i++;
1528 break;
1529 }
1530 x = (x<<4) & ~0xF;
1531 if (c >= '0' && c <= '9')
1532 x += c - '0';
1533 else if (c >= 'a' && c <= 'f')
1534 x += 10 + c - 'a';
1535 else
1536 x += 10 + c - 'A';
1537 }
1538 s += i;
1539 *p++ = x;
1540 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001541 if (_PyUnicode_Resize(v, (int)(p - buf)))
1542 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543 return (PyObject *)v;
1544
1545 onError:
1546 Py_XDECREF(v);
1547 return NULL;
1548}
1549
1550PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1551 int size)
1552{
1553 PyObject *repr;
1554 char *p;
1555 char *q;
1556
1557 static const char *hexdigit = "0123456789ABCDEF";
1558
1559 repr = PyString_FromStringAndSize(NULL, 6 * size);
1560 if (repr == NULL)
1561 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001562 if (size == 0)
1563 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564
1565 p = q = PyString_AS_STRING(repr);
1566 while (size-- > 0) {
1567 Py_UNICODE ch = *s++;
1568 /* Map 16-bit characters to '\uxxxx' */
1569 if (ch >= 256) {
1570 *p++ = '\\';
1571 *p++ = 'u';
1572 *p++ = hexdigit[(ch >> 12) & 0xf];
1573 *p++ = hexdigit[(ch >> 8) & 0xf];
1574 *p++ = hexdigit[(ch >> 4) & 0xf];
1575 *p++ = hexdigit[ch & 15];
1576 }
1577 /* Copy everything else as-is */
1578 else
1579 *p++ = (char) ch;
1580 }
1581 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001582 if (_PyString_Resize(&repr, p - q))
1583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584
1585 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001586
1587 onError:
1588 Py_DECREF(repr);
1589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590}
1591
1592PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1593{
1594 if (!PyUnicode_Check(unicode)) {
1595 PyErr_BadArgument();
1596 return NULL;
1597 }
1598 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1599 PyUnicode_GET_SIZE(unicode));
1600}
1601
1602/* --- Latin-1 Codec ------------------------------------------------------ */
1603
1604PyObject *PyUnicode_DecodeLatin1(const char *s,
1605 int size,
1606 const char *errors)
1607{
1608 PyUnicodeObject *v;
1609 Py_UNICODE *p;
1610
1611 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1612 v = _PyUnicode_New(size);
1613 if (v == NULL)
1614 goto onError;
1615 if (size == 0)
1616 return (PyObject *)v;
1617 p = PyUnicode_AS_UNICODE(v);
1618 while (size-- > 0)
1619 *p++ = (unsigned char)*s++;
1620 return (PyObject *)v;
1621
1622 onError:
1623 Py_XDECREF(v);
1624 return NULL;
1625}
1626
1627static
1628int latin1_encoding_error(const Py_UNICODE **source,
1629 char **dest,
1630 const char *errors,
1631 const char *details)
1632{
1633 if ((errors == NULL) ||
1634 (strcmp(errors,"strict") == 0)) {
1635 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001636 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637 details);
1638 return -1;
1639 }
1640 else if (strcmp(errors,"ignore") == 0) {
1641 return 0;
1642 }
1643 else if (strcmp(errors,"replace") == 0) {
1644 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001645 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 return 0;
1647 }
1648 else {
1649 PyErr_Format(PyExc_ValueError,
1650 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001651 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 errors);
1653 return -1;
1654 }
1655}
1656
1657PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1658 int size,
1659 const char *errors)
1660{
1661 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001662 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001663
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664 repr = PyString_FromStringAndSize(NULL, size);
1665 if (repr == NULL)
1666 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001667 if (size == 0)
1668 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669
1670 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001671 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672 while (size-- > 0) {
1673 Py_UNICODE ch = *p++;
1674 if (ch >= 256) {
1675 if (latin1_encoding_error(&p, &s, errors,
1676 "ordinal not in range(256)"))
1677 goto onError;
1678 }
1679 else
1680 *s++ = (char)ch;
1681 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001682 /* Resize if error handling skipped some characters */
1683 if (s - start < PyString_GET_SIZE(repr))
1684 if (_PyString_Resize(&repr, s - start))
1685 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return repr;
1687
1688 onError:
1689 Py_DECREF(repr);
1690 return NULL;
1691}
1692
1693PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1694{
1695 if (!PyUnicode_Check(unicode)) {
1696 PyErr_BadArgument();
1697 return NULL;
1698 }
1699 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1700 PyUnicode_GET_SIZE(unicode),
1701 NULL);
1702}
1703
1704/* --- 7-bit ASCII Codec -------------------------------------------------- */
1705
1706static
1707int ascii_decoding_error(const char **source,
1708 Py_UNICODE **dest,
1709 const char *errors,
1710 const char *details)
1711{
1712 if ((errors == NULL) ||
1713 (strcmp(errors,"strict") == 0)) {
1714 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001715 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 details);
1717 return -1;
1718 }
1719 else if (strcmp(errors,"ignore") == 0) {
1720 return 0;
1721 }
1722 else if (strcmp(errors,"replace") == 0) {
1723 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1724 (*dest)++;
1725 return 0;
1726 }
1727 else {
1728 PyErr_Format(PyExc_ValueError,
1729 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001730 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 errors);
1732 return -1;
1733 }
1734}
1735
1736PyObject *PyUnicode_DecodeASCII(const char *s,
1737 int size,
1738 const char *errors)
1739{
1740 PyUnicodeObject *v;
1741 Py_UNICODE *p;
1742
1743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744 v = _PyUnicode_New(size);
1745 if (v == NULL)
1746 goto onError;
1747 if (size == 0)
1748 return (PyObject *)v;
1749 p = PyUnicode_AS_UNICODE(v);
1750 while (size-- > 0) {
1751 register unsigned char c;
1752
1753 c = (unsigned char)*s++;
1754 if (c < 128)
1755 *p++ = c;
1756 else if (ascii_decoding_error(&s, &p, errors,
1757 "ordinal not in range(128)"))
1758 goto onError;
1759 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001760 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1761 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 return (PyObject *)v;
1764
1765 onError:
1766 Py_XDECREF(v);
1767 return NULL;
1768}
1769
1770static
1771int ascii_encoding_error(const Py_UNICODE **source,
1772 char **dest,
1773 const char *errors,
1774 const char *details)
1775{
1776 if ((errors == NULL) ||
1777 (strcmp(errors,"strict") == 0)) {
1778 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001779 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 details);
1781 return -1;
1782 }
1783 else if (strcmp(errors,"ignore") == 0) {
1784 return 0;
1785 }
1786 else if (strcmp(errors,"replace") == 0) {
1787 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001788 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 return 0;
1790 }
1791 else {
1792 PyErr_Format(PyExc_ValueError,
1793 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001794 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 errors);
1796 return -1;
1797 }
1798}
1799
1800PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1801 int size,
1802 const char *errors)
1803{
1804 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001805 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 repr = PyString_FromStringAndSize(NULL, size);
1808 if (repr == NULL)
1809 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001810 if (size == 0)
1811 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812
1813 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001814 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 while (size-- > 0) {
1816 Py_UNICODE ch = *p++;
1817 if (ch >= 128) {
1818 if (ascii_encoding_error(&p, &s, errors,
1819 "ordinal not in range(128)"))
1820 goto onError;
1821 }
1822 else
1823 *s++ = (char)ch;
1824 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001825 /* Resize if error handling skipped some characters */
1826 if (s - start < PyString_GET_SIZE(repr))
1827 if (_PyString_Resize(&repr, s - start))
1828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 return repr;
1830
1831 onError:
1832 Py_DECREF(repr);
1833 return NULL;
1834}
1835
1836PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1837{
1838 if (!PyUnicode_Check(unicode)) {
1839 PyErr_BadArgument();
1840 return NULL;
1841 }
1842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1843 PyUnicode_GET_SIZE(unicode),
1844 NULL);
1845}
1846
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001847#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001848
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001849/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001851PyObject *PyUnicode_DecodeMBCS(const char *s,
1852 int size,
1853 const char *errors)
1854{
1855 PyUnicodeObject *v;
1856 Py_UNICODE *p;
1857
1858 /* First get the size of the result */
1859 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001860 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1862
1863 v = _PyUnicode_New(usize);
1864 if (v == NULL)
1865 return NULL;
1866 if (usize == 0)
1867 return (PyObject *)v;
1868 p = PyUnicode_AS_UNICODE(v);
1869 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1870 Py_DECREF(v);
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872 }
1873
1874 return (PyObject *)v;
1875}
1876
1877PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1878 int size,
1879 const char *errors)
1880{
1881 PyObject *repr;
1882 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001883 DWORD mbcssize;
1884
1885 /* If there are no characters, bail now! */
1886 if (size==0)
1887 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888
1889 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001890 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001891 if (mbcssize==0)
1892 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1893
1894 repr = PyString_FromStringAndSize(NULL, mbcssize);
1895 if (repr == NULL)
1896 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001897 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001898 return repr;
1899
1900 /* Do the conversion */
1901 s = PyString_AS_STRING(repr);
1902 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1903 Py_DECREF(repr);
1904 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1905 }
1906 return repr;
1907}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001908
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001909#endif /* MS_WIN32 */
1910
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911/* --- Character Mapping Codec -------------------------------------------- */
1912
1913static
1914int charmap_decoding_error(const char **source,
1915 Py_UNICODE **dest,
1916 const char *errors,
1917 const char *details)
1918{
1919 if ((errors == NULL) ||
1920 (strcmp(errors,"strict") == 0)) {
1921 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001922 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 details);
1924 return -1;
1925 }
1926 else if (strcmp(errors,"ignore") == 0) {
1927 return 0;
1928 }
1929 else if (strcmp(errors,"replace") == 0) {
1930 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1931 (*dest)++;
1932 return 0;
1933 }
1934 else {
1935 PyErr_Format(PyExc_ValueError,
1936 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001937 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 errors);
1939 return -1;
1940 }
1941}
1942
1943PyObject *PyUnicode_DecodeCharmap(const char *s,
1944 int size,
1945 PyObject *mapping,
1946 const char *errors)
1947{
1948 PyUnicodeObject *v;
1949 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001950 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951
1952 /* Default to Latin-1 */
1953 if (mapping == NULL)
1954 return PyUnicode_DecodeLatin1(s, size, errors);
1955
1956 v = _PyUnicode_New(size);
1957 if (v == NULL)
1958 goto onError;
1959 if (size == 0)
1960 return (PyObject *)v;
1961 p = PyUnicode_AS_UNICODE(v);
1962 while (size-- > 0) {
1963 unsigned char ch = *s++;
1964 PyObject *w, *x;
1965
1966 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1967 w = PyInt_FromLong((long)ch);
1968 if (w == NULL)
1969 goto onError;
1970 x = PyObject_GetItem(mapping, w);
1971 Py_DECREF(w);
1972 if (x == NULL) {
1973 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001974 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001976 x = Py_None;
1977 Py_INCREF(x);
1978 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001979 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 }
1981
1982 /* Apply mapping */
1983 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001984 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 if (value < 0 || value > 65535) {
1986 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001987 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 Py_DECREF(x);
1989 goto onError;
1990 }
1991 *p++ = (Py_UNICODE)value;
1992 }
1993 else if (x == Py_None) {
1994 /* undefined mapping */
1995 if (charmap_decoding_error(&s, &p, errors,
1996 "character maps to <undefined>")) {
1997 Py_DECREF(x);
1998 goto onError;
1999 }
2000 }
2001 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002002 int targetsize = PyUnicode_GET_SIZE(x);
2003
2004 if (targetsize == 1)
2005 /* 1-1 mapping */
2006 *p++ = *PyUnicode_AS_UNICODE(x);
2007
2008 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002010 if (targetsize > extrachars) {
2011 /* resize first */
2012 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2013 int needed = (targetsize - extrachars) + \
2014 (targetsize << 2);
2015 extrachars += needed;
2016 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002017 Py_DECREF(x);
2018 goto onError;
2019 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002020 p = PyUnicode_AS_UNICODE(v) + oldpos;
2021 }
2022 Py_UNICODE_COPY(p,
2023 PyUnicode_AS_UNICODE(x),
2024 targetsize);
2025 p += targetsize;
2026 extrachars -= targetsize;
2027 }
2028 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 }
2030 else {
2031 /* wrong return value */
2032 PyErr_SetString(PyExc_TypeError,
2033 "character mapping must return integer, None or unicode");
2034 Py_DECREF(x);
2035 goto onError;
2036 }
2037 Py_DECREF(x);
2038 }
2039 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2040 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2041 goto onError;
2042 return (PyObject *)v;
2043
2044 onError:
2045 Py_XDECREF(v);
2046 return NULL;
2047}
2048
2049static
2050int charmap_encoding_error(const Py_UNICODE **source,
2051 char **dest,
2052 const char *errors,
2053 const char *details)
2054{
2055 if ((errors == NULL) ||
2056 (strcmp(errors,"strict") == 0)) {
2057 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002058 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 details);
2060 return -1;
2061 }
2062 else if (strcmp(errors,"ignore") == 0) {
2063 return 0;
2064 }
2065 else if (strcmp(errors,"replace") == 0) {
2066 **dest = '?';
2067 (*dest)++;
2068 return 0;
2069 }
2070 else {
2071 PyErr_Format(PyExc_ValueError,
2072 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002073 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 errors);
2075 return -1;
2076 }
2077}
2078
2079PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2080 int size,
2081 PyObject *mapping,
2082 const char *errors)
2083{
2084 PyObject *v;
2085 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002086 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087
2088 /* Default to Latin-1 */
2089 if (mapping == NULL)
2090 return PyUnicode_EncodeLatin1(p, size, errors);
2091
2092 v = PyString_FromStringAndSize(NULL, size);
2093 if (v == NULL)
2094 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002095 if (size == 0)
2096 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 s = PyString_AS_STRING(v);
2098 while (size-- > 0) {
2099 Py_UNICODE ch = *p++;
2100 PyObject *w, *x;
2101
2102 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2103 w = PyInt_FromLong((long)ch);
2104 if (w == NULL)
2105 goto onError;
2106 x = PyObject_GetItem(mapping, w);
2107 Py_DECREF(w);
2108 if (x == NULL) {
2109 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002110 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002112 x = Py_None;
2113 Py_INCREF(x);
2114 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002115 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
2117
2118 /* Apply mapping */
2119 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002120 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 if (value < 0 || value > 255) {
2122 PyErr_SetString(PyExc_TypeError,
2123 "character mapping must be in range(256)");
2124 Py_DECREF(x);
2125 goto onError;
2126 }
2127 *s++ = (char)value;
2128 }
2129 else if (x == Py_None) {
2130 /* undefined mapping */
2131 if (charmap_encoding_error(&p, &s, errors,
2132 "character maps to <undefined>")) {
2133 Py_DECREF(x);
2134 goto onError;
2135 }
2136 }
2137 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002138 int targetsize = PyString_GET_SIZE(x);
2139
2140 if (targetsize == 1)
2141 /* 1-1 mapping */
2142 *s++ = *PyString_AS_STRING(x);
2143
2144 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002146 if (targetsize > extrachars) {
2147 /* resize first */
2148 int oldpos = (int)(s - PyString_AS_STRING(v));
2149 int needed = (targetsize - extrachars) + \
2150 (targetsize << 2);
2151 extrachars += needed;
2152 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002153 Py_DECREF(x);
2154 goto onError;
2155 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002156 s = PyString_AS_STRING(v) + oldpos;
2157 }
2158 memcpy(s,
2159 PyString_AS_STRING(x),
2160 targetsize);
2161 s += targetsize;
2162 extrachars -= targetsize;
2163 }
2164 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 }
2166 else {
2167 /* wrong return value */
2168 PyErr_SetString(PyExc_TypeError,
2169 "character mapping must return integer, None or unicode");
2170 Py_DECREF(x);
2171 goto onError;
2172 }
2173 Py_DECREF(x);
2174 }
2175 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2176 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2177 goto onError;
2178 return v;
2179
2180 onError:
2181 Py_DECREF(v);
2182 return NULL;
2183}
2184
2185PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2186 PyObject *mapping)
2187{
2188 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode),
2194 mapping,
2195 NULL);
2196}
2197
2198static
2199int translate_error(const Py_UNICODE **source,
2200 Py_UNICODE **dest,
2201 const char *errors,
2202 const char *details)
2203{
2204 if ((errors == NULL) ||
2205 (strcmp(errors,"strict") == 0)) {
2206 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002207 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 details);
2209 return -1;
2210 }
2211 else if (strcmp(errors,"ignore") == 0) {
2212 return 0;
2213 }
2214 else if (strcmp(errors,"replace") == 0) {
2215 **dest = '?';
2216 (*dest)++;
2217 return 0;
2218 }
2219 else {
2220 PyErr_Format(PyExc_ValueError,
2221 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002222 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 errors);
2224 return -1;
2225 }
2226}
2227
2228PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2229 int size,
2230 PyObject *mapping,
2231 const char *errors)
2232{
2233 PyUnicodeObject *v;
2234 Py_UNICODE *p;
2235
2236 if (mapping == NULL) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
2240
2241 /* Output will never be longer than input */
2242 v = _PyUnicode_New(size);
2243 if (v == NULL)
2244 goto onError;
2245 if (size == 0)
2246 goto done;
2247 p = PyUnicode_AS_UNICODE(v);
2248 while (size-- > 0) {
2249 Py_UNICODE ch = *s++;
2250 PyObject *w, *x;
2251
2252 /* Get mapping */
2253 w = PyInt_FromLong(ch);
2254 if (w == NULL)
2255 goto onError;
2256 x = PyObject_GetItem(mapping, w);
2257 Py_DECREF(w);
2258 if (x == NULL) {
2259 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2260 /* No mapping found: default to 1-1 mapping */
2261 PyErr_Clear();
2262 *p++ = ch;
2263 continue;
2264 }
2265 goto onError;
2266 }
2267
2268 /* Apply mapping */
2269 if (PyInt_Check(x))
2270 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2271 else if (x == Py_None) {
2272 /* undefined mapping */
2273 if (translate_error(&s, &p, errors,
2274 "character maps to <undefined>")) {
2275 Py_DECREF(x);
2276 goto onError;
2277 }
2278 }
2279 else if (PyUnicode_Check(x)) {
2280 if (PyUnicode_GET_SIZE(x) != 1) {
2281 /* 1-n mapping */
2282 PyErr_SetString(PyExc_NotImplementedError,
2283 "1-n mappings are currently not implemented");
2284 Py_DECREF(x);
2285 goto onError;
2286 }
2287 *p++ = *PyUnicode_AS_UNICODE(x);
2288 }
2289 else {
2290 /* wrong return value */
2291 PyErr_SetString(PyExc_TypeError,
2292 "translate mapping must return integer, None or unicode");
2293 Py_DECREF(x);
2294 goto onError;
2295 }
2296 Py_DECREF(x);
2297 }
2298 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002299 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2300 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 done:
2303 return (PyObject *)v;
2304
2305 onError:
2306 Py_XDECREF(v);
2307 return NULL;
2308}
2309
2310PyObject *PyUnicode_Translate(PyObject *str,
2311 PyObject *mapping,
2312 const char *errors)
2313{
2314 PyObject *result;
2315
2316 str = PyUnicode_FromObject(str);
2317 if (str == NULL)
2318 goto onError;
2319 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2320 PyUnicode_GET_SIZE(str),
2321 mapping,
2322 errors);
2323 Py_DECREF(str);
2324 return result;
2325
2326 onError:
2327 Py_XDECREF(str);
2328 return NULL;
2329}
2330
Guido van Rossum9e896b32000-04-05 20:11:21 +00002331/* --- Decimal Encoder ---------------------------------------------------- */
2332
2333int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2334 int length,
2335 char *output,
2336 const char *errors)
2337{
2338 Py_UNICODE *p, *end;
2339
2340 if (output == NULL) {
2341 PyErr_BadArgument();
2342 return -1;
2343 }
2344
2345 p = s;
2346 end = s + length;
2347 while (p < end) {
2348 register Py_UNICODE ch = *p++;
2349 int decimal;
2350
2351 if (Py_UNICODE_ISSPACE(ch)) {
2352 *output++ = ' ';
2353 continue;
2354 }
2355 decimal = Py_UNICODE_TODECIMAL(ch);
2356 if (decimal >= 0) {
2357 *output++ = '0' + decimal;
2358 continue;
2359 }
Guido van Rossumba477042000-04-06 18:18:10 +00002360 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002361 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002362 continue;
2363 }
2364 /* All other characters are considered invalid */
2365 if (errors == NULL || strcmp(errors, "strict") == 0) {
2366 PyErr_SetString(PyExc_ValueError,
2367 "invalid decimal Unicode string");
2368 goto onError;
2369 }
2370 else if (strcmp(errors, "ignore") == 0)
2371 continue;
2372 else if (strcmp(errors, "replace") == 0) {
2373 *output++ = '?';
2374 continue;
2375 }
2376 }
2377 /* 0-terminate the output string */
2378 *output++ = '\0';
2379 return 0;
2380
2381 onError:
2382 return -1;
2383}
2384
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385/* --- Helpers ------------------------------------------------------------ */
2386
2387static
2388int count(PyUnicodeObject *self,
2389 int start,
2390 int end,
2391 PyUnicodeObject *substring)
2392{
2393 int count = 0;
2394
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002395 if (start < 0)
2396 start += self->length;
2397 if (start < 0)
2398 start = 0;
2399 if (end > self->length)
2400 end = self->length;
2401 if (end < 0)
2402 end += self->length;
2403 if (end < 0)
2404 end = 0;
2405
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002406 if (substring->length == 0)
2407 return (end - start + 1);
2408
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 end -= substring->length;
2410
2411 while (start <= end)
2412 if (Py_UNICODE_MATCH(self, start, substring)) {
2413 count++;
2414 start += substring->length;
2415 } else
2416 start++;
2417
2418 return count;
2419}
2420
2421int PyUnicode_Count(PyObject *str,
2422 PyObject *substr,
2423 int start,
2424 int end)
2425{
2426 int result;
2427
2428 str = PyUnicode_FromObject(str);
2429 if (str == NULL)
2430 return -1;
2431 substr = PyUnicode_FromObject(substr);
2432 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002433 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434 return -1;
2435 }
2436
2437 result = count((PyUnicodeObject *)str,
2438 start, end,
2439 (PyUnicodeObject *)substr);
2440
2441 Py_DECREF(str);
2442 Py_DECREF(substr);
2443 return result;
2444}
2445
2446static
2447int findstring(PyUnicodeObject *self,
2448 PyUnicodeObject *substring,
2449 int start,
2450 int end,
2451 int direction)
2452{
2453 if (start < 0)
2454 start += self->length;
2455 if (start < 0)
2456 start = 0;
2457
2458 if (substring->length == 0)
2459 return start;
2460
2461 if (end > self->length)
2462 end = self->length;
2463 if (end < 0)
2464 end += self->length;
2465 if (end < 0)
2466 end = 0;
2467
2468 end -= substring->length;
2469
2470 if (direction < 0) {
2471 for (; end >= start; end--)
2472 if (Py_UNICODE_MATCH(self, end, substring))
2473 return end;
2474 } else {
2475 for (; start <= end; start++)
2476 if (Py_UNICODE_MATCH(self, start, substring))
2477 return start;
2478 }
2479
2480 return -1;
2481}
2482
2483int PyUnicode_Find(PyObject *str,
2484 PyObject *substr,
2485 int start,
2486 int end,
2487 int direction)
2488{
2489 int result;
2490
2491 str = PyUnicode_FromObject(str);
2492 if (str == NULL)
2493 return -1;
2494 substr = PyUnicode_FromObject(substr);
2495 if (substr == NULL) {
2496 Py_DECREF(substr);
2497 return -1;
2498 }
2499
2500 result = findstring((PyUnicodeObject *)str,
2501 (PyUnicodeObject *)substr,
2502 start, end, direction);
2503 Py_DECREF(str);
2504 Py_DECREF(substr);
2505 return result;
2506}
2507
2508static
2509int tailmatch(PyUnicodeObject *self,
2510 PyUnicodeObject *substring,
2511 int start,
2512 int end,
2513 int direction)
2514{
2515 if (start < 0)
2516 start += self->length;
2517 if (start < 0)
2518 start = 0;
2519
2520 if (substring->length == 0)
2521 return 1;
2522
2523 if (end > self->length)
2524 end = self->length;
2525 if (end < 0)
2526 end += self->length;
2527 if (end < 0)
2528 end = 0;
2529
2530 end -= substring->length;
2531 if (end < start)
2532 return 0;
2533
2534 if (direction > 0) {
2535 if (Py_UNICODE_MATCH(self, end, substring))
2536 return 1;
2537 } else {
2538 if (Py_UNICODE_MATCH(self, start, substring))
2539 return 1;
2540 }
2541
2542 return 0;
2543}
2544
2545int PyUnicode_Tailmatch(PyObject *str,
2546 PyObject *substr,
2547 int start,
2548 int end,
2549 int direction)
2550{
2551 int result;
2552
2553 str = PyUnicode_FromObject(str);
2554 if (str == NULL)
2555 return -1;
2556 substr = PyUnicode_FromObject(substr);
2557 if (substr == NULL) {
2558 Py_DECREF(substr);
2559 return -1;
2560 }
2561
2562 result = tailmatch((PyUnicodeObject *)str,
2563 (PyUnicodeObject *)substr,
2564 start, end, direction);
2565 Py_DECREF(str);
2566 Py_DECREF(substr);
2567 return result;
2568}
2569
2570static
2571const Py_UNICODE *findchar(const Py_UNICODE *s,
2572 int size,
2573 Py_UNICODE ch)
2574{
2575 /* like wcschr, but doesn't stop at NULL characters */
2576
2577 while (size-- > 0) {
2578 if (*s == ch)
2579 return s;
2580 s++;
2581 }
2582
2583 return NULL;
2584}
2585
2586/* Apply fixfct filter to the Unicode object self and return a
2587 reference to the modified object */
2588
2589static
2590PyObject *fixup(PyUnicodeObject *self,
2591 int (*fixfct)(PyUnicodeObject *s))
2592{
2593
2594 PyUnicodeObject *u;
2595
2596 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2597 self->length);
2598 if (u == NULL)
2599 return NULL;
2600 if (!fixfct(u)) {
2601 /* fixfct should return TRUE if it modified the buffer. If
2602 FALSE, return a reference to the original buffer instead
2603 (to save space, not time) */
2604 Py_INCREF(self);
2605 Py_DECREF(u);
2606 return (PyObject*) self;
2607 }
2608 return (PyObject*) u;
2609}
2610
2611static
2612int fixupper(PyUnicodeObject *self)
2613{
2614 int len = self->length;
2615 Py_UNICODE *s = self->str;
2616 int status = 0;
2617
2618 while (len-- > 0) {
2619 register Py_UNICODE ch;
2620
2621 ch = Py_UNICODE_TOUPPER(*s);
2622 if (ch != *s) {
2623 status = 1;
2624 *s = ch;
2625 }
2626 s++;
2627 }
2628
2629 return status;
2630}
2631
2632static
2633int fixlower(PyUnicodeObject *self)
2634{
2635 int len = self->length;
2636 Py_UNICODE *s = self->str;
2637 int status = 0;
2638
2639 while (len-- > 0) {
2640 register Py_UNICODE ch;
2641
2642 ch = Py_UNICODE_TOLOWER(*s);
2643 if (ch != *s) {
2644 status = 1;
2645 *s = ch;
2646 }
2647 s++;
2648 }
2649
2650 return status;
2651}
2652
2653static
2654int fixswapcase(PyUnicodeObject *self)
2655{
2656 int len = self->length;
2657 Py_UNICODE *s = self->str;
2658 int status = 0;
2659
2660 while (len-- > 0) {
2661 if (Py_UNICODE_ISUPPER(*s)) {
2662 *s = Py_UNICODE_TOLOWER(*s);
2663 status = 1;
2664 } else if (Py_UNICODE_ISLOWER(*s)) {
2665 *s = Py_UNICODE_TOUPPER(*s);
2666 status = 1;
2667 }
2668 s++;
2669 }
2670
2671 return status;
2672}
2673
2674static
2675int fixcapitalize(PyUnicodeObject *self)
2676{
2677 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2678 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2679 return 1;
2680 }
2681 return 0;
2682}
2683
2684static
2685int fixtitle(PyUnicodeObject *self)
2686{
2687 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2688 register Py_UNICODE *e;
2689 int previous_is_cased;
2690
2691 /* Shortcut for single character strings */
2692 if (PyUnicode_GET_SIZE(self) == 1) {
2693 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2694 if (*p != ch) {
2695 *p = ch;
2696 return 1;
2697 }
2698 else
2699 return 0;
2700 }
2701
2702 e = p + PyUnicode_GET_SIZE(self);
2703 previous_is_cased = 0;
2704 for (; p < e; p++) {
2705 register const Py_UNICODE ch = *p;
2706
2707 if (previous_is_cased)
2708 *p = Py_UNICODE_TOLOWER(ch);
2709 else
2710 *p = Py_UNICODE_TOTITLE(ch);
2711
2712 if (Py_UNICODE_ISLOWER(ch) ||
2713 Py_UNICODE_ISUPPER(ch) ||
2714 Py_UNICODE_ISTITLE(ch))
2715 previous_is_cased = 1;
2716 else
2717 previous_is_cased = 0;
2718 }
2719 return 1;
2720}
2721
2722PyObject *PyUnicode_Join(PyObject *separator,
2723 PyObject *seq)
2724{
2725 Py_UNICODE *sep;
2726 int seplen;
2727 PyUnicodeObject *res = NULL;
2728 int reslen = 0;
2729 Py_UNICODE *p;
2730 int seqlen = 0;
2731 int sz = 100;
2732 int i;
2733
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002734 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 if (seqlen < 0 && PyErr_Occurred())
2736 return NULL;
2737
2738 if (separator == NULL) {
2739 Py_UNICODE blank = ' ';
2740 sep = &blank;
2741 seplen = 1;
2742 }
2743 else {
2744 separator = PyUnicode_FromObject(separator);
2745 if (separator == NULL)
2746 return NULL;
2747 sep = PyUnicode_AS_UNICODE(separator);
2748 seplen = PyUnicode_GET_SIZE(separator);
2749 }
2750
2751 res = _PyUnicode_New(sz);
2752 if (res == NULL)
2753 goto onError;
2754 p = PyUnicode_AS_UNICODE(res);
2755 reslen = 0;
2756
2757 for (i = 0; i < seqlen; i++) {
2758 int itemlen;
2759 PyObject *item;
2760
2761 item = PySequence_GetItem(seq, i);
2762 if (item == NULL)
2763 goto onError;
2764 if (!PyUnicode_Check(item)) {
2765 PyObject *v;
2766 v = PyUnicode_FromObject(item);
2767 Py_DECREF(item);
2768 item = v;
2769 if (item == NULL)
2770 goto onError;
2771 }
2772 itemlen = PyUnicode_GET_SIZE(item);
2773 while (reslen + itemlen + seplen >= sz) {
2774 if (_PyUnicode_Resize(res, sz*2))
2775 goto onError;
2776 sz *= 2;
2777 p = PyUnicode_AS_UNICODE(res) + reslen;
2778 }
2779 if (i > 0) {
2780 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2781 p += seplen;
2782 reslen += seplen;
2783 }
2784 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2785 p += itemlen;
2786 reslen += itemlen;
2787 Py_DECREF(item);
2788 }
2789 if (_PyUnicode_Resize(res, reslen))
2790 goto onError;
2791
2792 Py_XDECREF(separator);
2793 return (PyObject *)res;
2794
2795 onError:
2796 Py_XDECREF(separator);
2797 Py_DECREF(res);
2798 return NULL;
2799}
2800
2801static
2802PyUnicodeObject *pad(PyUnicodeObject *self,
2803 int left,
2804 int right,
2805 Py_UNICODE fill)
2806{
2807 PyUnicodeObject *u;
2808
2809 if (left < 0)
2810 left = 0;
2811 if (right < 0)
2812 right = 0;
2813
2814 if (left == 0 && right == 0) {
2815 Py_INCREF(self);
2816 return self;
2817 }
2818
2819 u = _PyUnicode_New(left + self->length + right);
2820 if (u) {
2821 if (left)
2822 Py_UNICODE_FILL(u->str, fill, left);
2823 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2824 if (right)
2825 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2826 }
2827
2828 return u;
2829}
2830
/* Append the slice data[left:right] to the result list as a new Unicode
   object.  The macro relies on the expanding function providing a
   `PyObject *str` scratch variable, a `list` variable and an `onError`
   label, to which it jumps when the slice cannot be created or appended.
   Must be used as a statement followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list now holds its own reference */                      \
        Py_DECREF(str);                                                 \
    } while (0)
2841
2842static
2843PyObject *split_whitespace(PyUnicodeObject *self,
2844 PyObject *list,
2845 int maxcount)
2846{
2847 register int i;
2848 register int j;
2849 int len = self->length;
2850 PyObject *str;
2851
2852 for (i = j = 0; i < len; ) {
2853 /* find a token */
2854 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2855 i++;
2856 j = i;
2857 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2858 i++;
2859 if (j < i) {
2860 if (maxcount-- <= 0)
2861 break;
2862 SPLIT_APPEND(self->str, j, i);
2863 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2864 i++;
2865 j = i;
2866 }
2867 }
2868 if (j < len) {
2869 SPLIT_APPEND(self->str, j, len);
2870 }
2871 return list;
2872
2873 onError:
2874 Py_DECREF(list);
2875 return NULL;
2876}
2877
2878PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002879 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880{
2881 register int i;
2882 register int j;
2883 int len;
2884 PyObject *list;
2885 PyObject *str;
2886 Py_UNICODE *data;
2887
2888 string = PyUnicode_FromObject(string);
2889 if (string == NULL)
2890 return NULL;
2891 data = PyUnicode_AS_UNICODE(string);
2892 len = PyUnicode_GET_SIZE(string);
2893
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 list = PyList_New(0);
2895 if (!list)
2896 goto onError;
2897
2898 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002899 int eol;
2900
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 /* Find a line and append it */
2902 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2903 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904
2905 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002906 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 if (i < len) {
2908 if (data[i] == '\r' && i + 1 < len &&
2909 data[i+1] == '\n')
2910 i += 2;
2911 else
2912 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002913 if (keepends)
2914 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 }
Guido van Rossum86662912000-04-11 15:38:46 +00002916 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 j = i;
2918 }
2919 if (j < len) {
2920 SPLIT_APPEND(data, j, len);
2921 }
2922
2923 Py_DECREF(string);
2924 return list;
2925
2926 onError:
2927 Py_DECREF(list);
2928 Py_DECREF(string);
2929 return NULL;
2930}
2931
2932static
2933PyObject *split_char(PyUnicodeObject *self,
2934 PyObject *list,
2935 Py_UNICODE ch,
2936 int maxcount)
2937{
2938 register int i;
2939 register int j;
2940 int len = self->length;
2941 PyObject *str;
2942
2943 for (i = j = 0; i < len; ) {
2944 if (self->str[i] == ch) {
2945 if (maxcount-- <= 0)
2946 break;
2947 SPLIT_APPEND(self->str, j, i);
2948 i = j = i + 1;
2949 } else
2950 i++;
2951 }
2952 if (j <= len) {
2953 SPLIT_APPEND(self->str, j, len);
2954 }
2955 return list;
2956
2957 onError:
2958 Py_DECREF(list);
2959 return NULL;
2960}
2961
2962static
2963PyObject *split_substring(PyUnicodeObject *self,
2964 PyObject *list,
2965 PyUnicodeObject *substring,
2966 int maxcount)
2967{
2968 register int i;
2969 register int j;
2970 int len = self->length;
2971 int sublen = substring->length;
2972 PyObject *str;
2973
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002974 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 if (Py_UNICODE_MATCH(self, i, substring)) {
2976 if (maxcount-- <= 0)
2977 break;
2978 SPLIT_APPEND(self->str, j, i);
2979 i = j = i + sublen;
2980 } else
2981 i++;
2982 }
2983 if (j <= len) {
2984 SPLIT_APPEND(self->str, j, len);
2985 }
2986 return list;
2987
2988 onError:
2989 Py_DECREF(list);
2990 return NULL;
2991}
2992
2993#undef SPLIT_APPEND
2994
2995static
2996PyObject *split(PyUnicodeObject *self,
2997 PyUnicodeObject *substring,
2998 int maxcount)
2999{
3000 PyObject *list;
3001
3002 if (maxcount < 0)
3003 maxcount = INT_MAX;
3004
3005 list = PyList_New(0);
3006 if (!list)
3007 return NULL;
3008
3009 if (substring == NULL)
3010 return split_whitespace(self,list,maxcount);
3011
3012 else if (substring->length == 1)
3013 return split_char(self,list,substring->str[0],maxcount);
3014
3015 else if (substring->length == 0) {
3016 Py_DECREF(list);
3017 PyErr_SetString(PyExc_ValueError, "empty separator");
3018 return NULL;
3019 }
3020 else
3021 return split_substring(self,list,substring,maxcount);
3022}
3023
3024static
3025PyObject *strip(PyUnicodeObject *self,
3026 int left,
3027 int right)
3028{
3029 Py_UNICODE *p = self->str;
3030 int start = 0;
3031 int end = self->length;
3032
3033 if (left)
3034 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3035 start++;
3036
3037 if (right)
3038 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3039 end--;
3040
3041 if (start == 0 && end == self->length) {
3042 /* couldn't strip anything off, return original string */
3043 Py_INCREF(self);
3044 return (PyObject*) self;
3045 }
3046
3047 return (PyObject*) PyUnicode_FromUnicode(
3048 self->str + start,
3049 end - start
3050 );
3051}
3052
3053static
3054PyObject *replace(PyUnicodeObject *self,
3055 PyUnicodeObject *str1,
3056 PyUnicodeObject *str2,
3057 int maxcount)
3058{
3059 PyUnicodeObject *u;
3060
3061 if (maxcount < 0)
3062 maxcount = INT_MAX;
3063
3064 if (str1->length == 1 && str2->length == 1) {
3065 int i;
3066
3067 /* replace characters */
3068 if (!findchar(self->str, self->length, str1->str[0])) {
3069 /* nothing to replace, return original string */
3070 Py_INCREF(self);
3071 u = self;
3072 } else {
3073 Py_UNICODE u1 = str1->str[0];
3074 Py_UNICODE u2 = str2->str[0];
3075
3076 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3077 self->str,
3078 self->length
3079 );
3080 if (u)
3081 for (i = 0; i < u->length; i++)
3082 if (u->str[i] == u1) {
3083 if (--maxcount < 0)
3084 break;
3085 u->str[i] = u2;
3086 }
3087 }
3088
3089 } else {
3090 int n, i;
3091 Py_UNICODE *p;
3092
3093 /* replace strings */
3094 n = count(self, 0, self->length, str1);
3095 if (n > maxcount)
3096 n = maxcount;
3097 if (n == 0) {
3098 /* nothing to replace, return original string */
3099 Py_INCREF(self);
3100 u = self;
3101 } else {
3102 u = _PyUnicode_New(
3103 self->length + n * (str2->length - str1->length));
3104 if (u) {
3105 i = 0;
3106 p = u->str;
3107 while (i <= self->length - str1->length)
3108 if (Py_UNICODE_MATCH(self, i, str1)) {
3109 /* replace string segment */
3110 Py_UNICODE_COPY(p, str2->str, str2->length);
3111 p += str2->length;
3112 i += str1->length;
3113 if (--n <= 0) {
3114 /* copy remaining part */
3115 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3116 break;
3117 }
3118 } else
3119 *p++ = self->str[i++];
3120 }
3121 }
3122 }
3123
3124 return (PyObject *) u;
3125}
3126
3127/* --- Unicode Object Methods --------------------------------------------- */
3128
3129static char title__doc__[] =
3130"S.title() -> unicode\n\
3131\n\
3132Return a titlecased version of S, i.e. words start with title case\n\
3133characters, all remaining cased characters have lower case.";
3134
3135static PyObject*
3136unicode_title(PyUnicodeObject *self, PyObject *args)
3137{
3138 if (!PyArg_NoArgs(args))
3139 return NULL;
3140 return fixup(self, fixtitle);
3141}
3142
3143static char capitalize__doc__[] =
3144"S.capitalize() -> unicode\n\
3145\n\
3146Return a capitalized version of S, i.e. make the first character\n\
3147have upper case.";
3148
3149static PyObject*
3150unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3151{
3152 if (!PyArg_NoArgs(args))
3153 return NULL;
3154 return fixup(self, fixcapitalize);
3155}
3156
#if 0
/* Disabled: S.capwords() is compiled out. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Split S on whitespace, capitalize every word via fixup()/fixcapitalize
   and join the words again with a single blank (PyUnicode_Join with a
   NULL separator).  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *list;
    PyObject *item = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            break;
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string, unless a word failed above
       (the loop then stopped early and item is NULL) */
    if (i == PyList_GET_SIZE(list))
        item = PyUnicode_Join(NULL, list);

    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
3197
3198static char center__doc__[] =
3199"S.center(width) -> unicode\n\
3200\n\
3201Return S centered in a Unicode string of length width. Padding is done\n\
3202using spaces.";
3203
3204static PyObject *
3205unicode_center(PyUnicodeObject *self, PyObject *args)
3206{
3207 int marg, left;
3208 int width;
3209
3210 if (!PyArg_ParseTuple(args, "i:center", &width))
3211 return NULL;
3212
3213 if (self->length >= width) {
3214 Py_INCREF(self);
3215 return (PyObject*) self;
3216 }
3217
3218 marg = width - self->length;
3219 left = marg / 2 + (marg & width & 1);
3220
3221 return (PyObject*) pad(self, left, marg - left, ' ');
3222}
3223
Marc-André Lemburge5034372000-08-08 08:04:29 +00003224#if 0
3225
3226/* This code should go into some future Unicode collation support
3227 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003228 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003229
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003230/* speedy UTF-16 code point order comparison */
3231/* gleaned from: */
3232/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3233
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003234static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003235{
3236 0, 0, 0, 0, 0, 0, 0, 0,
3237 0, 0, 0, 0, 0, 0, 0, 0,
3238 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003239 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003240};
3241
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242static int
3243unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3244{
3245 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003246
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 Py_UNICODE *s1 = str1->str;
3248 Py_UNICODE *s2 = str2->str;
3249
3250 len1 = str1->length;
3251 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003252
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003254 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003255 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003256
3257 c1 = *s1++;
3258 c2 = *s2++;
3259 if (c1 > (1<<11) * 26)
3260 c1 += utf16Fixup[c1>>11];
3261 if (c2 > (1<<11) * 26)
3262 c2 += utf16Fixup[c2>>11];
3263
3264 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003265 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003266 if (diff)
3267 return (diff < 0) ? -1 : (diff != 0);
3268 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 }
3270
3271 return (len1 < len2) ? -1 : (len1 != len2);
3272}
3273
Marc-André Lemburge5034372000-08-08 08:04:29 +00003274#else
3275
3276static int
3277unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3278{
3279 register int len1, len2;
3280
3281 Py_UNICODE *s1 = str1->str;
3282 Py_UNICODE *s2 = str2->str;
3283
3284 len1 = str1->length;
3285 len2 = str2->length;
3286
3287 while (len1 > 0 && len2 > 0) {
3288 register long diff;
3289
3290 diff = (long)*s1++ - (long)*s2++;
3291 if (diff)
3292 return (diff < 0) ? -1 : (diff != 0);
3293 len1--; len2--;
3294 }
3295
3296 return (len1 < len2) ? -1 : (len1 != len2);
3297}
3298
3299#endif
3300
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301int PyUnicode_Compare(PyObject *left,
3302 PyObject *right)
3303{
3304 PyUnicodeObject *u = NULL, *v = NULL;
3305 int result;
3306
3307 /* Coerce the two arguments */
3308 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3309 if (u == NULL)
3310 goto onError;
3311 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3312 if (v == NULL)
3313 goto onError;
3314
Thomas Wouters7e474022000-07-16 12:04:32 +00003315 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 if (v == u) {
3317 Py_DECREF(u);
3318 Py_DECREF(v);
3319 return 0;
3320 }
3321
3322 result = unicode_compare(u, v);
3323
3324 Py_DECREF(u);
3325 Py_DECREF(v);
3326 return result;
3327
3328onError:
3329 Py_XDECREF(u);
3330 Py_XDECREF(v);
3331 return -1;
3332}
3333
Guido van Rossum403d68b2000-03-13 15:55:09 +00003334int PyUnicode_Contains(PyObject *container,
3335 PyObject *element)
3336{
3337 PyUnicodeObject *u = NULL, *v = NULL;
3338 int result;
3339 register const Py_UNICODE *p, *e;
3340 register Py_UNICODE ch;
3341
3342 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003343 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003344 if (v == NULL) {
3345 PyErr_SetString(PyExc_TypeError,
3346 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003347 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003348 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003349 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3350 if (u == NULL) {
3351 Py_DECREF(v);
3352 goto onError;
3353 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003354
3355 /* Check v in u */
3356 if (PyUnicode_GET_SIZE(v) != 1) {
3357 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003358 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003359 goto onError;
3360 }
3361 ch = *PyUnicode_AS_UNICODE(v);
3362 p = PyUnicode_AS_UNICODE(u);
3363 e = p + PyUnicode_GET_SIZE(u);
3364 result = 0;
3365 while (p < e) {
3366 if (*p++ == ch) {
3367 result = 1;
3368 break;
3369 }
3370 }
3371
3372 Py_DECREF(u);
3373 Py_DECREF(v);
3374 return result;
3375
3376onError:
3377 Py_XDECREF(u);
3378 Py_XDECREF(v);
3379 return -1;
3380}
3381
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382/* Concat to string or Unicode object giving a new Unicode object. */
3383
3384PyObject *PyUnicode_Concat(PyObject *left,
3385 PyObject *right)
3386{
3387 PyUnicodeObject *u = NULL, *v = NULL, *w;
3388
3389 /* Coerce the two arguments */
3390 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3391 if (u == NULL)
3392 goto onError;
3393 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3394 if (v == NULL)
3395 goto onError;
3396
3397 /* Shortcuts */
3398 if (v == unicode_empty) {
3399 Py_DECREF(v);
3400 return (PyObject *)u;
3401 }
3402 if (u == unicode_empty) {
3403 Py_DECREF(u);
3404 return (PyObject *)v;
3405 }
3406
3407 /* Concat the two Unicode strings */
3408 w = _PyUnicode_New(u->length + v->length);
3409 if (w == NULL)
3410 goto onError;
3411 Py_UNICODE_COPY(w->str, u->str, u->length);
3412 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3413
3414 Py_DECREF(u);
3415 Py_DECREF(v);
3416 return (PyObject *)w;
3417
3418onError:
3419 Py_XDECREF(u);
3420 Py_XDECREF(v);
3421 return NULL;
3422}
3423
3424static char count__doc__[] =
3425"S.count(sub[, start[, end]]) -> int\n\
3426\n\
3427Return the number of occurrences of substring sub in Unicode string\n\
3428S[start:end]. Optional arguments start and end are\n\
3429interpreted as in slice notation.";
3430
3431static PyObject *
3432unicode_count(PyUnicodeObject *self, PyObject *args)
3433{
3434 PyUnicodeObject *substring;
3435 int start = 0;
3436 int end = INT_MAX;
3437 PyObject *result;
3438
Guido van Rossumb8872e62000-05-09 14:14:27 +00003439 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3440 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 return NULL;
3442
3443 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3444 (PyObject *)substring);
3445 if (substring == NULL)
3446 return NULL;
3447
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 if (start < 0)
3449 start += self->length;
3450 if (start < 0)
3451 start = 0;
3452 if (end > self->length)
3453 end = self->length;
3454 if (end < 0)
3455 end += self->length;
3456 if (end < 0)
3457 end = 0;
3458
3459 result = PyInt_FromLong((long) count(self, start, end, substring));
3460
3461 Py_DECREF(substring);
3462 return result;
3463}
3464
3465static char encode__doc__[] =
3466"S.encode([encoding[,errors]]) -> string\n\
3467\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003468Return an encoded string version of S. Default encoding is the current\n\
3469default string encoding. errors may be given to set a different error\n\
3470handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3471a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472
3473static PyObject *
3474unicode_encode(PyUnicodeObject *self, PyObject *args)
3475{
3476 char *encoding = NULL;
3477 char *errors = NULL;
3478 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3479 return NULL;
3480 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3481}
3482
3483static char expandtabs__doc__[] =
3484"S.expandtabs([tabsize]) -> unicode\n\
3485\n\
3486Return a copy of S where all tab characters are expanded using spaces.\n\
3487If tabsize is not given, a tab size of 8 characters is assumed.";
3488
3489static PyObject*
3490unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3491{
3492 Py_UNICODE *e;
3493 Py_UNICODE *p;
3494 Py_UNICODE *q;
3495 int i, j;
3496 PyUnicodeObject *u;
3497 int tabsize = 8;
3498
3499 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3500 return NULL;
3501
Thomas Wouters7e474022000-07-16 12:04:32 +00003502 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 i = j = 0;
3504 e = self->str + self->length;
3505 for (p = self->str; p < e; p++)
3506 if (*p == '\t') {
3507 if (tabsize > 0)
3508 j += tabsize - (j % tabsize);
3509 }
3510 else {
3511 j++;
3512 if (*p == '\n' || *p == '\r') {
3513 i += j;
3514 j = 0;
3515 }
3516 }
3517
3518 /* Second pass: create output string and fill it */
3519 u = _PyUnicode_New(i + j);
3520 if (!u)
3521 return NULL;
3522
3523 j = 0;
3524 q = u->str;
3525
3526 for (p = self->str; p < e; p++)
3527 if (*p == '\t') {
3528 if (tabsize > 0) {
3529 i = tabsize - (j % tabsize);
3530 j += i;
3531 while (i--)
3532 *q++ = ' ';
3533 }
3534 }
3535 else {
3536 j++;
3537 *q++ = *p;
3538 if (*p == '\n' || *p == '\r')
3539 j = 0;
3540 }
3541
3542 return (PyObject*) u;
3543}
3544
3545static char find__doc__[] =
3546"S.find(sub [,start [,end]]) -> int\n\
3547\n\
3548Return the lowest index in S where substring sub is found,\n\
3549such that sub is contained within s[start,end]. Optional\n\
3550arguments start and end are interpreted as in slice notation.\n\
3551\n\
3552Return -1 on failure.";
3553
3554static PyObject *
3555unicode_find(PyUnicodeObject *self, PyObject *args)
3556{
3557 PyUnicodeObject *substring;
3558 int start = 0;
3559 int end = INT_MAX;
3560 PyObject *result;
3561
Guido van Rossumb8872e62000-05-09 14:14:27 +00003562 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3563 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 return NULL;
3565 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3566 (PyObject *)substring);
3567 if (substring == NULL)
3568 return NULL;
3569
3570 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3571
3572 Py_DECREF(substring);
3573 return result;
3574}
3575
3576static PyObject *
3577unicode_getitem(PyUnicodeObject *self, int index)
3578{
3579 if (index < 0 || index >= self->length) {
3580 PyErr_SetString(PyExc_IndexError, "string index out of range");
3581 return NULL;
3582 }
3583
3584 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3585}
3586
3587static long
3588unicode_hash(PyUnicodeObject *self)
3589{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003590 /* Since Unicode objects compare equal to their ASCII string
3591 counterparts, they should use the individual character values
3592 as basis for their hash value. This is needed to assure that
3593 strings and Unicode objects behave in the same way as
3594 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595
Fredrik Lundhdde61642000-07-10 18:27:47 +00003596 register int len;
3597 register Py_UNICODE *p;
3598 register long x;
3599
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 if (self->hash != -1)
3601 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003602 len = PyUnicode_GET_SIZE(self);
3603 p = PyUnicode_AS_UNICODE(self);
3604 x = *p << 7;
3605 while (--len >= 0)
3606 x = (1000003*x) ^ *p++;
3607 x ^= PyUnicode_GET_SIZE(self);
3608 if (x == -1)
3609 x = -2;
3610 self->hash = x;
3611 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612}
3613
3614static char index__doc__[] =
3615"S.index(sub [,start [,end]]) -> int\n\
3616\n\
3617Like S.find() but raise ValueError when the substring is not found.";
3618
3619static PyObject *
3620unicode_index(PyUnicodeObject *self, PyObject *args)
3621{
3622 int result;
3623 PyUnicodeObject *substring;
3624 int start = 0;
3625 int end = INT_MAX;
3626
Guido van Rossumb8872e62000-05-09 14:14:27 +00003627 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3628 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 return NULL;
3630
3631 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3632 (PyObject *)substring);
3633 if (substring == NULL)
3634 return NULL;
3635
3636 result = findstring(self, substring, start, end, 1);
3637
3638 Py_DECREF(substring);
3639 if (result < 0) {
3640 PyErr_SetString(PyExc_ValueError, "substring not found");
3641 return NULL;
3642 }
3643 return PyInt_FromLong(result);
3644}
3645
3646static char islower__doc__[] =
3647"S.islower() -> int\n\
3648\n\
3649Return 1 if all cased characters in S are lowercase and there is\n\
3650at least one cased character in S, 0 otherwise.";
3651
3652static PyObject*
3653unicode_islower(PyUnicodeObject *self, PyObject *args)
3654{
3655 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3656 register const Py_UNICODE *e;
3657 int cased;
3658
3659 if (!PyArg_NoArgs(args))
3660 return NULL;
3661
3662 /* Shortcut for single character strings */
3663 if (PyUnicode_GET_SIZE(self) == 1)
3664 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3665
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003666 /* Special case for empty strings */
3667 if (PyString_GET_SIZE(self) == 0)
3668 return PyInt_FromLong(0);
3669
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 e = p + PyUnicode_GET_SIZE(self);
3671 cased = 0;
3672 for (; p < e; p++) {
3673 register const Py_UNICODE ch = *p;
3674
3675 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3676 return PyInt_FromLong(0);
3677 else if (!cased && Py_UNICODE_ISLOWER(ch))
3678 cased = 1;
3679 }
3680 return PyInt_FromLong(cased);
3681}
3682
3683static char isupper__doc__[] =
3684"S.isupper() -> int\n\
3685\n\
3686Return 1 if all cased characters in S are uppercase and there is\n\
3687at least one cased character in S, 0 otherwise.";
3688
3689static PyObject*
3690unicode_isupper(PyUnicodeObject *self, PyObject *args)
3691{
3692 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3693 register const Py_UNICODE *e;
3694 int cased;
3695
3696 if (!PyArg_NoArgs(args))
3697 return NULL;
3698
3699 /* Shortcut for single character strings */
3700 if (PyUnicode_GET_SIZE(self) == 1)
3701 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3702
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003703 /* Special case for empty strings */
3704 if (PyString_GET_SIZE(self) == 0)
3705 return PyInt_FromLong(0);
3706
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 e = p + PyUnicode_GET_SIZE(self);
3708 cased = 0;
3709 for (; p < e; p++) {
3710 register const Py_UNICODE ch = *p;
3711
3712 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3713 return PyInt_FromLong(0);
3714 else if (!cased && Py_UNICODE_ISUPPER(ch))
3715 cased = 1;
3716 }
3717 return PyInt_FromLong(cased);
3718}
3719
3720static char istitle__doc__[] =
3721"S.istitle() -> int\n\
3722\n\
3723Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3724may only follow uncased characters and lowercase characters only cased\n\
3725ones. Return 0 otherwise.";
3726
3727static PyObject*
3728unicode_istitle(PyUnicodeObject *self, PyObject *args)
3729{
3730 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3731 register const Py_UNICODE *e;
3732 int cased, previous_is_cased;
3733
3734 if (!PyArg_NoArgs(args))
3735 return NULL;
3736
3737 /* Shortcut for single character strings */
3738 if (PyUnicode_GET_SIZE(self) == 1)
3739 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3740 (Py_UNICODE_ISUPPER(*p) != 0));
3741
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003742 /* Special case for empty strings */
3743 if (PyString_GET_SIZE(self) == 0)
3744 return PyInt_FromLong(0);
3745
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 e = p + PyUnicode_GET_SIZE(self);
3747 cased = 0;
3748 previous_is_cased = 0;
3749 for (; p < e; p++) {
3750 register const Py_UNICODE ch = *p;
3751
3752 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3753 if (previous_is_cased)
3754 return PyInt_FromLong(0);
3755 previous_is_cased = 1;
3756 cased = 1;
3757 }
3758 else if (Py_UNICODE_ISLOWER(ch)) {
3759 if (!previous_is_cased)
3760 return PyInt_FromLong(0);
3761 previous_is_cased = 1;
3762 cased = 1;
3763 }
3764 else
3765 previous_is_cased = 0;
3766 }
3767 return PyInt_FromLong(cased);
3768}
3769
3770static char isspace__doc__[] =
3771"S.isspace() -> int\n\
3772\n\
3773Return 1 if there are only whitespace characters in S,\n\
37740 otherwise.";
3775
3776static PyObject*
3777unicode_isspace(PyUnicodeObject *self, PyObject *args)
3778{
3779 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3780 register const Py_UNICODE *e;
3781
3782 if (!PyArg_NoArgs(args))
3783 return NULL;
3784
3785 /* Shortcut for single character strings */
3786 if (PyUnicode_GET_SIZE(self) == 1 &&
3787 Py_UNICODE_ISSPACE(*p))
3788 return PyInt_FromLong(1);
3789
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003790 /* Special case for empty strings */
3791 if (PyString_GET_SIZE(self) == 0)
3792 return PyInt_FromLong(0);
3793
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 e = p + PyUnicode_GET_SIZE(self);
3795 for (; p < e; p++) {
3796 if (!Py_UNICODE_ISSPACE(*p))
3797 return PyInt_FromLong(0);
3798 }
3799 return PyInt_FromLong(1);
3800}
3801
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003802static char isalpha__doc__[] =
3803"S.isalpha() -> int\n\
3804\n\
3805Return 1 if all characters in S are alphabetic\n\
3806and there is at least one character in S, 0 otherwise.";
3807
3808static PyObject*
3809unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3810{
3811 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3812 register const Py_UNICODE *e;
3813
3814 if (!PyArg_NoArgs(args))
3815 return NULL;
3816
3817 /* Shortcut for single character strings */
3818 if (PyUnicode_GET_SIZE(self) == 1 &&
3819 Py_UNICODE_ISALPHA(*p))
3820 return PyInt_FromLong(1);
3821
3822 /* Special case for empty strings */
3823 if (PyString_GET_SIZE(self) == 0)
3824 return PyInt_FromLong(0);
3825
3826 e = p + PyUnicode_GET_SIZE(self);
3827 for (; p < e; p++) {
3828 if (!Py_UNICODE_ISALPHA(*p))
3829 return PyInt_FromLong(0);
3830 }
3831 return PyInt_FromLong(1);
3832}
3833
3834static char isalnum__doc__[] =
3835"S.isalnum() -> int\n\
3836\n\
3837Return 1 if all characters in S are alphanumeric\n\
3838and there is at least one character in S, 0 otherwise.";
3839
3840static PyObject*
3841unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3842{
3843 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3844 register const Py_UNICODE *e;
3845
3846 if (!PyArg_NoArgs(args))
3847 return NULL;
3848
3849 /* Shortcut for single character strings */
3850 if (PyUnicode_GET_SIZE(self) == 1 &&
3851 Py_UNICODE_ISALNUM(*p))
3852 return PyInt_FromLong(1);
3853
3854 /* Special case for empty strings */
3855 if (PyString_GET_SIZE(self) == 0)
3856 return PyInt_FromLong(0);
3857
3858 e = p + PyUnicode_GET_SIZE(self);
3859 for (; p < e; p++) {
3860 if (!Py_UNICODE_ISALNUM(*p))
3861 return PyInt_FromLong(0);
3862 }
3863 return PyInt_FromLong(1);
3864}
3865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866static char isdecimal__doc__[] =
3867"S.isdecimal() -> int\n\
3868\n\
3869Return 1 if there are only decimal characters in S,\n\
38700 otherwise.";
3871
3872static PyObject*
3873unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3874{
3875 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3876 register const Py_UNICODE *e;
3877
3878 if (!PyArg_NoArgs(args))
3879 return NULL;
3880
3881 /* Shortcut for single character strings */
3882 if (PyUnicode_GET_SIZE(self) == 1 &&
3883 Py_UNICODE_ISDECIMAL(*p))
3884 return PyInt_FromLong(1);
3885
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003886 /* Special case for empty strings */
3887 if (PyString_GET_SIZE(self) == 0)
3888 return PyInt_FromLong(0);
3889
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 e = p + PyUnicode_GET_SIZE(self);
3891 for (; p < e; p++) {
3892 if (!Py_UNICODE_ISDECIMAL(*p))
3893 return PyInt_FromLong(0);
3894 }
3895 return PyInt_FromLong(1);
3896}
3897
3898static char isdigit__doc__[] =
3899"S.isdigit() -> int\n\
3900\n\
3901Return 1 if there are only digit characters in S,\n\
39020 otherwise.";
3903
3904static PyObject*
3905unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3906{
3907 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3908 register const Py_UNICODE *e;
3909
3910 if (!PyArg_NoArgs(args))
3911 return NULL;
3912
3913 /* Shortcut for single character strings */
3914 if (PyUnicode_GET_SIZE(self) == 1 &&
3915 Py_UNICODE_ISDIGIT(*p))
3916 return PyInt_FromLong(1);
3917
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003918 /* Special case for empty strings */
3919 if (PyString_GET_SIZE(self) == 0)
3920 return PyInt_FromLong(0);
3921
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922 e = p + PyUnicode_GET_SIZE(self);
3923 for (; p < e; p++) {
3924 if (!Py_UNICODE_ISDIGIT(*p))
3925 return PyInt_FromLong(0);
3926 }
3927 return PyInt_FromLong(1);
3928}
3929
3930static char isnumeric__doc__[] =
3931"S.isnumeric() -> int\n\
3932\n\
3933Return 1 if there are only numeric characters in S,\n\
39340 otherwise.";
3935
3936static PyObject*
3937unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3938{
3939 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3940 register const Py_UNICODE *e;
3941
3942 if (!PyArg_NoArgs(args))
3943 return NULL;
3944
3945 /* Shortcut for single character strings */
3946 if (PyUnicode_GET_SIZE(self) == 1 &&
3947 Py_UNICODE_ISNUMERIC(*p))
3948 return PyInt_FromLong(1);
3949
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003950 /* Special case for empty strings */
3951 if (PyString_GET_SIZE(self) == 0)
3952 return PyInt_FromLong(0);
3953
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 e = p + PyUnicode_GET_SIZE(self);
3955 for (; p < e; p++) {
3956 if (!Py_UNICODE_ISNUMERIC(*p))
3957 return PyInt_FromLong(0);
3958 }
3959 return PyInt_FromLong(1);
3960}
3961
3962static char join__doc__[] =
3963"S.join(sequence) -> unicode\n\
3964\n\
3965Return a string which is the concatenation of the strings in the\n\
3966sequence. The separator between elements is S.";
3967
3968static PyObject*
3969unicode_join(PyUnicodeObject *self, PyObject *args)
3970{
3971 PyObject *data;
3972 if (!PyArg_ParseTuple(args, "O:join", &data))
3973 return NULL;
3974
3975 return PyUnicode_Join((PyObject *)self, data);
3976}
3977
3978static int
3979unicode_length(PyUnicodeObject *self)
3980{
3981 return self->length;
3982}
3983
3984static char ljust__doc__[] =
3985"S.ljust(width) -> unicode\n\
3986\n\
3987Return S left justified in a Unicode string of length width. Padding is\n\
3988done using spaces.";
3989
3990static PyObject *
3991unicode_ljust(PyUnicodeObject *self, PyObject *args)
3992{
3993 int width;
3994 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3995 return NULL;
3996
3997 if (self->length >= width) {
3998 Py_INCREF(self);
3999 return (PyObject*) self;
4000 }
4001
4002 return (PyObject*) pad(self, 0, width - self->length, ' ');
4003}
4004
4005static char lower__doc__[] =
4006"S.lower() -> unicode\n\
4007\n\
4008Return a copy of the string S converted to lowercase.";
4009
4010static PyObject*
4011unicode_lower(PyUnicodeObject *self, PyObject *args)
4012{
4013 if (!PyArg_NoArgs(args))
4014 return NULL;
4015 return fixup(self, fixlower);
4016}
4017
4018static char lstrip__doc__[] =
4019"S.lstrip() -> unicode\n\
4020\n\
4021Return a copy of the string S with leading whitespace removed.";
4022
4023static PyObject *
4024unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4025{
4026 if (!PyArg_NoArgs(args))
4027 return NULL;
4028 return strip(self, 1, 0);
4029}
4030
4031static PyObject*
4032unicode_repeat(PyUnicodeObject *str, int len)
4033{
4034 PyUnicodeObject *u;
4035 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004036 int nchars;
4037 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038
4039 if (len < 0)
4040 len = 0;
4041
4042 if (len == 1) {
4043 /* no repeat, return original string */
4044 Py_INCREF(str);
4045 return (PyObject*) str;
4046 }
Tim Peters8f422462000-09-09 06:13:41 +00004047
4048 /* ensure # of chars needed doesn't overflow int and # of bytes
4049 * needed doesn't overflow size_t
4050 */
4051 nchars = len * str->length;
4052 if (len && nchars / len != str->length) {
4053 PyErr_SetString(PyExc_OverflowError,
4054 "repeated string is too long");
4055 return NULL;
4056 }
4057 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4058 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4059 PyErr_SetString(PyExc_OverflowError,
4060 "repeated string is too long");
4061 return NULL;
4062 }
4063 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 if (!u)
4065 return NULL;
4066
4067 p = u->str;
4068
4069 while (len-- > 0) {
4070 Py_UNICODE_COPY(p, str->str, str->length);
4071 p += str->length;
4072 }
4073
4074 return (PyObject*) u;
4075}
4076
4077PyObject *PyUnicode_Replace(PyObject *obj,
4078 PyObject *subobj,
4079 PyObject *replobj,
4080 int maxcount)
4081{
4082 PyObject *self;
4083 PyObject *str1;
4084 PyObject *str2;
4085 PyObject *result;
4086
4087 self = PyUnicode_FromObject(obj);
4088 if (self == NULL)
4089 return NULL;
4090 str1 = PyUnicode_FromObject(subobj);
4091 if (str1 == NULL) {
4092 Py_DECREF(self);
4093 return NULL;
4094 }
4095 str2 = PyUnicode_FromObject(replobj);
4096 if (str2 == NULL) {
4097 Py_DECREF(self);
4098 Py_DECREF(str1);
4099 return NULL;
4100 }
4101 result = replace((PyUnicodeObject *)self,
4102 (PyUnicodeObject *)str1,
4103 (PyUnicodeObject *)str2,
4104 maxcount);
4105 Py_DECREF(self);
4106 Py_DECREF(str1);
4107 Py_DECREF(str2);
4108 return result;
4109}
4110
4111static char replace__doc__[] =
4112"S.replace (old, new[, maxsplit]) -> unicode\n\
4113\n\
4114Return a copy of S with all occurrences of substring\n\
4115old replaced by new. If the optional argument maxsplit is\n\
4116given, only the first maxsplit occurrences are replaced.";
4117
4118static PyObject*
4119unicode_replace(PyUnicodeObject *self, PyObject *args)
4120{
4121 PyUnicodeObject *str1;
4122 PyUnicodeObject *str2;
4123 int maxcount = -1;
4124 PyObject *result;
4125
4126 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4127 return NULL;
4128 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4129 if (str1 == NULL)
4130 return NULL;
4131 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4132 if (str2 == NULL)
4133 return NULL;
4134
4135 result = replace(self, str1, str2, maxcount);
4136
4137 Py_DECREF(str1);
4138 Py_DECREF(str2);
4139 return result;
4140}
4141
4142static
4143PyObject *unicode_repr(PyObject *unicode)
4144{
4145 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4146 PyUnicode_GET_SIZE(unicode),
4147 1);
4148}
4149
4150static char rfind__doc__[] =
4151"S.rfind(sub [,start [,end]]) -> int\n\
4152\n\
4153Return the highest index in S where substring sub is found,\n\
4154such that sub is contained within s[start,end]. Optional\n\
4155arguments start and end are interpreted as in slice notation.\n\
4156\n\
4157Return -1 on failure.";
4158
4159static PyObject *
4160unicode_rfind(PyUnicodeObject *self, PyObject *args)
4161{
4162 PyUnicodeObject *substring;
4163 int start = 0;
4164 int end = INT_MAX;
4165 PyObject *result;
4166
Guido van Rossumb8872e62000-05-09 14:14:27 +00004167 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4168 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 return NULL;
4170 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4171 (PyObject *)substring);
4172 if (substring == NULL)
4173 return NULL;
4174
4175 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4176
4177 Py_DECREF(substring);
4178 return result;
4179}
4180
4181static char rindex__doc__[] =
4182"S.rindex(sub [,start [,end]]) -> int\n\
4183\n\
4184Like S.rfind() but raise ValueError when the substring is not found.";
4185
4186static PyObject *
4187unicode_rindex(PyUnicodeObject *self, PyObject *args)
4188{
4189 int result;
4190 PyUnicodeObject *substring;
4191 int start = 0;
4192 int end = INT_MAX;
4193
Guido van Rossumb8872e62000-05-09 14:14:27 +00004194 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4195 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 return NULL;
4197 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4198 (PyObject *)substring);
4199 if (substring == NULL)
4200 return NULL;
4201
4202 result = findstring(self, substring, start, end, -1);
4203
4204 Py_DECREF(substring);
4205 if (result < 0) {
4206 PyErr_SetString(PyExc_ValueError, "substring not found");
4207 return NULL;
4208 }
4209 return PyInt_FromLong(result);
4210}
4211
4212static char rjust__doc__[] =
4213"S.rjust(width) -> unicode\n\
4214\n\
4215Return S right justified in a Unicode string of length width. Padding is\n\
4216done using spaces.";
4217
4218static PyObject *
4219unicode_rjust(PyUnicodeObject *self, PyObject *args)
4220{
4221 int width;
4222 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4223 return NULL;
4224
4225 if (self->length >= width) {
4226 Py_INCREF(self);
4227 return (PyObject*) self;
4228 }
4229
4230 return (PyObject*) pad(self, width - self->length, 0, ' ');
4231}
4232
4233static char rstrip__doc__[] =
4234"S.rstrip() -> unicode\n\
4235\n\
4236Return a copy of the string S with trailing whitespace removed.";
4237
4238static PyObject *
4239unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4240{
4241 if (!PyArg_NoArgs(args))
4242 return NULL;
4243 return strip(self, 0, 1);
4244}
4245
4246static PyObject*
4247unicode_slice(PyUnicodeObject *self, int start, int end)
4248{
4249 /* standard clamping */
4250 if (start < 0)
4251 start = 0;
4252 if (end < 0)
4253 end = 0;
4254 if (end > self->length)
4255 end = self->length;
4256 if (start == 0 && end == self->length) {
4257 /* full slice, return original string */
4258 Py_INCREF(self);
4259 return (PyObject*) self;
4260 }
4261 if (start > end)
4262 start = end;
4263 /* copy slice */
4264 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4265 end - start);
4266}
4267
4268PyObject *PyUnicode_Split(PyObject *s,
4269 PyObject *sep,
4270 int maxsplit)
4271{
4272 PyObject *result;
4273
4274 s = PyUnicode_FromObject(s);
4275 if (s == NULL)
4276 return NULL;
4277 if (sep != NULL) {
4278 sep = PyUnicode_FromObject(sep);
4279 if (sep == NULL) {
4280 Py_DECREF(s);
4281 return NULL;
4282 }
4283 }
4284
4285 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4286
4287 Py_DECREF(s);
4288 Py_XDECREF(sep);
4289 return result;
4290}
4291
4292static char split__doc__[] =
4293"S.split([sep [,maxsplit]]) -> list of strings\n\
4294\n\
4295Return a list of the words in S, using sep as the\n\
4296delimiter string. If maxsplit is given, at most maxsplit\n\
4297splits are done. If sep is not specified, any whitespace string\n\
4298is a separator.";
4299
4300static PyObject*
4301unicode_split(PyUnicodeObject *self, PyObject *args)
4302{
4303 PyObject *substring = Py_None;
4304 int maxcount = -1;
4305
4306 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4307 return NULL;
4308
4309 if (substring == Py_None)
4310 return split(self, NULL, maxcount);
4311 else if (PyUnicode_Check(substring))
4312 return split(self, (PyUnicodeObject *)substring, maxcount);
4313 else
4314 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4315}
4316
4317static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004318"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319\n\
4320Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004321Line breaks are not included in the resulting list unless keepends\n\
4322is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
4324static PyObject*
4325unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4326{
Guido van Rossum86662912000-04-11 15:38:46 +00004327 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
Guido van Rossum86662912000-04-11 15:38:46 +00004329 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 return NULL;
4331
Guido van Rossum86662912000-04-11 15:38:46 +00004332 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333}
4334
4335static
4336PyObject *unicode_str(PyUnicodeObject *self)
4337{
Fred Drakee4315f52000-05-09 19:53:39 +00004338 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339}
4340
4341static char strip__doc__[] =
4342"S.strip() -> unicode\n\
4343\n\
4344Return a copy of S with leading and trailing whitespace removed.";
4345
4346static PyObject *
4347unicode_strip(PyUnicodeObject *self, PyObject *args)
4348{
4349 if (!PyArg_NoArgs(args))
4350 return NULL;
4351 return strip(self, 1, 1);
4352}
4353
4354static char swapcase__doc__[] =
4355"S.swapcase() -> unicode\n\
4356\n\
4357Return a copy of S with uppercase characters converted to lowercase\n\
4358and vice versa.";
4359
4360static PyObject*
4361unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4362{
4363 if (!PyArg_NoArgs(args))
4364 return NULL;
4365 return fixup(self, fixswapcase);
4366}
4367
4368static char translate__doc__[] =
4369"S.translate(table) -> unicode\n\
4370\n\
4371Return a copy of the string S, where all characters have been mapped\n\
4372through the given translation table, which must be a mapping of\n\
4373Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4374are left untouched. Characters mapped to None are deleted.";
4375
4376static PyObject*
4377unicode_translate(PyUnicodeObject *self, PyObject *args)
4378{
4379 PyObject *table;
4380
4381 if (!PyArg_ParseTuple(args, "O:translate", &table))
4382 return NULL;
4383 return PyUnicode_TranslateCharmap(self->str,
4384 self->length,
4385 table,
4386 "ignore");
4387}
4388
4389static char upper__doc__[] =
4390"S.upper() -> unicode\n\
4391\n\
4392Return a copy of S converted to uppercase.";
4393
4394static PyObject*
4395unicode_upper(PyUnicodeObject *self, PyObject *args)
4396{
4397 if (!PyArg_NoArgs(args))
4398 return NULL;
4399 return fixup(self, fixupper);
4400}
4401
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string S with zeros on the left, to fill a field\n\
of the specified width. The string S is never truncated.";

/* S.zfill(width) method entry point (currently compiled out).

   Returns a new reference to self when it is already at least `width`
   characters long.  Otherwise left-pads with '0' via pad() and, if the
   original string started with a sign, moves that sign in front of the
   padding.  Returns NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; do not touch u->str */
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4437
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as a Python int.  Accepts no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
4447
4448static char startswith__doc__[] =
4449"S.startswith(prefix[, start[, end]]) -> int\n\
4450\n\
4451Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4452optional start, test S beginning at that position. With optional end, stop\n\
4453comparing S at that position.";
4454
4455static PyObject *
4456unicode_startswith(PyUnicodeObject *self,
4457 PyObject *args)
4458{
4459 PyUnicodeObject *substring;
4460 int start = 0;
4461 int end = INT_MAX;
4462 PyObject *result;
4463
Guido van Rossumb8872e62000-05-09 14:14:27 +00004464 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4465 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 return NULL;
4467 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4468 (PyObject *)substring);
4469 if (substring == NULL)
4470 return NULL;
4471
4472 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4473
4474 Py_DECREF(substring);
4475 return result;
4476}
4477
4478
4479static char endswith__doc__[] =
4480"S.endswith(suffix[, start[, end]]) -> int\n\
4481\n\
4482Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4483optional start, test S beginning at that position. With optional end, stop\n\
4484comparing S at that position.";
4485
4486static PyObject *
4487unicode_endswith(PyUnicodeObject *self,
4488 PyObject *args)
4489{
4490 PyUnicodeObject *substring;
4491 int start = 0;
4492 int end = INT_MAX;
4493 PyObject *result;
4494
Guido van Rossumb8872e62000-05-09 14:14:27 +00004495 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4496 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 return NULL;
4498 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4499 (PyObject *)substring);
4500 if (substring == NULL)
4501 return NULL;
4502
4503 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4504
4505 Py_DECREF(substring);
4506 return result;
4507}
4508
4509
4510static PyMethodDef unicode_methods[] = {
4511
4512 /* Order is according to common usage: often used methods should
4513 appear first, since lookup is done sequentially. */
4514
4515 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4516 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4517 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4518 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4519 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4520 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4521 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4522 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4523 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4524 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4525 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4526 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4527 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4528 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4529/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4530 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4531 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4532 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4533 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4534 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4535 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4536 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4537 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4538 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4539 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4540 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4541 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4542 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4543 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4544 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4545 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4546 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4547 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004548 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4549 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550#if 0
4551 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4552 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4553#endif
4554
4555#if 0
4556 /* This one is just used for debugging the implementation. */
4557 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4558#endif
4559
4560 {NULL, NULL}
4561};
4562
4563static PyObject *
4564unicode_getattr(PyUnicodeObject *self, char *name)
4565{
4566 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4567}
4568
4569static PySequenceMethods unicode_as_sequence = {
4570 (inquiry) unicode_length, /* sq_length */
4571 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4572 (intargfunc) unicode_repeat, /* sq_repeat */
4573 (intargfunc) unicode_getitem, /* sq_item */
4574 (intintargfunc) unicode_slice, /* sq_slice */
4575 0, /* sq_ass_item */
4576 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004577 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578};
4579
4580static int
4581unicode_buffer_getreadbuf(PyUnicodeObject *self,
4582 int index,
4583 const void **ptr)
4584{
4585 if (index != 0) {
4586 PyErr_SetString(PyExc_SystemError,
4587 "accessing non-existent unicode segment");
4588 return -1;
4589 }
4590 *ptr = (void *) self->str;
4591 return PyUnicode_GET_DATA_SIZE(self);
4592}
4593
4594static int
4595unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4596 const void **ptr)
4597{
4598 PyErr_SetString(PyExc_TypeError,
4599 "cannot use unicode as modifyable buffer");
4600 return -1;
4601}
4602
4603static int
4604unicode_buffer_getsegcount(PyUnicodeObject *self,
4605 int *lenp)
4606{
4607 if (lenp)
4608 *lenp = PyUnicode_GET_DATA_SIZE(self);
4609 return 1;
4610}
4611
4612static int
4613unicode_buffer_getcharbuf(PyUnicodeObject *self,
4614 int index,
4615 const void **ptr)
4616{
4617 PyObject *str;
4618
4619 if (index != 0) {
4620 PyErr_SetString(PyExc_SystemError,
4621 "accessing non-existent unicode segment");
4622 return -1;
4623 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004624 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625 if (str == NULL)
4626 return -1;
4627 *ptr = (void *) PyString_AS_STRING(str);
4628 return PyString_GET_SIZE(str);
4629}
4630
4631/* Helpers for PyUnicode_Format() */
4632
4633static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004634getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635{
4636 int argidx = *p_argidx;
4637 if (argidx < arglen) {
4638 (*p_argidx)++;
4639 if (arglen < 0)
4640 return args;
4641 else
4642 return PyTuple_GetItem(args, argidx);
4643 }
4644 PyErr_SetString(PyExc_TypeError,
4645 "not enough arguments for format string");
4646 return NULL;
4647}
4648
/* Bit flags collected while parsing a format specifier:
   '-' -> F_LJUST, '+' -> F_SIGN, ' ' -> F_BLANK, '#' -> F_ALT,
   '0' -> F_ZERO. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4654
4655static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657{
4658 register int i;
4659 int len;
4660 va_list va;
4661 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663
4664 /* First, format the string as char array, then expand to Py_UNICODE
4665 array. */
4666 charbuffer = (char *)buffer;
4667 len = vsprintf(charbuffer, format, va);
4668 for (i = len - 1; i >= 0; i--)
4669 buffer[i] = (Py_UNICODE) charbuffer[i];
4670
4671 va_end(va);
4672 return len;
4673}
4674
4675static int
4676formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004677 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 int flags,
4679 int prec,
4680 int type,
4681 PyObject *v)
4682{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004683 /* fmt = '%#.' + `prec` + `type`
4684 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 char fmt[20];
4686 double x;
4687
4688 x = PyFloat_AsDouble(v);
4689 if (x == -1.0 && PyErr_Occurred())
4690 return -1;
4691 if (prec < 0)
4692 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4694 type = 'g';
4695 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004696 /* worst case length calc to ensure no buffer overrun:
4697 fmt = %#.<prec>g
4698 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4699 for any double rep.)
4700 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4701 If prec=0 the effective precision is 1 (the leading digit is
4702 always given), therefore increase by one to 10+prec. */
4703 if (buflen <= (size_t)10 + (size_t)prec) {
4704 PyErr_SetString(PyExc_OverflowError,
4705 "formatted float is too long (precision too long?)");
4706 return -1;
4707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 return usprintf(buf, fmt, x);
4709}
4710
Tim Peters38fd5b62000-09-21 05:43:11 +00004711static PyObject*
4712formatlong(PyObject *val, int flags, int prec, int type)
4713{
4714 char *buf;
4715 int i, len;
4716 PyObject *str; /* temporary string object. */
4717 PyUnicodeObject *result;
4718
4719 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4720 if (!str)
4721 return NULL;
4722 result = _PyUnicode_New(len);
4723 for (i = 0; i < len; i++)
4724 result->str[i] = buf[i];
4725 result->str[len] = 0;
4726 Py_DECREF(str);
4727 return (PyObject*)result;
4728}
4729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730static int
4731formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004732 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 int flags,
4734 int prec,
4735 int type,
4736 PyObject *v)
4737{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004738 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004739 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4740 + 1 + 1 = 24*/
4741 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 long x;
4743
4744 x = PyInt_AsLong(v);
4745 if (x == -1 && PyErr_Occurred())
4746 return -1;
4747 if (prec < 0)
4748 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004749 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4750 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4751 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4752 PyErr_SetString(PyExc_OverflowError,
4753 "formatted integer is too long (precision too long?)");
4754 return -1;
4755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4757 return usprintf(buf, fmt, x);
4758}
4759
4760static int
4761formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004762 size_t buflen,
4763 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004765 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004766 if (PyUnicode_Check(v)) {
4767 if (PyUnicode_GET_SIZE(v) != 1)
4768 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004772 else if (PyString_Check(v)) {
4773 if (PyString_GET_SIZE(v) != 1)
4774 goto onError;
4775 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777
4778 else {
4779 /* Integer input truncated to a character */
4780 long x;
4781 x = PyInt_AsLong(v);
4782 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004783 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 buf[0] = (char) x;
4785 }
4786 buf[1] = '\0';
4787 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004788
4789 onError:
4790 PyErr_SetString(PyExc_TypeError,
4791 "%c requires int or char");
4792 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
4804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805PyObject *PyUnicode_Format(PyObject *format,
4806 PyObject *args)
4807{
4808 Py_UNICODE *fmt, *res;
4809 int fmtcnt, rescnt, reslen, arglen, argidx;
4810 int args_owned = 0;
4811 PyUnicodeObject *result = NULL;
4812 PyObject *dict = NULL;
4813 PyObject *uformat;
4814
4815 if (format == NULL || args == NULL) {
4816 PyErr_BadInternalCall();
4817 return NULL;
4818 }
4819 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004820 if (uformat == NULL)
4821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 fmt = PyUnicode_AS_UNICODE(uformat);
4823 fmtcnt = PyUnicode_GET_SIZE(uformat);
4824
4825 reslen = rescnt = fmtcnt + 100;
4826 result = _PyUnicode_New(reslen);
4827 if (result == NULL)
4828 goto onError;
4829 res = PyUnicode_AS_UNICODE(result);
4830
4831 if (PyTuple_Check(args)) {
4832 arglen = PyTuple_Size(args);
4833 argidx = 0;
4834 }
4835 else {
4836 arglen = -1;
4837 argidx = -2;
4838 }
4839 if (args->ob_type->tp_as_mapping)
4840 dict = args;
4841
4842 while (--fmtcnt >= 0) {
4843 if (*fmt != '%') {
4844 if (--rescnt < 0) {
4845 rescnt = fmtcnt + 100;
4846 reslen += rescnt;
4847 if (_PyUnicode_Resize(result, reslen) < 0)
4848 return NULL;
4849 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4850 --rescnt;
4851 }
4852 *res++ = *fmt++;
4853 }
4854 else {
4855 /* Got a format specifier */
4856 int flags = 0;
4857 int width = -1;
4858 int prec = -1;
4859 int size = 0;
4860 Py_UNICODE c = '\0';
4861 Py_UNICODE fill;
4862 PyObject *v = NULL;
4863 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004864 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 Py_UNICODE sign;
4866 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004867 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868
4869 fmt++;
4870 if (*fmt == '(') {
4871 Py_UNICODE *keystart;
4872 int keylen;
4873 PyObject *key;
4874 int pcount = 1;
4875
4876 if (dict == NULL) {
4877 PyErr_SetString(PyExc_TypeError,
4878 "format requires a mapping");
4879 goto onError;
4880 }
4881 ++fmt;
4882 --fmtcnt;
4883 keystart = fmt;
4884 /* Skip over balanced parentheses */
4885 while (pcount > 0 && --fmtcnt >= 0) {
4886 if (*fmt == ')')
4887 --pcount;
4888 else if (*fmt == '(')
4889 ++pcount;
4890 fmt++;
4891 }
4892 keylen = fmt - keystart - 1;
4893 if (fmtcnt < 0 || pcount > 0) {
4894 PyErr_SetString(PyExc_ValueError,
4895 "incomplete format key");
4896 goto onError;
4897 }
Fred Drakee4315f52000-05-09 19:53:39 +00004898 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 then looked up since Python uses strings to hold
4900 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004901 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 key = PyUnicode_EncodeUTF8(keystart,
4903 keylen,
4904 NULL);
4905 if (key == NULL)
4906 goto onError;
4907 if (args_owned) {
4908 Py_DECREF(args);
4909 args_owned = 0;
4910 }
4911 args = PyObject_GetItem(dict, key);
4912 Py_DECREF(key);
4913 if (args == NULL) {
4914 goto onError;
4915 }
4916 args_owned = 1;
4917 arglen = -1;
4918 argidx = -2;
4919 }
4920 while (--fmtcnt >= 0) {
4921 switch (c = *fmt++) {
4922 case '-': flags |= F_LJUST; continue;
4923 case '+': flags |= F_SIGN; continue;
4924 case ' ': flags |= F_BLANK; continue;
4925 case '#': flags |= F_ALT; continue;
4926 case '0': flags |= F_ZERO; continue;
4927 }
4928 break;
4929 }
4930 if (c == '*') {
4931 v = getnextarg(args, arglen, &argidx);
4932 if (v == NULL)
4933 goto onError;
4934 if (!PyInt_Check(v)) {
4935 PyErr_SetString(PyExc_TypeError,
4936 "* wants int");
4937 goto onError;
4938 }
4939 width = PyInt_AsLong(v);
4940 if (width < 0) {
4941 flags |= F_LJUST;
4942 width = -width;
4943 }
4944 if (--fmtcnt >= 0)
4945 c = *fmt++;
4946 }
4947 else if (c >= '0' && c <= '9') {
4948 width = c - '0';
4949 while (--fmtcnt >= 0) {
4950 c = *fmt++;
4951 if (c < '0' || c > '9')
4952 break;
4953 if ((width*10) / 10 != width) {
4954 PyErr_SetString(PyExc_ValueError,
4955 "width too big");
4956 goto onError;
4957 }
4958 width = width*10 + (c - '0');
4959 }
4960 }
4961 if (c == '.') {
4962 prec = 0;
4963 if (--fmtcnt >= 0)
4964 c = *fmt++;
4965 if (c == '*') {
4966 v = getnextarg(args, arglen, &argidx);
4967 if (v == NULL)
4968 goto onError;
4969 if (!PyInt_Check(v)) {
4970 PyErr_SetString(PyExc_TypeError,
4971 "* wants int");
4972 goto onError;
4973 }
4974 prec = PyInt_AsLong(v);
4975 if (prec < 0)
4976 prec = 0;
4977 if (--fmtcnt >= 0)
4978 c = *fmt++;
4979 }
4980 else if (c >= '0' && c <= '9') {
4981 prec = c - '0';
4982 while (--fmtcnt >= 0) {
4983 c = Py_CHARMASK(*fmt++);
4984 if (c < '0' || c > '9')
4985 break;
4986 if ((prec*10) / 10 != prec) {
4987 PyErr_SetString(PyExc_ValueError,
4988 "prec too big");
4989 goto onError;
4990 }
4991 prec = prec*10 + (c - '0');
4992 }
4993 }
4994 } /* prec */
4995 if (fmtcnt >= 0) {
4996 if (c == 'h' || c == 'l' || c == 'L') {
4997 size = c;
4998 if (--fmtcnt >= 0)
4999 c = *fmt++;
5000 }
5001 }
5002 if (fmtcnt < 0) {
5003 PyErr_SetString(PyExc_ValueError,
5004 "incomplete format");
5005 goto onError;
5006 }
5007 if (c != '%') {
5008 v = getnextarg(args, arglen, &argidx);
5009 if (v == NULL)
5010 goto onError;
5011 }
5012 sign = 0;
5013 fill = ' ';
5014 switch (c) {
5015
5016 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005017 pbuf = formatbuf;
5018 /* presume that buffer length is at least 1 */
5019 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 len = 1;
5021 break;
5022
5023 case 's':
5024 case 'r':
5025 if (PyUnicode_Check(v) && c == 's') {
5026 temp = v;
5027 Py_INCREF(temp);
5028 }
5029 else {
5030 PyObject *unicode;
5031 if (c == 's')
5032 temp = PyObject_Str(v);
5033 else
5034 temp = PyObject_Repr(v);
5035 if (temp == NULL)
5036 goto onError;
5037 if (!PyString_Check(temp)) {
5038 /* XXX Note: this should never happen, since
5039 PyObject_Repr() and PyObject_Str() assure
5040 this */
5041 Py_DECREF(temp);
5042 PyErr_SetString(PyExc_TypeError,
5043 "%s argument has non-string str()");
5044 goto onError;
5045 }
Fred Drakee4315f52000-05-09 19:53:39 +00005046 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005048 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 "strict");
5050 Py_DECREF(temp);
5051 temp = unicode;
5052 if (temp == NULL)
5053 goto onError;
5054 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005055 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056 len = PyUnicode_GET_SIZE(temp);
5057 if (prec >= 0 && len > prec)
5058 len = prec;
5059 break;
5060
5061 case 'i':
5062 case 'd':
5063 case 'u':
5064 case 'o':
5065 case 'x':
5066 case 'X':
5067 if (c == 'i')
5068 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005069 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005070 temp = formatlong(v, flags, prec, c);
5071 if (!temp)
5072 goto onError;
5073 pbuf = PyUnicode_AS_UNICODE(temp);
5074 len = PyUnicode_GET_SIZE(temp);
5075 /* unbounded ints can always produce
5076 a sign character! */
5077 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005079 else {
5080 pbuf = formatbuf;
5081 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5082 flags, prec, c, v);
5083 if (len < 0)
5084 goto onError;
5085 /* only d conversion is signed */
5086 sign = c == 'd';
5087 }
5088 if (flags & F_ZERO)
5089 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 break;
5091
5092 case 'e':
5093 case 'E':
5094 case 'f':
5095 case 'g':
5096 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005097 pbuf = formatbuf;
5098 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5099 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 if (len < 0)
5101 goto onError;
5102 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005103 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 fill = '0';
5105 break;
5106
5107 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005108 pbuf = formatbuf;
5109 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 if (len < 0)
5111 goto onError;
5112 break;
5113
5114 default:
5115 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005116 "unsupported format character '%c' (0x%x) "
5117 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005118 (31<=c && c<=126) ? c : '?',
5119 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 goto onError;
5121 }
5122 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005123 if (*pbuf == '-' || *pbuf == '+') {
5124 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 len--;
5126 }
5127 else if (flags & F_SIGN)
5128 sign = '+';
5129 else if (flags & F_BLANK)
5130 sign = ' ';
5131 else
5132 sign = 0;
5133 }
5134 if (width < len)
5135 width = len;
5136 if (rescnt < width + (sign != 0)) {
5137 reslen -= rescnt;
5138 rescnt = width + fmtcnt + 100;
5139 reslen += rescnt;
5140 if (_PyUnicode_Resize(result, reslen) < 0)
5141 return NULL;
5142 res = PyUnicode_AS_UNICODE(result)
5143 + reslen - rescnt;
5144 }
5145 if (sign) {
5146 if (fill != ' ')
5147 *res++ = sign;
5148 rescnt--;
5149 if (width > len)
5150 width--;
5151 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005152 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5153 assert(pbuf[0] == '0');
5154 assert(pbuf[1] == c);
5155 if (fill != ' ') {
5156 *res++ = *pbuf++;
5157 *res++ = *pbuf++;
5158 }
5159 rescnt -= 2;
5160 width -= 2;
5161 if (width < 0)
5162 width = 0;
5163 len -= 2;
5164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 if (width > len && !(flags & F_LJUST)) {
5166 do {
5167 --rescnt;
5168 *res++ = fill;
5169 } while (--width > len);
5170 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005171 if (fill == ' ') {
5172 if (sign)
5173 *res++ = sign;
5174 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5175 assert(pbuf[0] == '0');
5176 assert(pbuf[1] == c);
5177 *res++ = *pbuf++;
5178 *res++ = *pbuf++;
5179 }
5180 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005181 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 res += len;
5183 rescnt -= len;
5184 while (--width >= len) {
5185 --rescnt;
5186 *res++ = ' ';
5187 }
5188 if (dict && (argidx < arglen) && c != '%') {
5189 PyErr_SetString(PyExc_TypeError,
5190 "not all arguments converted");
5191 goto onError;
5192 }
5193 Py_XDECREF(temp);
5194 } /* '%' */
5195 } /* until end */
5196 if (argidx < arglen && !dict) {
5197 PyErr_SetString(PyExc_TypeError,
5198 "not all arguments converted");
5199 goto onError;
5200 }
5201
5202 if (args_owned) {
5203 Py_DECREF(args);
5204 }
5205 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005206 if (_PyUnicode_Resize(result, reslen - rescnt))
5207 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 return (PyObject *)result;
5209
5210 onError:
5211 Py_XDECREF(result);
5212 Py_DECREF(uformat);
5213 if (args_owned) {
5214 Py_DECREF(args);
5215 }
5216 return NULL;
5217}
5218
5219static PyBufferProcs unicode_as_buffer = {
5220 (getreadbufferproc) unicode_buffer_getreadbuf,
5221 (getwritebufferproc) unicode_buffer_getwritebuf,
5222 (getsegcountproc) unicode_buffer_getsegcount,
5223 (getcharbufferproc) unicode_buffer_getcharbuf,
5224};
5225
5226PyTypeObject PyUnicode_Type = {
5227 PyObject_HEAD_INIT(&PyType_Type)
5228 0, /* ob_size */
5229 "unicode", /* tp_name */
5230 sizeof(PyUnicodeObject), /* tp_size */
5231 0, /* tp_itemsize */
5232 /* Slots */
5233 (destructor)_PyUnicode_Free, /* tp_dealloc */
5234 0, /* tp_print */
5235 (getattrfunc)unicode_getattr, /* tp_getattr */
5236 0, /* tp_setattr */
5237 (cmpfunc) unicode_compare, /* tp_compare */
5238 (reprfunc) unicode_repr, /* tp_repr */
5239 0, /* tp_as_number */
5240 &unicode_as_sequence, /* tp_as_sequence */
5241 0, /* tp_as_mapping */
5242 (hashfunc) unicode_hash, /* tp_hash*/
5243 0, /* tp_call*/
5244 (reprfunc) unicode_str, /* tp_str */
5245 (getattrofunc) NULL, /* tp_getattro */
5246 (setattrofunc) NULL, /* tp_setattro */
5247 &unicode_as_buffer, /* tp_as_buffer */
5248 Py_TPFLAGS_DEFAULT, /* tp_flags */
5249};
5250
5251/* Initialize the Unicode implementation */
5252
Thomas Wouters78890102000-07-22 19:25:51 +00005253void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254{
5255 /* Doublecheck the configuration... */
5256 if (sizeof(Py_UNICODE) != 2)
5257 Py_FatalError("Unicode configuration error: "
5258 "sizeof(Py_UNICODE) != 2 bytes");
5259
Fred Drakee4315f52000-05-09 19:53:39 +00005260 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005261 unicode_freelist = NULL;
5262 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005264 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265}
5266
5267/* Finalize the Unicode implementation */
5268
5269void
Thomas Wouters78890102000-07-22 19:25:51 +00005270_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005272 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005274 Py_XDECREF(unicode_empty);
5275 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005276
5277 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 PyUnicodeObject *v = u;
5279 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005280 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005281 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005282 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005283 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005285 unicode_freelist = NULL;
5286 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287}