blob: 8f7b354c243bc8b6226da05f1bd9f435976729d1 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000070#ifdef MS_WIN32
71#include <windows.h>
72#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073
Guido van Rossumd57fd912000-03-10 22:53:23 +000074/* Limit for the Unicode object free list */
75
76#define MAX_UNICODE_FREELIST_SIZE 1024
77
78/* Limit for the Unicode object free list stay alive optimization.
79
80 The implementation will keep allocated Unicode memory intact for
81 all objects on the free list having a size less than this
82 limit. This reduces malloc() overhead for small Unicode objects.
83
Barry Warsaw51ac5802000-03-20 16:36:48 +000084 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000085 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000086 malloc()-overhead) bytes of unused garbage.
87
88 Setting the limit to 0 effectively turns the feature off.
89
Guido van Rossumfd4b9572000-04-10 13:51:10 +000090 Note: This is an experimental feature ! If you get core dumps when
91 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
93*/
94
Guido van Rossumfd4b9572000-04-10 13:51:10 +000095#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000096
97/* Endianness switches; defaults to little endian */
98
99#ifdef WORDS_BIGENDIAN
100# define BYTEORDER_IS_BIG_ENDIAN
101#else
102# define BYTEORDER_IS_LITTLE_ENDIAN
103#endif
104
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105/* --- Globals ------------------------------------------------------------
106
107 The globals are initialized by the _PyUnicode_Init() API and should
108 not be used before calling that API.
109
110*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
112/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000113static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000116static PyUnicodeObject *unicode_freelist;
117static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125*/
126
127static char unicode_default_encoding[100];
128
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129/* --- Unicode Object ----------------------------------------------------- */
130
131static
132int _PyUnicode_Resize(register PyUnicodeObject *unicode,
133 int length)
134{
135 void *oldstr;
136
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000137 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000138 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000139 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000140
141 /* Resizing unicode_empty is not allowed. */
142 if (unicode == unicode_empty) {
143 PyErr_SetString(PyExc_SystemError,
144 "can't resize empty unicode object");
145 return -1;
146 }
147
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
153 unicode->str = oldstr;
154 PyErr_NoMemory();
155 return -1;
156 }
157 unicode->str[length] = 0;
158 unicode->length = length;
159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
167
168 return 0;
169}
170
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000171int PyUnicode_Resize(PyObject **unicode,
172 int length)
173{
174 PyUnicodeObject *v;
175
176 if (unicode == NULL) {
177 PyErr_BadInternalCall();
178 return -1;
179 }
180 v = (PyUnicodeObject *)*unicode;
181 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
182 PyErr_BadInternalCall();
183 return -1;
184 }
185 return _PyUnicode_Resize(v, length);
186}
187
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188/* We allocate one more byte to make sure the string is
189 Ux0000 terminated -- XXX is this needed ?
190
191 XXX This allocator could further be enhanced by assuring that the
192 free list never reduces its size below 1.
193
194*/
195
196static
197PyUnicodeObject *_PyUnicode_New(int length)
198{
199 register PyUnicodeObject *unicode;
200
201 /* Optimization for empty strings */
202 if (length == 0 && unicode_empty != NULL) {
203 Py_INCREF(unicode_empty);
204 return unicode_empty;
205 }
206
207 /* Unicode freelist & memory allocation */
208 if (unicode_freelist) {
209 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000210 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 /* Keep-Alive optimization: we only upsize the buffer,
214 never downsize it. */
215 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000217 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 }
220 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000221 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000223 }
224 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 else {
227 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
228 if (unicode == NULL)
229 return NULL;
230 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
231 }
232
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000233 if (!unicode->str) {
234 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000235 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str[length] = 0;
238 unicode->length = length;
239 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000240 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242
243 onError:
244 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247}
248
249static
250void _PyUnicode_Free(register PyUnicodeObject *unicode)
251{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization */
254 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000255 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 unicode->str = NULL;
257 unicode->length = 0;
258 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000259 if (unicode->defenc) {
260 Py_DECREF(unicode->defenc);
261 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 }
263 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 *(PyUnicodeObject **)unicode = unicode_freelist;
265 unicode_freelist = unicode;
266 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000269 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000270 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000271 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273}
274
275PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
276 int size)
277{
278 PyUnicodeObject *unicode;
279
280 unicode = _PyUnicode_New(size);
281 if (!unicode)
282 return NULL;
283
284 /* Copy the Unicode data into the new object */
285 if (u != NULL)
286 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
287
288 return (PyObject *)unicode;
289}
290
291#ifdef HAVE_WCHAR_H
292
293PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
298 if (w == NULL) {
299 PyErr_BadInternalCall();
300 return NULL;
301 }
302
303 unicode = _PyUnicode_New(size);
304 if (!unicode)
305 return NULL;
306
307 /* Copy the wchar_t data into the new object */
308#ifdef HAVE_USABLE_WCHAR_T
309 memcpy(unicode->str, w, size * sizeof(wchar_t));
310#else
311 {
312 register Py_UNICODE *u;
313 register int i;
314 u = PyUnicode_AS_UNICODE(unicode);
315 for (i = size; i >= 0; i--)
316 *u++ = *w++;
317 }
318#endif
319
320 return (PyObject *)unicode;
321}
322
323int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
324 register wchar_t *w,
325 int size)
326{
327 if (unicode == NULL) {
328 PyErr_BadInternalCall();
329 return -1;
330 }
331 if (size > PyUnicode_GET_SIZE(unicode))
332 size = PyUnicode_GET_SIZE(unicode);
333#ifdef HAVE_USABLE_WCHAR_T
334 memcpy(w, unicode->str, size * sizeof(wchar_t));
335#else
336 {
337 register Py_UNICODE *u;
338 register int i;
339 u = PyUnicode_AS_UNICODE(unicode);
340 for (i = size; i >= 0; i--)
341 *w++ = *u++;
342 }
343#endif
344
345 return size;
346}
347
348#endif
349
350PyObject *PyUnicode_FromObject(register PyObject *obj)
351{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000352 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
353}
354
355PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
356 const char *encoding,
357 const char *errors)
358{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 const char *s;
360 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000361 int owned = 0;
362 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363
364 if (obj == NULL) {
365 PyErr_BadInternalCall();
366 return NULL;
367 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368
369 /* Coerce object */
370 if (PyInstance_Check(obj)) {
371 PyObject *func;
372 func = PyObject_GetAttrString(obj, "__str__");
373 if (func == NULL) {
374 PyErr_SetString(PyExc_TypeError,
375 "coercing to Unicode: instance doesn't define __str__");
376 return NULL;
377 }
378 obj = PyEval_CallObject(func, NULL);
379 Py_DECREF(func);
380 if (obj == NULL)
381 return NULL;
382 owned = 1;
383 }
384 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = obj;
387 if (encoding) {
388 PyErr_SetString(PyExc_TypeError,
389 "decoding Unicode is not supported");
390 return NULL;
391 }
392 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 }
394 else if (PyString_Check(obj)) {
395 s = PyString_AS_STRING(obj);
396 len = PyString_GET_SIZE(obj);
397 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000398 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
399 /* Overwrite the error message with something more useful in
400 case of a TypeError. */
401 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000402 PyErr_Format(PyExc_TypeError,
403 "coercing to Unicode: need string or buffer, "
404 "%.80s found",
405 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000406 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000407 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000408
409 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 if (len == 0) {
411 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414 else
415 v = PyUnicode_Decode(s, len, encoding, errors);
416 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000417 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 return v;
421
422 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427}
428
429PyObject *PyUnicode_Decode(const char *s,
430 int size,
431 const char *encoding,
432 const char *errors)
433{
434 PyObject *buffer = NULL, *unicode;
435
Fred Drakee4315f52000-05-09 19:53:39 +0000436 if (encoding == NULL)
437 encoding = PyUnicode_GetDefaultEncoding();
438
439 /* Shortcuts for common default encodings */
440 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000442 else if (strcmp(encoding, "latin-1") == 0)
443 return PyUnicode_DecodeLatin1(s, size, errors);
444 else if (strcmp(encoding, "ascii") == 0)
445 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446
447 /* Decode via the codec registry */
448 buffer = PyBuffer_FromMemory((void *)s, size);
449 if (buffer == NULL)
450 goto onError;
451 unicode = PyCodec_Decode(buffer, encoding, errors);
452 if (unicode == NULL)
453 goto onError;
454 if (!PyUnicode_Check(unicode)) {
455 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000456 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457 unicode->ob_type->tp_name);
458 Py_DECREF(unicode);
459 goto onError;
460 }
461 Py_DECREF(buffer);
462 return unicode;
463
464 onError:
465 Py_XDECREF(buffer);
466 return NULL;
467}
468
469PyObject *PyUnicode_Encode(const Py_UNICODE *s,
470 int size,
471 const char *encoding,
472 const char *errors)
473{
474 PyObject *v, *unicode;
475
476 unicode = PyUnicode_FromUnicode(s, size);
477 if (unicode == NULL)
478 return NULL;
479 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
480 Py_DECREF(unicode);
481 return v;
482}
483
484PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
485 const char *encoding,
486 const char *errors)
487{
488 PyObject *v;
489
490 if (!PyUnicode_Check(unicode)) {
491 PyErr_BadArgument();
492 goto onError;
493 }
Fred Drakee4315f52000-05-09 19:53:39 +0000494
495 if (encoding == NULL)
496 encoding = PyUnicode_GetDefaultEncoding();
497
498 /* Shortcuts for common default encodings */
499 if (errors == NULL) {
500 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000502 else if (strcmp(encoding, "latin-1") == 0)
503 return PyUnicode_AsLatin1String(unicode);
504 else if (strcmp(encoding, "ascii") == 0)
505 return PyUnicode_AsASCIIString(unicode);
506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507
508 /* Encode via the codec registry */
509 v = PyCodec_Encode(unicode, encoding, errors);
510 if (v == NULL)
511 goto onError;
512 /* XXX Should we really enforce this ? */
513 if (!PyString_Check(v)) {
514 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000515 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516 v->ob_type->tp_name);
517 Py_DECREF(v);
518 goto onError;
519 }
520 return v;
521
522 onError:
523 return NULL;
524}
525
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000526/* Return a Python string holding the default encoded value of the
527 Unicode object.
528
529 The resulting string is cached in the Unicode object for subsequent
530 usage by this function. The cached version is needed to implement
531 the character buffer interface and will live (at least) as long as
532 the Unicode object itself.
533
534 The refcount of the string is *not* incremented.
535
536 *** Exported for internal use by the interpreter only !!! ***
537
538*/
539
540PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
541 const char *errors)
542{
543 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
544
545 if (v)
546 return v;
547 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
548 if (v && errors == NULL)
549 ((PyUnicodeObject *)unicode)->defenc = v;
550 return v;
551}
552
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
554{
555 if (!PyUnicode_Check(unicode)) {
556 PyErr_BadArgument();
557 goto onError;
558 }
559 return PyUnicode_AS_UNICODE(unicode);
560
561 onError:
562 return NULL;
563}
564
565int PyUnicode_GetSize(PyObject *unicode)
566{
567 if (!PyUnicode_Check(unicode)) {
568 PyErr_BadArgument();
569 goto onError;
570 }
571 return PyUnicode_GET_SIZE(unicode);
572
573 onError:
574 return -1;
575}
576
Thomas Wouters78890102000-07-22 19:25:51 +0000577const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000578{
579 return unicode_default_encoding;
580}
581
582int PyUnicode_SetDefaultEncoding(const char *encoding)
583{
584 PyObject *v;
585
586 /* Make sure the encoding is valid. As side effect, this also
587 loads the encoding into the codec registry cache. */
588 v = _PyCodec_Lookup(encoding);
589 if (v == NULL)
590 goto onError;
591 Py_DECREF(v);
592 strncpy(unicode_default_encoding,
593 encoding,
594 sizeof(unicode_default_encoding));
595 return 0;
596
597 onError:
598 return -1;
599}
600
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601/* --- UTF-8 Codec -------------------------------------------------------- */
602
/* Length of a UTF-8 sequence, indexed by its first byte. A length
   of zero marks a byte that is not a legal prefix. See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
624
625static
626int utf8_decoding_error(const char **source,
627 Py_UNICODE **dest,
628 const char *errors,
629 const char *details)
630{
631 if ((errors == NULL) ||
632 (strcmp(errors,"strict") == 0)) {
633 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000634 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 details);
636 return -1;
637 }
638 else if (strcmp(errors,"ignore") == 0) {
639 (*source)++;
640 return 0;
641 }
642 else if (strcmp(errors,"replace") == 0) {
643 (*source)++;
644 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
645 (*dest)++;
646 return 0;
647 }
648 else {
649 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 errors);
652 return -1;
653 }
654}
655
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656PyObject *PyUnicode_DecodeUTF8(const char *s,
657 int size,
658 const char *errors)
659{
660 int n;
661 const char *e;
662 PyUnicodeObject *unicode;
663 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665
666 /* Note: size will always be longer than the resulting Unicode
667 character count */
668 unicode = _PyUnicode_New(size);
669 if (!unicode)
670 return NULL;
671 if (size == 0)
672 return (PyObject *)unicode;
673
674 /* Unpack UTF-8 encoded data */
675 p = unicode->str;
676 e = s + size;
677
678 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000679 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000680
681 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000682 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683 s++;
684 continue;
685 }
686
687 n = utf8_code_length[ch];
688
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000689 if (s + n > e) {
690 errmsg = "unexpected end of data";
691 goto utf8Error;
692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693
694 switch (n) {
695
696 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 errmsg = "unexpected code byte";
698 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699 break;
700
701 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 errmsg = "internal error";
703 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000704 break;
705
706 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000707 if ((s[1] & 0xc0) != 0x80) {
708 errmsg = "invalid data";
709 goto utf8Error;
710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000712 if (ch < 0x80) {
713 errmsg = "illegal encoding";
714 goto utf8Error;
715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000716 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000717 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 break;
719
720 case 3:
721 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000722 (s[2] & 0xc0) != 0x80) {
723 errmsg = "invalid data";
724 goto utf8Error;
725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000727 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
728 errmsg = "illegal encoding";
729 goto utf8Error;
730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000732 *p++ = (Py_UNICODE)ch;
733 break;
734
735 case 4:
736 if ((s[1] & 0xc0) != 0x80 ||
737 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000738 (s[3] & 0xc0) != 0x80) {
739 errmsg = "invalid data";
740 goto utf8Error;
741 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000742 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
743 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
744 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000745 if ((ch < 0x10000) || /* minimum value allowed for 4
746 byte encoding */
747 (ch > 0x10ffff)) { /* maximum value allowed for
748 UTF-16 */
749 errmsg = "illegal encoding";
750 goto utf8Error;
751 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000752 /* compute and append the two surrogates: */
753
754 /* translate from 10000..10FFFF to 0..FFFF */
755 ch -= 0x10000;
756
757 /* high surrogate = top 10 bits added to D800 */
758 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
759
760 /* low surrogate = bottom 10 bits added to DC00 */
761 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 break;
763
764 default:
765 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000766 errmsg = "unsupported Unicode code range";
767 goto utf8Error;
768 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000769 }
770 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000771 continue;
772
773 utf8Error:
774 if (utf8_decoding_error(&s, &p, errors, errmsg))
775 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 }
777
778 /* Adjust length */
779 if (_PyUnicode_Resize(unicode, p - unicode->str))
780 goto onError;
781
782 return (PyObject *)unicode;
783
784onError:
785 Py_DECREF(unicode);
786 return NULL;
787}
788
/* No longer used now that the encoder handles UTF-16 surrogates;
   kept disabled for reference. */
#if 0
/* Handle an error found by the UTF-8 encoder according to errors:
   NULL or "strict" raises UnicodeError, "ignore" does nothing,
   "replace" appends '?' to the output, anything else raises
   ValueError. Returns 0 to continue and -1 if an exception was
   set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822
823PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
824 int size,
825 const char *errors)
826{
827 PyObject *v;
828 char *p;
829 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000830 Py_UCS4 ch2;
831 unsigned int cbAllocated = 3 * size;
832 unsigned int cbWritten = 0;
833 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000835 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 if (v == NULL)
837 return NULL;
838 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000839 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
841 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000842 while (i < size) {
843 Py_UCS4 ch = s[i++];
844 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000846 cbWritten++;
847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 else if (ch < 0x0800) {
849 *p++ = 0xc0 | (ch >> 6);
850 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000851 cbWritten += 2;
852 }
853 else {
854 /* Check for high surrogate */
855 if (0xD800 <= ch && ch <= 0xDBFF) {
856 if (i != size) {
857 ch2 = s[i];
858 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
859
860 if (cbWritten >= (cbAllocated - 4)) {
861 /* Provide enough room for some more
862 surrogates */
863 cbAllocated += 4*10;
864 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000865 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000866 }
867
868 /* combine the two values */
869 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
870
871 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000872 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000873 i++;
874 cbWritten += 4;
875 }
876 }
877 }
878 else {
879 *p++ = (char)(0xe0 | (ch >> 12));
880 cbWritten += 3;
881 }
882 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
883 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 }
885 }
886 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000887 if (_PyString_Resize(&v, p - q))
888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 return v;
890
891 onError:
892 Py_DECREF(v);
893 return NULL;
894}
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
897{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 if (!PyUnicode_Check(unicode)) {
899 PyErr_BadArgument();
900 return NULL;
901 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000902 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
903 PyUnicode_GET_SIZE(unicode),
904 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000905}
906
907/* --- UTF-16 Codec ------------------------------------------------------- */
908
909static
910int utf16_decoding_error(const Py_UNICODE **source,
911 Py_UNICODE **dest,
912 const char *errors,
913 const char *details)
914{
915 if ((errors == NULL) ||
916 (strcmp(errors,"strict") == 0)) {
917 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000918 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000919 details);
920 return -1;
921 }
922 else if (strcmp(errors,"ignore") == 0) {
923 return 0;
924 }
925 else if (strcmp(errors,"replace") == 0) {
926 if (dest) {
927 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
928 (*dest)++;
929 }
930 return 0;
931 }
932 else {
933 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000934 "UTF-16 decoding error; "
935 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 errors);
937 return -1;
938 }
939}
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941PyObject *PyUnicode_DecodeUTF16(const char *s,
942 int size,
943 const char *errors,
944 int *byteorder)
945{
946 PyUnicodeObject *unicode;
947 Py_UNICODE *p;
948 const Py_UNICODE *q, *e;
949 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000950 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951
952 /* size should be an even number */
953 if (size % sizeof(Py_UNICODE) != 0) {
954 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
955 return NULL;
956 /* The remaining input chars are ignored if we fall through
957 here... */
958 }
959
960 /* Note: size will always be longer than the resulting Unicode
961 character count */
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
967
968 /* Unpack UTF-16 encoded data */
969 p = unicode->str;
970 q = (Py_UNICODE *)s;
971 e = q + (size / sizeof(Py_UNICODE));
972
973 if (byteorder)
974 bo = *byteorder;
975
976 while (q < e) {
977 register Py_UNICODE ch = *q++;
978
979 /* Check for BOM marks (U+FEFF) in the input and adjust
980 current byte order setting accordingly. Swap input
981 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
982 !) */
983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
984 if (ch == 0xFEFF) {
985 bo = -1;
986 continue;
987 } else if (ch == 0xFFFE) {
988 bo = 1;
989 continue;
990 }
991 if (bo == 1)
992 ch = (ch >> 8) | (ch << 8);
993#else
994 if (ch == 0xFEFF) {
995 bo = 1;
996 continue;
997 } else if (ch == 0xFFFE) {
998 bo = -1;
999 continue;
1000 }
1001 if (bo == -1)
1002 ch = (ch >> 8) | (ch << 8);
1003#endif
1004 if (ch < 0xD800 || ch > 0xDFFF) {
1005 *p++ = ch;
1006 continue;
1007 }
1008
1009 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001010 if (q >= e) {
1011 errmsg = "unexpected end of data";
1012 goto utf16Error;
1013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 if (0xDC00 <= *q && *q <= 0xDFFF) {
1015 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001016 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017 /* This is valid data (a UTF-16 surrogate pair), but
1018 we are not able to store this information since our
1019 Py_UNICODE type only has 16 bits... this might
1020 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001021 errmsg = "code pairs are not supported";
1022 goto utf16Error;
1023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 else
1025 continue;
1026 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001027 errmsg = "illegal encoding";
1028 /* Fall through to report the error */
1029
1030 utf16Error:
1031 if (utf16_decoding_error(&q, &p, errors, errmsg))
1032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 }
1034
1035 if (byteorder)
1036 *byteorder = bo;
1037
1038 /* Adjust length */
1039 if (_PyUnicode_Resize(unicode, p - unicode->str))
1040 goto onError;
1041
1042 return (PyObject *)unicode;
1043
1044onError:
1045 Py_DECREF(unicode);
1046 return NULL;
1047}
1048
1049#undef UTF16_ERROR
1050
1051PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1052 int size,
1053 const char *errors,
1054 int byteorder)
1055{
1056 PyObject *v;
1057 Py_UNICODE *p;
1058 char *q;
1059
1060 /* We don't create UTF-16 pairs... */
1061 v = PyString_FromStringAndSize(NULL,
1062 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1063 if (v == NULL)
1064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065
1066 q = PyString_AS_STRING(v);
1067 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 if (byteorder == 0)
1069 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001070 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001071 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (byteorder == 0 ||
1073#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1074 byteorder == -1
1075#else
1076 byteorder == 1
1077#endif
1078 )
1079 memcpy(p, s, size * sizeof(Py_UNICODE));
1080 else
1081 while (size-- > 0) {
1082 Py_UNICODE ch = *s++;
1083 *p++ = (ch >> 8) | (ch << 8);
1084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 return v;
1086}
1087
1088PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1089{
1090 if (!PyUnicode_Check(unicode)) {
1091 PyErr_BadArgument();
1092 return NULL;
1093 }
1094 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1095 PyUnicode_GET_SIZE(unicode),
1096 NULL,
1097 0);
1098}
1099
1100/* --- Unicode Escape Codec ----------------------------------------------- */
1101
1102static
1103int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001104 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 const char *errors,
1106 const char *details)
1107{
1108 if ((errors == NULL) ||
1109 (strcmp(errors,"strict") == 0)) {
1110 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001111 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 details);
1113 return -1;
1114 }
1115 else if (strcmp(errors,"ignore") == 0) {
1116 return 0;
1117 }
1118 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001119 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 return 0;
1121 }
1122 else {
1123 PyErr_Format(PyExc_ValueError,
1124 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001125 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126 errors);
1127 return -1;
1128 }
1129}
1130
/* Cached pointer to the character-name hash API.  It starts out NULL
   and is filled in by PyUnicode_DecodeUnicodeEscape() from the
   "ucnhashAPI" attribute of the "ucnhash" module the first time a \N
   escape is decoded. */
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001131static _Py_UCNHashAPI *pucnHash = NULL;
1132
/* Case-insensitive comparison of at most `count` characters of s1 and s2.

   Characters are folded with tolower() and compared pairwise.  The scan
   stops at the first mismatch, at a NUL terminator common to both
   strings, or once `count` characters have been examined.

   Returns 0 when the compared prefixes are equal (always 0 for
   count == 0), otherwise the difference between the first pair of
   folded characters that differ. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    if (count == 0)
        return 0;

    do {
        /* The <ctype.h> functions take an unsigned char value (or EOF);
           a negative plain char would be undefined behaviour. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        /* Stop at a shared terminator so we never read beyond the end
           of either string. */
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
1152
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1154 int size,
1155 const char *errors)
1156{
1157 PyUnicodeObject *v;
1158 Py_UNICODE *p = NULL, *buf = NULL;
1159 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001160 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161
1162 /* Escaped strings will always be longer than the resulting
1163 Unicode string, so we start with size here and then reduce the
1164 length after conversion to the true value. */
1165 v = _PyUnicode_New(size);
1166 if (v == NULL)
1167 goto onError;
1168 if (size == 0)
1169 return (PyObject *)v;
1170 p = buf = PyUnicode_AS_UNICODE(v);
1171 end = s + size;
1172 while (s < end) {
1173 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001174 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 int i;
1176
1177 /* Non-escape characters are interpreted as Unicode ordinals */
1178 if (*s != '\\') {
1179 *p++ = (unsigned char)*s++;
1180 continue;
1181 }
1182
1183 /* \ - Escapes */
1184 s++;
1185 switch (*s++) {
1186
1187 /* \x escapes */
1188 case '\n': break;
1189 case '\\': *p++ = '\\'; break;
1190 case '\'': *p++ = '\''; break;
1191 case '\"': *p++ = '\"'; break;
1192 case 'b': *p++ = '\b'; break;
1193 case 'f': *p++ = '\014'; break; /* FF */
1194 case 't': *p++ = '\t'; break;
1195 case 'n': *p++ = '\n'; break;
1196 case 'r': *p++ = '\r'; break;
1197 case 'v': *p++ = '\013'; break; /* VT */
1198 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1199
1200 /* \OOO (octal) escapes */
1201 case '0': case '1': case '2': case '3':
1202 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001203 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001205 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001207 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001209 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 break;
1211
Fredrik Lundhdf846752000-09-03 11:29:49 +00001212 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001214 for (x = 0, i = 0; i < 2; i++) {
1215 c = (unsigned char)s[i];
1216 if (!isxdigit(c)) {
1217 if (unicodeescape_decoding_error(&s, &x, errors,
1218 "truncated \\xXX"))
1219 goto onError;
1220 i++;
1221 break;
1222 }
1223 x = (x<<4) & ~0xF;
1224 if (c >= '0' && c <= '9')
1225 x += c - '0';
1226 else if (c >= 'a' && c <= 'f')
1227 x += 10 + c - 'a';
1228 else
1229 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001231 s += i;
1232 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 break;
1234
1235 /* \uXXXX with 4 hex digits */
1236 case 'u':
1237 for (x = 0, i = 0; i < 4; i++) {
1238 c = (unsigned char)s[i];
1239 if (!isxdigit(c)) {
1240 if (unicodeescape_decoding_error(&s, &x, errors,
1241 "truncated \\uXXXX"))
1242 goto onError;
1243 i++;
1244 break;
1245 }
1246 x = (x<<4) & ~0xF;
1247 if (c >= '0' && c <= '9')
1248 x += c - '0';
1249 else if (c >= 'a' && c <= 'f')
1250 x += 10 + c - 'a';
1251 else
1252 x += 10 + c - 'A';
1253 }
1254 s += i;
1255 *p++ = x;
1256 break;
1257
Fredrik Lundhdf846752000-09-03 11:29:49 +00001258 /* \UXXXXXXXX with 8 hex digits */
1259 case 'U':
1260 for (chr = 0, i = 0; i < 8; i++) {
1261 c = (unsigned char)s[i];
1262 if (!isxdigit(c)) {
1263 if (unicodeescape_decoding_error(&s, &x, errors,
1264 "truncated \\uXXXX"))
1265 goto onError;
1266 i++;
1267 break;
1268 }
1269 chr = (chr<<4) & ~0xF;
1270 if (c >= '0' && c <= '9')
1271 chr += c - '0';
1272 else if (c >= 'a' && c <= 'f')
1273 chr += 10 + c - 'a';
1274 else
1275 chr += 10 + c - 'A';
1276 }
1277 s += i;
1278 goto store;
1279
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001280 case 'N':
1281 /* Ok, we need to deal with Unicode Character Names now,
1282 * make sure we've imported the hash table data...
1283 */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001284 if (pucnHash == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001285 PyObject *mod = 0, *v = 0;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001286 mod = PyImport_ImportModule("ucnhash");
1287 if (mod == NULL)
1288 goto onError;
1289 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1290 Py_DECREF(mod);
1291 if (v == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001292 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001293 pucnHash = PyCObject_AsVoidPtr(v);
1294 Py_DECREF(v);
1295 if (pucnHash == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001296 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001297 }
1298
Fredrik Lundhdf846752000-09-03 11:29:49 +00001299 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001300 const char *start = s + 1;
1301 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001302 unsigned long j;
1303
1304 /* look for either the closing brace, or we
1305 * exceed the maximum length of the unicode character names
1306 */
1307 while (*endBrace != '}' &&
1308 (unsigned int)(endBrace - start) <=
1309 pucnHash->cchMax &&
1310 endBrace < end)
1311 {
1312 endBrace++;
1313 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001314 if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001315 j = pucnHash->hash(start, endBrace - start);
1316 if (j > pucnHash->cKeys ||
1317 mystrnicmp(
1318 start,
1319 ((_Py_UnicodeCharacterName *)
1320 (pucnHash->getValue(j)))->pszUCN,
1321 (int)(endBrace - start)) != 0)
1322 {
1323 if (unicodeescape_decoding_error(
1324 &s, &x, errors,
1325 "Invalid Unicode Character Name"))
1326 {
1327 goto onError;
1328 }
1329 goto ucnFallthrough;
1330 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001331 chr = ((_Py_UnicodeCharacterName *)
1332 (pucnHash->getValue(j)))->value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001333 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001334 goto store;
1335 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001336 if (unicodeescape_decoding_error(
1337 &s, &x, errors,
1338 "Unicode name missing closing brace"))
1339 goto onError;
1340 goto ucnFallthrough;
1341 }
1342 break;
1343 }
1344 if (unicodeescape_decoding_error(
1345 &s, &x, errors,
1346 "Missing opening brace for Unicode Character Name escape"))
1347 goto onError;
1348ucnFallthrough:
1349 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001350 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 *p++ = '\\';
1352 *p++ = (unsigned char)s[-1];
1353 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001354store:
1355 /* when we get here, chr is a 32-bit unicode character */
1356 if (chr <= 0xffff)
1357 /* UCS-2 character */
1358 *p++ = (Py_UNICODE) chr;
1359 else if (chr <= 0x10ffff) {
1360 /* UCS-4 character. store as two surrogate characters */
1361 chr -= 0x10000L;
1362 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1363 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1364 } else {
1365 if (unicodeescape_decoding_error(
1366 &s, &x, errors,
1367 "Illegal Unicode character")
1368 )
1369 goto onError;
1370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 }
1372 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001373 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return (PyObject *)v;
1376
1377 onError:
1378 Py_XDECREF(v);
1379 return NULL;
1380}
1381
1382/* Return a Unicode-Escape string version of the Unicode object.
1383
1384 If quotes is true, the string is enclosed in u"" or u'' quotes as
1385 appropriate.
1386
1387*/
1388
Barry Warsaw51ac5802000-03-20 16:36:48 +00001389static const Py_UNICODE *findchar(const Py_UNICODE *s,
1390 int size,
1391 Py_UNICODE ch);
1392
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393static
1394PyObject *unicodeescape_string(const Py_UNICODE *s,
1395 int size,
1396 int quotes)
1397{
1398 PyObject *repr;
1399 char *p;
1400 char *q;
1401
1402 static const char *hexdigit = "0123456789ABCDEF";
1403
1404 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1405 if (repr == NULL)
1406 return NULL;
1407
1408 p = q = PyString_AS_STRING(repr);
1409
1410 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411 *p++ = 'u';
1412 *p++ = (findchar(s, size, '\'') &&
1413 !findchar(s, size, '"')) ? '"' : '\'';
1414 }
1415 while (size-- > 0) {
1416 Py_UNICODE ch = *s++;
1417 /* Escape quotes */
1418 if (quotes && (ch == q[1] || ch == '\\')) {
1419 *p++ = '\\';
1420 *p++ = (char) ch;
1421 }
1422 /* Map 16-bit characters to '\uxxxx' */
1423 else if (ch >= 256) {
1424 *p++ = '\\';
1425 *p++ = 'u';
1426 *p++ = hexdigit[(ch >> 12) & 0xf];
1427 *p++ = hexdigit[(ch >> 8) & 0xf];
1428 *p++ = hexdigit[(ch >> 4) & 0xf];
1429 *p++ = hexdigit[ch & 15];
1430 }
1431 /* Map non-printable US ASCII to '\ooo' */
1432 else if (ch < ' ' || ch >= 128) {
1433 *p++ = '\\';
1434 *p++ = hexdigit[(ch >> 6) & 7];
1435 *p++ = hexdigit[(ch >> 3) & 7];
1436 *p++ = hexdigit[ch & 7];
1437 }
1438 /* Copy everything else as-is */
1439 else
1440 *p++ = (char) ch;
1441 }
1442 if (quotes)
1443 *p++ = q[1];
1444
1445 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001446 if (_PyString_Resize(&repr, p - q))
1447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
1449 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001450
1451 onError:
1452 Py_DECREF(repr);
1453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454}
1455
1456PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1457 int size)
1458{
1459 return unicodeescape_string(s, size, 0);
1460}
1461
1462PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1463{
1464 if (!PyUnicode_Check(unicode)) {
1465 PyErr_BadArgument();
1466 return NULL;
1467 }
1468 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1469 PyUnicode_GET_SIZE(unicode));
1470}
1471
1472/* --- Raw Unicode Escape Codec ------------------------------------------- */
1473
1474PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1475 int size,
1476 const char *errors)
1477{
1478 PyUnicodeObject *v;
1479 Py_UNICODE *p, *buf;
1480 const char *end;
1481 const char *bs;
1482
1483 /* Escaped strings will always be longer than the resulting
1484 Unicode string, so we start with size here and then reduce the
1485 length after conversion to the true value. */
1486 v = _PyUnicode_New(size);
1487 if (v == NULL)
1488 goto onError;
1489 if (size == 0)
1490 return (PyObject *)v;
1491 p = buf = PyUnicode_AS_UNICODE(v);
1492 end = s + size;
1493 while (s < end) {
1494 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001495 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 int i;
1497
1498 /* Non-escape characters are interpreted as Unicode ordinals */
1499 if (*s != '\\') {
1500 *p++ = (unsigned char)*s++;
1501 continue;
1502 }
1503
1504 /* \u-escapes are only interpreted iff the number of leading
1505 backslashes if odd */
1506 bs = s;
1507 for (;s < end;) {
1508 if (*s != '\\')
1509 break;
1510 *p++ = (unsigned char)*s++;
1511 }
1512 if (((s - bs) & 1) == 0 ||
1513 s >= end ||
1514 *s != 'u') {
1515 continue;
1516 }
1517 p--;
1518 s++;
1519
1520 /* \uXXXX with 4 hex digits */
1521 for (x = 0, i = 0; i < 4; i++) {
1522 c = (unsigned char)s[i];
1523 if (!isxdigit(c)) {
1524 if (unicodeescape_decoding_error(&s, &x, errors,
1525 "truncated \\uXXXX"))
1526 goto onError;
1527 i++;
1528 break;
1529 }
1530 x = (x<<4) & ~0xF;
1531 if (c >= '0' && c <= '9')
1532 x += c - '0';
1533 else if (c >= 'a' && c <= 'f')
1534 x += 10 + c - 'a';
1535 else
1536 x += 10 + c - 'A';
1537 }
1538 s += i;
1539 *p++ = x;
1540 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001541 if (_PyUnicode_Resize(v, (int)(p - buf)))
1542 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543 return (PyObject *)v;
1544
1545 onError:
1546 Py_XDECREF(v);
1547 return NULL;
1548}
1549
1550PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1551 int size)
1552{
1553 PyObject *repr;
1554 char *p;
1555 char *q;
1556
1557 static const char *hexdigit = "0123456789ABCDEF";
1558
1559 repr = PyString_FromStringAndSize(NULL, 6 * size);
1560 if (repr == NULL)
1561 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001562 if (size == 0)
1563 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564
1565 p = q = PyString_AS_STRING(repr);
1566 while (size-- > 0) {
1567 Py_UNICODE ch = *s++;
1568 /* Map 16-bit characters to '\uxxxx' */
1569 if (ch >= 256) {
1570 *p++ = '\\';
1571 *p++ = 'u';
1572 *p++ = hexdigit[(ch >> 12) & 0xf];
1573 *p++ = hexdigit[(ch >> 8) & 0xf];
1574 *p++ = hexdigit[(ch >> 4) & 0xf];
1575 *p++ = hexdigit[ch & 15];
1576 }
1577 /* Copy everything else as-is */
1578 else
1579 *p++ = (char) ch;
1580 }
1581 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001582 if (_PyString_Resize(&repr, p - q))
1583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584
1585 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001586
1587 onError:
1588 Py_DECREF(repr);
1589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590}
1591
1592PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1593{
1594 if (!PyUnicode_Check(unicode)) {
1595 PyErr_BadArgument();
1596 return NULL;
1597 }
1598 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1599 PyUnicode_GET_SIZE(unicode));
1600}
1601
1602/* --- Latin-1 Codec ------------------------------------------------------ */
1603
1604PyObject *PyUnicode_DecodeLatin1(const char *s,
1605 int size,
1606 const char *errors)
1607{
1608 PyUnicodeObject *v;
1609 Py_UNICODE *p;
1610
1611 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1612 v = _PyUnicode_New(size);
1613 if (v == NULL)
1614 goto onError;
1615 if (size == 0)
1616 return (PyObject *)v;
1617 p = PyUnicode_AS_UNICODE(v);
1618 while (size-- > 0)
1619 *p++ = (unsigned char)*s++;
1620 return (PyObject *)v;
1621
1622 onError:
1623 Py_XDECREF(v);
1624 return NULL;
1625}
1626
1627static
1628int latin1_encoding_error(const Py_UNICODE **source,
1629 char **dest,
1630 const char *errors,
1631 const char *details)
1632{
1633 if ((errors == NULL) ||
1634 (strcmp(errors,"strict") == 0)) {
1635 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001636 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637 details);
1638 return -1;
1639 }
1640 else if (strcmp(errors,"ignore") == 0) {
1641 return 0;
1642 }
1643 else if (strcmp(errors,"replace") == 0) {
1644 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001645 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 return 0;
1647 }
1648 else {
1649 PyErr_Format(PyExc_ValueError,
1650 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001651 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 errors);
1653 return -1;
1654 }
1655}
1656
1657PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1658 int size,
1659 const char *errors)
1660{
1661 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001662 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001663
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664 repr = PyString_FromStringAndSize(NULL, size);
1665 if (repr == NULL)
1666 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001667 if (size == 0)
1668 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669
1670 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001671 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672 while (size-- > 0) {
1673 Py_UNICODE ch = *p++;
1674 if (ch >= 256) {
1675 if (latin1_encoding_error(&p, &s, errors,
1676 "ordinal not in range(256)"))
1677 goto onError;
1678 }
1679 else
1680 *s++ = (char)ch;
1681 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001682 /* Resize if error handling skipped some characters */
1683 if (s - start < PyString_GET_SIZE(repr))
1684 if (_PyString_Resize(&repr, s - start))
1685 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return repr;
1687
1688 onError:
1689 Py_DECREF(repr);
1690 return NULL;
1691}
1692
1693PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1694{
1695 if (!PyUnicode_Check(unicode)) {
1696 PyErr_BadArgument();
1697 return NULL;
1698 }
1699 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1700 PyUnicode_GET_SIZE(unicode),
1701 NULL);
1702}
1703
1704/* --- 7-bit ASCII Codec -------------------------------------------------- */
1705
1706static
1707int ascii_decoding_error(const char **source,
1708 Py_UNICODE **dest,
1709 const char *errors,
1710 const char *details)
1711{
1712 if ((errors == NULL) ||
1713 (strcmp(errors,"strict") == 0)) {
1714 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001715 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 details);
1717 return -1;
1718 }
1719 else if (strcmp(errors,"ignore") == 0) {
1720 return 0;
1721 }
1722 else if (strcmp(errors,"replace") == 0) {
1723 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1724 (*dest)++;
1725 return 0;
1726 }
1727 else {
1728 PyErr_Format(PyExc_ValueError,
1729 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001730 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 errors);
1732 return -1;
1733 }
1734}
1735
1736PyObject *PyUnicode_DecodeASCII(const char *s,
1737 int size,
1738 const char *errors)
1739{
1740 PyUnicodeObject *v;
1741 Py_UNICODE *p;
1742
1743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744 v = _PyUnicode_New(size);
1745 if (v == NULL)
1746 goto onError;
1747 if (size == 0)
1748 return (PyObject *)v;
1749 p = PyUnicode_AS_UNICODE(v);
1750 while (size-- > 0) {
1751 register unsigned char c;
1752
1753 c = (unsigned char)*s++;
1754 if (c < 128)
1755 *p++ = c;
1756 else if (ascii_decoding_error(&s, &p, errors,
1757 "ordinal not in range(128)"))
1758 goto onError;
1759 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001760 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1761 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 return (PyObject *)v;
1764
1765 onError:
1766 Py_XDECREF(v);
1767 return NULL;
1768}
1769
1770static
1771int ascii_encoding_error(const Py_UNICODE **source,
1772 char **dest,
1773 const char *errors,
1774 const char *details)
1775{
1776 if ((errors == NULL) ||
1777 (strcmp(errors,"strict") == 0)) {
1778 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001779 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 details);
1781 return -1;
1782 }
1783 else if (strcmp(errors,"ignore") == 0) {
1784 return 0;
1785 }
1786 else if (strcmp(errors,"replace") == 0) {
1787 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001788 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 return 0;
1790 }
1791 else {
1792 PyErr_Format(PyExc_ValueError,
1793 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001794 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 errors);
1796 return -1;
1797 }
1798}
1799
1800PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1801 int size,
1802 const char *errors)
1803{
1804 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001805 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 repr = PyString_FromStringAndSize(NULL, size);
1808 if (repr == NULL)
1809 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001810 if (size == 0)
1811 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812
1813 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001814 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 while (size-- > 0) {
1816 Py_UNICODE ch = *p++;
1817 if (ch >= 128) {
1818 if (ascii_encoding_error(&p, &s, errors,
1819 "ordinal not in range(128)"))
1820 goto onError;
1821 }
1822 else
1823 *s++ = (char)ch;
1824 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001825 /* Resize if error handling skipped some characters */
1826 if (s - start < PyString_GET_SIZE(repr))
1827 if (_PyString_Resize(&repr, s - start))
1828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 return repr;
1830
1831 onError:
1832 Py_DECREF(repr);
1833 return NULL;
1834}
1835
1836PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1837{
1838 if (!PyUnicode_Check(unicode)) {
1839 PyErr_BadArgument();
1840 return NULL;
1841 }
1842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1843 PyUnicode_GET_SIZE(unicode),
1844 NULL);
1845}
1846
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001847#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001848
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001849/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001851PyObject *PyUnicode_DecodeMBCS(const char *s,
1852 int size,
1853 const char *errors)
1854{
1855 PyUnicodeObject *v;
1856 Py_UNICODE *p;
1857
1858 /* First get the size of the result */
1859 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001860 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1862
1863 v = _PyUnicode_New(usize);
1864 if (v == NULL)
1865 return NULL;
1866 if (usize == 0)
1867 return (PyObject *)v;
1868 p = PyUnicode_AS_UNICODE(v);
1869 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1870 Py_DECREF(v);
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872 }
1873
1874 return (PyObject *)v;
1875}
1876
1877PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1878 int size,
1879 const char *errors)
1880{
1881 PyObject *repr;
1882 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001883 DWORD mbcssize;
1884
1885 /* If there are no characters, bail now! */
1886 if (size==0)
1887 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888
1889 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001890 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001891 if (mbcssize==0)
1892 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1893
1894 repr = PyString_FromStringAndSize(NULL, mbcssize);
1895 if (repr == NULL)
1896 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001897 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001898 return repr;
1899
1900 /* Do the conversion */
1901 s = PyString_AS_STRING(repr);
1902 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1903 Py_DECREF(repr);
1904 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1905 }
1906 return repr;
1907}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001908
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001909#endif /* MS_WIN32 */
1910
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911/* --- Character Mapping Codec -------------------------------------------- */
1912
1913static
1914int charmap_decoding_error(const char **source,
1915 Py_UNICODE **dest,
1916 const char *errors,
1917 const char *details)
1918{
1919 if ((errors == NULL) ||
1920 (strcmp(errors,"strict") == 0)) {
1921 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001922 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 details);
1924 return -1;
1925 }
1926 else if (strcmp(errors,"ignore") == 0) {
1927 return 0;
1928 }
1929 else if (strcmp(errors,"replace") == 0) {
1930 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1931 (*dest)++;
1932 return 0;
1933 }
1934 else {
1935 PyErr_Format(PyExc_ValueError,
1936 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001937 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 errors);
1939 return -1;
1940 }
1941}
1942
1943PyObject *PyUnicode_DecodeCharmap(const char *s,
1944 int size,
1945 PyObject *mapping,
1946 const char *errors)
1947{
1948 PyUnicodeObject *v;
1949 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001950 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951
1952 /* Default to Latin-1 */
1953 if (mapping == NULL)
1954 return PyUnicode_DecodeLatin1(s, size, errors);
1955
1956 v = _PyUnicode_New(size);
1957 if (v == NULL)
1958 goto onError;
1959 if (size == 0)
1960 return (PyObject *)v;
1961 p = PyUnicode_AS_UNICODE(v);
1962 while (size-- > 0) {
1963 unsigned char ch = *s++;
1964 PyObject *w, *x;
1965
1966 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1967 w = PyInt_FromLong((long)ch);
1968 if (w == NULL)
1969 goto onError;
1970 x = PyObject_GetItem(mapping, w);
1971 Py_DECREF(w);
1972 if (x == NULL) {
1973 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001974 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001976 x = Py_None;
1977 Py_INCREF(x);
1978 } else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 goto onError;
1980 }
1981
1982 /* Apply mapping */
1983 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001984 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 if (value < 0 || value > 65535) {
1986 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001987 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 Py_DECREF(x);
1989 goto onError;
1990 }
1991 *p++ = (Py_UNICODE)value;
1992 }
1993 else if (x == Py_None) {
1994 /* undefined mapping */
1995 if (charmap_decoding_error(&s, &p, errors,
1996 "character maps to <undefined>")) {
1997 Py_DECREF(x);
1998 goto onError;
1999 }
2000 }
2001 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002002 int targetsize = PyUnicode_GET_SIZE(x);
2003
2004 if (targetsize == 1)
2005 /* 1-1 mapping */
2006 *p++ = *PyUnicode_AS_UNICODE(x);
2007
2008 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002010 if (targetsize > extrachars) {
2011 /* resize first */
2012 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2013 int needed = (targetsize - extrachars) + \
2014 (targetsize << 2);
2015 extrachars += needed;
2016 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 Py_DECREF(x);
2018 goto onError;
2019 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002020 p = PyUnicode_AS_UNICODE(v) + oldpos;
2021 }
2022 Py_UNICODE_COPY(p,
2023 PyUnicode_AS_UNICODE(x),
2024 targetsize);
2025 p += targetsize;
2026 extrachars -= targetsize;
2027 }
2028 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 }
2030 else {
2031 /* wrong return value */
2032 PyErr_SetString(PyExc_TypeError,
2033 "character mapping must return integer, None or unicode");
2034 Py_DECREF(x);
2035 goto onError;
2036 }
2037 Py_DECREF(x);
2038 }
2039 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2040 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2041 goto onError;
2042 return (PyObject *)v;
2043
2044 onError:
2045 Py_XDECREF(v);
2046 return NULL;
2047}
2048
2049static
2050int charmap_encoding_error(const Py_UNICODE **source,
2051 char **dest,
2052 const char *errors,
2053 const char *details)
2054{
2055 if ((errors == NULL) ||
2056 (strcmp(errors,"strict") == 0)) {
2057 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002058 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 details);
2060 return -1;
2061 }
2062 else if (strcmp(errors,"ignore") == 0) {
2063 return 0;
2064 }
2065 else if (strcmp(errors,"replace") == 0) {
2066 **dest = '?';
2067 (*dest)++;
2068 return 0;
2069 }
2070 else {
2071 PyErr_Format(PyExc_ValueError,
2072 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002073 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 errors);
2075 return -1;
2076 }
2077}
2078
2079PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2080 int size,
2081 PyObject *mapping,
2082 const char *errors)
2083{
2084 PyObject *v;
2085 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002086 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087
2088 /* Default to Latin-1 */
2089 if (mapping == NULL)
2090 return PyUnicode_EncodeLatin1(p, size, errors);
2091
2092 v = PyString_FromStringAndSize(NULL, size);
2093 if (v == NULL)
2094 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002095 if (size == 0)
2096 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 s = PyString_AS_STRING(v);
2098 while (size-- > 0) {
2099 Py_UNICODE ch = *p++;
2100 PyObject *w, *x;
2101
2102 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2103 w = PyInt_FromLong((long)ch);
2104 if (w == NULL)
2105 goto onError;
2106 x = PyObject_GetItem(mapping, w);
2107 Py_DECREF(w);
2108 if (x == NULL) {
2109 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002110 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002112 x = Py_None;
2113 Py_INCREF(x);
2114 } else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 goto onError;
2116 }
2117
2118 /* Apply mapping */
2119 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002120 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 if (value < 0 || value > 255) {
2122 PyErr_SetString(PyExc_TypeError,
2123 "character mapping must be in range(256)");
2124 Py_DECREF(x);
2125 goto onError;
2126 }
2127 *s++ = (char)value;
2128 }
2129 else if (x == Py_None) {
2130 /* undefined mapping */
2131 if (charmap_encoding_error(&p, &s, errors,
2132 "character maps to <undefined>")) {
2133 Py_DECREF(x);
2134 goto onError;
2135 }
2136 }
2137 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002138 int targetsize = PyString_GET_SIZE(x);
2139
2140 if (targetsize == 1)
2141 /* 1-1 mapping */
2142 *s++ = *PyString_AS_STRING(x);
2143
2144 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002146 if (targetsize > extrachars) {
2147 /* resize first */
2148 int oldpos = (int)(s - PyString_AS_STRING(v));
2149 int needed = (targetsize - extrachars) + \
2150 (targetsize << 2);
2151 extrachars += needed;
2152 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 Py_DECREF(x);
2154 goto onError;
2155 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002156 s = PyString_AS_STRING(v) + oldpos;
2157 }
2158 memcpy(s,
2159 PyString_AS_STRING(x),
2160 targetsize);
2161 s += targetsize;
2162 extrachars -= targetsize;
2163 }
2164 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 }
2166 else {
2167 /* wrong return value */
2168 PyErr_SetString(PyExc_TypeError,
2169 "character mapping must return integer, None or unicode");
2170 Py_DECREF(x);
2171 goto onError;
2172 }
2173 Py_DECREF(x);
2174 }
2175 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2176 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2177 goto onError;
2178 return v;
2179
2180 onError:
2181 Py_DECREF(v);
2182 return NULL;
2183}
2184
2185PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2186 PyObject *mapping)
2187{
2188 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode),
2194 mapping,
2195 NULL);
2196}
2197
2198static
2199int translate_error(const Py_UNICODE **source,
2200 Py_UNICODE **dest,
2201 const char *errors,
2202 const char *details)
2203{
2204 if ((errors == NULL) ||
2205 (strcmp(errors,"strict") == 0)) {
2206 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002207 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 details);
2209 return -1;
2210 }
2211 else if (strcmp(errors,"ignore") == 0) {
2212 return 0;
2213 }
2214 else if (strcmp(errors,"replace") == 0) {
2215 **dest = '?';
2216 (*dest)++;
2217 return 0;
2218 }
2219 else {
2220 PyErr_Format(PyExc_ValueError,
2221 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002222 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 errors);
2224 return -1;
2225 }
2226}
2227
2228PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2229 int size,
2230 PyObject *mapping,
2231 const char *errors)
2232{
2233 PyUnicodeObject *v;
2234 Py_UNICODE *p;
2235
2236 if (mapping == NULL) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
2240
2241 /* Output will never be longer than input */
2242 v = _PyUnicode_New(size);
2243 if (v == NULL)
2244 goto onError;
2245 if (size == 0)
2246 goto done;
2247 p = PyUnicode_AS_UNICODE(v);
2248 while (size-- > 0) {
2249 Py_UNICODE ch = *s++;
2250 PyObject *w, *x;
2251
2252 /* Get mapping */
2253 w = PyInt_FromLong(ch);
2254 if (w == NULL)
2255 goto onError;
2256 x = PyObject_GetItem(mapping, w);
2257 Py_DECREF(w);
2258 if (x == NULL) {
2259 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2260 /* No mapping found: default to 1-1 mapping */
2261 PyErr_Clear();
2262 *p++ = ch;
2263 continue;
2264 }
2265 goto onError;
2266 }
2267
2268 /* Apply mapping */
2269 if (PyInt_Check(x))
2270 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2271 else if (x == Py_None) {
2272 /* undefined mapping */
2273 if (translate_error(&s, &p, errors,
2274 "character maps to <undefined>")) {
2275 Py_DECREF(x);
2276 goto onError;
2277 }
2278 }
2279 else if (PyUnicode_Check(x)) {
2280 if (PyUnicode_GET_SIZE(x) != 1) {
2281 /* 1-n mapping */
2282 PyErr_SetString(PyExc_NotImplementedError,
2283 "1-n mappings are currently not implemented");
2284 Py_DECREF(x);
2285 goto onError;
2286 }
2287 *p++ = *PyUnicode_AS_UNICODE(x);
2288 }
2289 else {
2290 /* wrong return value */
2291 PyErr_SetString(PyExc_TypeError,
2292 "translate mapping must return integer, None or unicode");
2293 Py_DECREF(x);
2294 goto onError;
2295 }
2296 Py_DECREF(x);
2297 }
2298 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002299 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2300 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 done:
2303 return (PyObject *)v;
2304
2305 onError:
2306 Py_XDECREF(v);
2307 return NULL;
2308}
2309
2310PyObject *PyUnicode_Translate(PyObject *str,
2311 PyObject *mapping,
2312 const char *errors)
2313{
2314 PyObject *result;
2315
2316 str = PyUnicode_FromObject(str);
2317 if (str == NULL)
2318 goto onError;
2319 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2320 PyUnicode_GET_SIZE(str),
2321 mapping,
2322 errors);
2323 Py_DECREF(str);
2324 return result;
2325
2326 onError:
2327 Py_XDECREF(str);
2328 return NULL;
2329}
2330
Guido van Rossum9e896b32000-04-05 20:11:21 +00002331/* --- Decimal Encoder ---------------------------------------------------- */
2332
2333int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2334 int length,
2335 char *output,
2336 const char *errors)
2337{
2338 Py_UNICODE *p, *end;
2339
2340 if (output == NULL) {
2341 PyErr_BadArgument();
2342 return -1;
2343 }
2344
2345 p = s;
2346 end = s + length;
2347 while (p < end) {
2348 register Py_UNICODE ch = *p++;
2349 int decimal;
2350
2351 if (Py_UNICODE_ISSPACE(ch)) {
2352 *output++ = ' ';
2353 continue;
2354 }
2355 decimal = Py_UNICODE_TODECIMAL(ch);
2356 if (decimal >= 0) {
2357 *output++ = '0' + decimal;
2358 continue;
2359 }
Guido van Rossumba477042000-04-06 18:18:10 +00002360 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002361 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002362 continue;
2363 }
2364 /* All other characters are considered invalid */
2365 if (errors == NULL || strcmp(errors, "strict") == 0) {
2366 PyErr_SetString(PyExc_ValueError,
2367 "invalid decimal Unicode string");
2368 goto onError;
2369 }
2370 else if (strcmp(errors, "ignore") == 0)
2371 continue;
2372 else if (strcmp(errors, "replace") == 0) {
2373 *output++ = '?';
2374 continue;
2375 }
2376 }
2377 /* 0-terminate the output string */
2378 *output++ = '\0';
2379 return 0;
2380
2381 onError:
2382 return -1;
2383}
2384
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385/* --- Helpers ------------------------------------------------------------ */
2386
2387static
2388int count(PyUnicodeObject *self,
2389 int start,
2390 int end,
2391 PyUnicodeObject *substring)
2392{
2393 int count = 0;
2394
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002395 if (substring->length == 0)
2396 return (end - start + 1);
2397
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 end -= substring->length;
2399
2400 while (start <= end)
2401 if (Py_UNICODE_MATCH(self, start, substring)) {
2402 count++;
2403 start += substring->length;
2404 } else
2405 start++;
2406
2407 return count;
2408}
2409
2410int PyUnicode_Count(PyObject *str,
2411 PyObject *substr,
2412 int start,
2413 int end)
2414{
2415 int result;
2416
2417 str = PyUnicode_FromObject(str);
2418 if (str == NULL)
2419 return -1;
2420 substr = PyUnicode_FromObject(substr);
2421 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002422 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 return -1;
2424 }
2425
2426 result = count((PyUnicodeObject *)str,
2427 start, end,
2428 (PyUnicodeObject *)substr);
2429
2430 Py_DECREF(str);
2431 Py_DECREF(substr);
2432 return result;
2433}
2434
2435static
2436int findstring(PyUnicodeObject *self,
2437 PyUnicodeObject *substring,
2438 int start,
2439 int end,
2440 int direction)
2441{
2442 if (start < 0)
2443 start += self->length;
2444 if (start < 0)
2445 start = 0;
2446
2447 if (substring->length == 0)
2448 return start;
2449
2450 if (end > self->length)
2451 end = self->length;
2452 if (end < 0)
2453 end += self->length;
2454 if (end < 0)
2455 end = 0;
2456
2457 end -= substring->length;
2458
2459 if (direction < 0) {
2460 for (; end >= start; end--)
2461 if (Py_UNICODE_MATCH(self, end, substring))
2462 return end;
2463 } else {
2464 for (; start <= end; start++)
2465 if (Py_UNICODE_MATCH(self, start, substring))
2466 return start;
2467 }
2468
2469 return -1;
2470}
2471
2472int PyUnicode_Find(PyObject *str,
2473 PyObject *substr,
2474 int start,
2475 int end,
2476 int direction)
2477{
2478 int result;
2479
2480 str = PyUnicode_FromObject(str);
2481 if (str == NULL)
2482 return -1;
2483 substr = PyUnicode_FromObject(substr);
2484 if (substr == NULL) {
2485 Py_DECREF(substr);
2486 return -1;
2487 }
2488
2489 result = findstring((PyUnicodeObject *)str,
2490 (PyUnicodeObject *)substr,
2491 start, end, direction);
2492 Py_DECREF(str);
2493 Py_DECREF(substr);
2494 return result;
2495}
2496
2497static
2498int tailmatch(PyUnicodeObject *self,
2499 PyUnicodeObject *substring,
2500 int start,
2501 int end,
2502 int direction)
2503{
2504 if (start < 0)
2505 start += self->length;
2506 if (start < 0)
2507 start = 0;
2508
2509 if (substring->length == 0)
2510 return 1;
2511
2512 if (end > self->length)
2513 end = self->length;
2514 if (end < 0)
2515 end += self->length;
2516 if (end < 0)
2517 end = 0;
2518
2519 end -= substring->length;
2520 if (end < start)
2521 return 0;
2522
2523 if (direction > 0) {
2524 if (Py_UNICODE_MATCH(self, end, substring))
2525 return 1;
2526 } else {
2527 if (Py_UNICODE_MATCH(self, start, substring))
2528 return 1;
2529 }
2530
2531 return 0;
2532}
2533
2534int PyUnicode_Tailmatch(PyObject *str,
2535 PyObject *substr,
2536 int start,
2537 int end,
2538 int direction)
2539{
2540 int result;
2541
2542 str = PyUnicode_FromObject(str);
2543 if (str == NULL)
2544 return -1;
2545 substr = PyUnicode_FromObject(substr);
2546 if (substr == NULL) {
2547 Py_DECREF(substr);
2548 return -1;
2549 }
2550
2551 result = tailmatch((PyUnicodeObject *)str,
2552 (PyUnicodeObject *)substr,
2553 start, end, direction);
2554 Py_DECREF(str);
2555 Py_DECREF(substr);
2556 return result;
2557}
2558
2559static
2560const Py_UNICODE *findchar(const Py_UNICODE *s,
2561 int size,
2562 Py_UNICODE ch)
2563{
2564 /* like wcschr, but doesn't stop at NULL characters */
2565
2566 while (size-- > 0) {
2567 if (*s == ch)
2568 return s;
2569 s++;
2570 }
2571
2572 return NULL;
2573}
2574
2575/* Apply fixfct filter to the Unicode object self and return a
2576 reference to the modified object */
2577
2578static
2579PyObject *fixup(PyUnicodeObject *self,
2580 int (*fixfct)(PyUnicodeObject *s))
2581{
2582
2583 PyUnicodeObject *u;
2584
2585 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2586 self->length);
2587 if (u == NULL)
2588 return NULL;
2589 if (!fixfct(u)) {
2590 /* fixfct should return TRUE if it modified the buffer. If
2591 FALSE, return a reference to the original buffer instead
2592 (to save space, not time) */
2593 Py_INCREF(self);
2594 Py_DECREF(u);
2595 return (PyObject*) self;
2596 }
2597 return (PyObject*) u;
2598}
2599
2600static
2601int fixupper(PyUnicodeObject *self)
2602{
2603 int len = self->length;
2604 Py_UNICODE *s = self->str;
2605 int status = 0;
2606
2607 while (len-- > 0) {
2608 register Py_UNICODE ch;
2609
2610 ch = Py_UNICODE_TOUPPER(*s);
2611 if (ch != *s) {
2612 status = 1;
2613 *s = ch;
2614 }
2615 s++;
2616 }
2617
2618 return status;
2619}
2620
2621static
2622int fixlower(PyUnicodeObject *self)
2623{
2624 int len = self->length;
2625 Py_UNICODE *s = self->str;
2626 int status = 0;
2627
2628 while (len-- > 0) {
2629 register Py_UNICODE ch;
2630
2631 ch = Py_UNICODE_TOLOWER(*s);
2632 if (ch != *s) {
2633 status = 1;
2634 *s = ch;
2635 }
2636 s++;
2637 }
2638
2639 return status;
2640}
2641
2642static
2643int fixswapcase(PyUnicodeObject *self)
2644{
2645 int len = self->length;
2646 Py_UNICODE *s = self->str;
2647 int status = 0;
2648
2649 while (len-- > 0) {
2650 if (Py_UNICODE_ISUPPER(*s)) {
2651 *s = Py_UNICODE_TOLOWER(*s);
2652 status = 1;
2653 } else if (Py_UNICODE_ISLOWER(*s)) {
2654 *s = Py_UNICODE_TOUPPER(*s);
2655 status = 1;
2656 }
2657 s++;
2658 }
2659
2660 return status;
2661}
2662
2663static
2664int fixcapitalize(PyUnicodeObject *self)
2665{
2666 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2667 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2668 return 1;
2669 }
2670 return 0;
2671}
2672
2673static
2674int fixtitle(PyUnicodeObject *self)
2675{
2676 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2677 register Py_UNICODE *e;
2678 int previous_is_cased;
2679
2680 /* Shortcut for single character strings */
2681 if (PyUnicode_GET_SIZE(self) == 1) {
2682 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2683 if (*p != ch) {
2684 *p = ch;
2685 return 1;
2686 }
2687 else
2688 return 0;
2689 }
2690
2691 e = p + PyUnicode_GET_SIZE(self);
2692 previous_is_cased = 0;
2693 for (; p < e; p++) {
2694 register const Py_UNICODE ch = *p;
2695
2696 if (previous_is_cased)
2697 *p = Py_UNICODE_TOLOWER(ch);
2698 else
2699 *p = Py_UNICODE_TOTITLE(ch);
2700
2701 if (Py_UNICODE_ISLOWER(ch) ||
2702 Py_UNICODE_ISUPPER(ch) ||
2703 Py_UNICODE_ISTITLE(ch))
2704 previous_is_cased = 1;
2705 else
2706 previous_is_cased = 0;
2707 }
2708 return 1;
2709}
2710
2711PyObject *PyUnicode_Join(PyObject *separator,
2712 PyObject *seq)
2713{
2714 Py_UNICODE *sep;
2715 int seplen;
2716 PyUnicodeObject *res = NULL;
2717 int reslen = 0;
2718 Py_UNICODE *p;
2719 int seqlen = 0;
2720 int sz = 100;
2721 int i;
2722
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002723 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 if (seqlen < 0 && PyErr_Occurred())
2725 return NULL;
2726
2727 if (separator == NULL) {
2728 Py_UNICODE blank = ' ';
2729 sep = &blank;
2730 seplen = 1;
2731 }
2732 else {
2733 separator = PyUnicode_FromObject(separator);
2734 if (separator == NULL)
2735 return NULL;
2736 sep = PyUnicode_AS_UNICODE(separator);
2737 seplen = PyUnicode_GET_SIZE(separator);
2738 }
2739
2740 res = _PyUnicode_New(sz);
2741 if (res == NULL)
2742 goto onError;
2743 p = PyUnicode_AS_UNICODE(res);
2744 reslen = 0;
2745
2746 for (i = 0; i < seqlen; i++) {
2747 int itemlen;
2748 PyObject *item;
2749
2750 item = PySequence_GetItem(seq, i);
2751 if (item == NULL)
2752 goto onError;
2753 if (!PyUnicode_Check(item)) {
2754 PyObject *v;
2755 v = PyUnicode_FromObject(item);
2756 Py_DECREF(item);
2757 item = v;
2758 if (item == NULL)
2759 goto onError;
2760 }
2761 itemlen = PyUnicode_GET_SIZE(item);
2762 while (reslen + itemlen + seplen >= sz) {
2763 if (_PyUnicode_Resize(res, sz*2))
2764 goto onError;
2765 sz *= 2;
2766 p = PyUnicode_AS_UNICODE(res) + reslen;
2767 }
2768 if (i > 0) {
2769 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2770 p += seplen;
2771 reslen += seplen;
2772 }
2773 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2774 p += itemlen;
2775 reslen += itemlen;
2776 Py_DECREF(item);
2777 }
2778 if (_PyUnicode_Resize(res, reslen))
2779 goto onError;
2780
2781 Py_XDECREF(separator);
2782 return (PyObject *)res;
2783
2784 onError:
2785 Py_XDECREF(separator);
2786 Py_DECREF(res);
2787 return NULL;
2788}
2789
2790static
2791PyUnicodeObject *pad(PyUnicodeObject *self,
2792 int left,
2793 int right,
2794 Py_UNICODE fill)
2795{
2796 PyUnicodeObject *u;
2797
2798 if (left < 0)
2799 left = 0;
2800 if (right < 0)
2801 right = 0;
2802
2803 if (left == 0 && right == 0) {
2804 Py_INCREF(self);
2805 return self;
2806 }
2807
2808 u = _PyUnicode_New(left + self->length + right);
2809 if (u) {
2810 if (left)
2811 Py_UNICODE_FILL(u->str, fill, left);
2812 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2813 if (right)
2814 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2815 }
2816
2817 return u;
2818}
2819
/* Append the slice data[left:right] as a new Unicode object to list.

   Expects the expanding function to provide a PyObject *str scratch
   variable, the PyObject *list being built and an onError label, which
   is jumped to if creating or appending the slice fails.  Wrapped in
   do { } while (0) so that it behaves as a single statement; use it
   with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2830
2831static
2832PyObject *split_whitespace(PyUnicodeObject *self,
2833 PyObject *list,
2834 int maxcount)
2835{
2836 register int i;
2837 register int j;
2838 int len = self->length;
2839 PyObject *str;
2840
2841 for (i = j = 0; i < len; ) {
2842 /* find a token */
2843 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2844 i++;
2845 j = i;
2846 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2847 i++;
2848 if (j < i) {
2849 if (maxcount-- <= 0)
2850 break;
2851 SPLIT_APPEND(self->str, j, i);
2852 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2853 i++;
2854 j = i;
2855 }
2856 }
2857 if (j < len) {
2858 SPLIT_APPEND(self->str, j, len);
2859 }
2860 return list;
2861
2862 onError:
2863 Py_DECREF(list);
2864 return NULL;
2865}
2866
2867PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002868 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869{
2870 register int i;
2871 register int j;
2872 int len;
2873 PyObject *list;
2874 PyObject *str;
2875 Py_UNICODE *data;
2876
2877 string = PyUnicode_FromObject(string);
2878 if (string == NULL)
2879 return NULL;
2880 data = PyUnicode_AS_UNICODE(string);
2881 len = PyUnicode_GET_SIZE(string);
2882
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 list = PyList_New(0);
2884 if (!list)
2885 goto onError;
2886
2887 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002888 int eol;
2889
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 /* Find a line and append it */
2891 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2892 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893
2894 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002895 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 if (i < len) {
2897 if (data[i] == '\r' && i + 1 < len &&
2898 data[i+1] == '\n')
2899 i += 2;
2900 else
2901 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002902 if (keepends)
2903 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 }
Guido van Rossum86662912000-04-11 15:38:46 +00002905 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 j = i;
2907 }
2908 if (j < len) {
2909 SPLIT_APPEND(data, j, len);
2910 }
2911
2912 Py_DECREF(string);
2913 return list;
2914
2915 onError:
2916 Py_DECREF(list);
2917 Py_DECREF(string);
2918 return NULL;
2919}
2920
2921static
2922PyObject *split_char(PyUnicodeObject *self,
2923 PyObject *list,
2924 Py_UNICODE ch,
2925 int maxcount)
2926{
2927 register int i;
2928 register int j;
2929 int len = self->length;
2930 PyObject *str;
2931
2932 for (i = j = 0; i < len; ) {
2933 if (self->str[i] == ch) {
2934 if (maxcount-- <= 0)
2935 break;
2936 SPLIT_APPEND(self->str, j, i);
2937 i = j = i + 1;
2938 } else
2939 i++;
2940 }
2941 if (j <= len) {
2942 SPLIT_APPEND(self->str, j, len);
2943 }
2944 return list;
2945
2946 onError:
2947 Py_DECREF(list);
2948 return NULL;
2949}
2950
2951static
2952PyObject *split_substring(PyUnicodeObject *self,
2953 PyObject *list,
2954 PyUnicodeObject *substring,
2955 int maxcount)
2956{
2957 register int i;
2958 register int j;
2959 int len = self->length;
2960 int sublen = substring->length;
2961 PyObject *str;
2962
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002963 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 if (Py_UNICODE_MATCH(self, i, substring)) {
2965 if (maxcount-- <= 0)
2966 break;
2967 SPLIT_APPEND(self->str, j, i);
2968 i = j = i + sublen;
2969 } else
2970 i++;
2971 }
2972 if (j <= len) {
2973 SPLIT_APPEND(self->str, j, len);
2974 }
2975 return list;
2976
2977 onError:
2978 Py_DECREF(list);
2979 return NULL;
2980}
2981
2982#undef SPLIT_APPEND
2983
2984static
2985PyObject *split(PyUnicodeObject *self,
2986 PyUnicodeObject *substring,
2987 int maxcount)
2988{
2989 PyObject *list;
2990
2991 if (maxcount < 0)
2992 maxcount = INT_MAX;
2993
2994 list = PyList_New(0);
2995 if (!list)
2996 return NULL;
2997
2998 if (substring == NULL)
2999 return split_whitespace(self,list,maxcount);
3000
3001 else if (substring->length == 1)
3002 return split_char(self,list,substring->str[0],maxcount);
3003
3004 else if (substring->length == 0) {
3005 Py_DECREF(list);
3006 PyErr_SetString(PyExc_ValueError, "empty separator");
3007 return NULL;
3008 }
3009 else
3010 return split_substring(self,list,substring,maxcount);
3011}
3012
3013static
3014PyObject *strip(PyUnicodeObject *self,
3015 int left,
3016 int right)
3017{
3018 Py_UNICODE *p = self->str;
3019 int start = 0;
3020 int end = self->length;
3021
3022 if (left)
3023 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3024 start++;
3025
3026 if (right)
3027 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3028 end--;
3029
3030 if (start == 0 && end == self->length) {
3031 /* couldn't strip anything off, return original string */
3032 Py_INCREF(self);
3033 return (PyObject*) self;
3034 }
3035
3036 return (PyObject*) PyUnicode_FromUnicode(
3037 self->str + start,
3038 end - start
3039 );
3040}
3041
3042static
3043PyObject *replace(PyUnicodeObject *self,
3044 PyUnicodeObject *str1,
3045 PyUnicodeObject *str2,
3046 int maxcount)
3047{
3048 PyUnicodeObject *u;
3049
3050 if (maxcount < 0)
3051 maxcount = INT_MAX;
3052
3053 if (str1->length == 1 && str2->length == 1) {
3054 int i;
3055
3056 /* replace characters */
3057 if (!findchar(self->str, self->length, str1->str[0])) {
3058 /* nothing to replace, return original string */
3059 Py_INCREF(self);
3060 u = self;
3061 } else {
3062 Py_UNICODE u1 = str1->str[0];
3063 Py_UNICODE u2 = str2->str[0];
3064
3065 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3066 self->str,
3067 self->length
3068 );
3069 if (u)
3070 for (i = 0; i < u->length; i++)
3071 if (u->str[i] == u1) {
3072 if (--maxcount < 0)
3073 break;
3074 u->str[i] = u2;
3075 }
3076 }
3077
3078 } else {
3079 int n, i;
3080 Py_UNICODE *p;
3081
3082 /* replace strings */
3083 n = count(self, 0, self->length, str1);
3084 if (n > maxcount)
3085 n = maxcount;
3086 if (n == 0) {
3087 /* nothing to replace, return original string */
3088 Py_INCREF(self);
3089 u = self;
3090 } else {
3091 u = _PyUnicode_New(
3092 self->length + n * (str2->length - str1->length));
3093 if (u) {
3094 i = 0;
3095 p = u->str;
3096 while (i <= self->length - str1->length)
3097 if (Py_UNICODE_MATCH(self, i, str1)) {
3098 /* replace string segment */
3099 Py_UNICODE_COPY(p, str2->str, str2->length);
3100 p += str2->length;
3101 i += str1->length;
3102 if (--n <= 0) {
3103 /* copy remaining part */
3104 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3105 break;
3106 }
3107 } else
3108 *p++ = self->str[i++];
3109 }
3110 }
3111 }
3112
3113 return (PyObject *) u;
3114}
3115
3116/* --- Unicode Object Methods --------------------------------------------- */
3117
3118static char title__doc__[] =
3119"S.title() -> unicode\n\
3120\n\
3121Return a titlecased version of S, i.e. words start with title case\n\
3122characters, all remaining cased characters have lower case.";
3123
3124static PyObject*
3125unicode_title(PyUnicodeObject *self, PyObject *args)
3126{
3127 if (!PyArg_NoArgs(args))
3128 return NULL;
3129 return fixup(self, fixtitle);
3130}
3131
3132static char capitalize__doc__[] =
3133"S.capitalize() -> unicode\n\
3134\n\
3135Return a capitalized version of S, i.e. make the first character\n\
3136have upper case.";
3137
3138static PyObject*
3139unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3140{
3141 if (!PyArg_NoArgs(args))
3142 return NULL;
3143 return fixup(self, fixcapitalize);
3144}
3145
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split S at whitespace,
   capitalize every word via fixup()/fixcapitalize and join the words
   again with the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3186
3187static char center__doc__[] =
3188"S.center(width) -> unicode\n\
3189\n\
3190Return S centered in a Unicode string of length width. Padding is done\n\
3191using spaces.";
3192
3193static PyObject *
3194unicode_center(PyUnicodeObject *self, PyObject *args)
3195{
3196 int marg, left;
3197 int width;
3198
3199 if (!PyArg_ParseTuple(args, "i:center", &width))
3200 return NULL;
3201
3202 if (self->length >= width) {
3203 Py_INCREF(self);
3204 return (PyObject*) self;
3205 }
3206
3207 marg = width - self->length;
3208 left = marg / 2 + (marg & width & 1);
3209
3210 return (PyObject*) pad(self, left, marg - left, ' ');
3211}
3212
Marc-André Lemburge5034372000-08-08 08:04:29 +00003213#if 0
3214
3215/* This code should go into some future Unicode collation support
3216 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003217 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003218
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003219/* speedy UTF-16 code point order comparison */
3220/* gleaned from: */
3221/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3222
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003223static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003224{
3225 0, 0, 0, 0, 0, 0, 0, 0,
3226 0, 0, 0, 0, 0, 0, 0, 0,
3227 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003228 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003229};
3230
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231static int
3232unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3233{
3234 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003235
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 Py_UNICODE *s1 = str1->str;
3237 Py_UNICODE *s2 = str2->str;
3238
3239 len1 = str1->length;
3240 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003241
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003243 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003244 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003245
3246 c1 = *s1++;
3247 c2 = *s2++;
3248 if (c1 > (1<<11) * 26)
3249 c1 += utf16Fixup[c1>>11];
3250 if (c2 > (1<<11) * 26)
3251 c2 += utf16Fixup[c2>>11];
3252
3253 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003254 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003255 if (diff)
3256 return (diff < 0) ? -1 : (diff != 0);
3257 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 }
3259
3260 return (len1 < len2) ? -1 : (len1 != len2);
3261}
3262
Marc-André Lemburge5034372000-08-08 08:04:29 +00003263#else
3264
3265static int
3266unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3267{
3268 register int len1, len2;
3269
3270 Py_UNICODE *s1 = str1->str;
3271 Py_UNICODE *s2 = str2->str;
3272
3273 len1 = str1->length;
3274 len2 = str2->length;
3275
3276 while (len1 > 0 && len2 > 0) {
3277 register long diff;
3278
3279 diff = (long)*s1++ - (long)*s2++;
3280 if (diff)
3281 return (diff < 0) ? -1 : (diff != 0);
3282 len1--; len2--;
3283 }
3284
3285 return (len1 < len2) ? -1 : (len1 != len2);
3286}
3287
3288#endif
3289
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290int PyUnicode_Compare(PyObject *left,
3291 PyObject *right)
3292{
3293 PyUnicodeObject *u = NULL, *v = NULL;
3294 int result;
3295
3296 /* Coerce the two arguments */
3297 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3298 if (u == NULL)
3299 goto onError;
3300 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3301 if (v == NULL)
3302 goto onError;
3303
Thomas Wouters7e474022000-07-16 12:04:32 +00003304 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 if (v == u) {
3306 Py_DECREF(u);
3307 Py_DECREF(v);
3308 return 0;
3309 }
3310
3311 result = unicode_compare(u, v);
3312
3313 Py_DECREF(u);
3314 Py_DECREF(v);
3315 return result;
3316
3317onError:
3318 Py_XDECREF(u);
3319 Py_XDECREF(v);
3320 return -1;
3321}
3322
Guido van Rossum403d68b2000-03-13 15:55:09 +00003323int PyUnicode_Contains(PyObject *container,
3324 PyObject *element)
3325{
3326 PyUnicodeObject *u = NULL, *v = NULL;
3327 int result;
3328 register const Py_UNICODE *p, *e;
3329 register Py_UNICODE ch;
3330
3331 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003332 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003333 if (v == NULL) {
3334 PyErr_SetString(PyExc_TypeError,
3335 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003336 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003337 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003338 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3339 if (u == NULL) {
3340 Py_DECREF(v);
3341 goto onError;
3342 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003343
3344 /* Check v in u */
3345 if (PyUnicode_GET_SIZE(v) != 1) {
3346 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003347 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003348 goto onError;
3349 }
3350 ch = *PyUnicode_AS_UNICODE(v);
3351 p = PyUnicode_AS_UNICODE(u);
3352 e = p + PyUnicode_GET_SIZE(u);
3353 result = 0;
3354 while (p < e) {
3355 if (*p++ == ch) {
3356 result = 1;
3357 break;
3358 }
3359 }
3360
3361 Py_DECREF(u);
3362 Py_DECREF(v);
3363 return result;
3364
3365onError:
3366 Py_XDECREF(u);
3367 Py_XDECREF(v);
3368 return -1;
3369}
3370
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371/* Concat to string or Unicode object giving a new Unicode object. */
3372
3373PyObject *PyUnicode_Concat(PyObject *left,
3374 PyObject *right)
3375{
3376 PyUnicodeObject *u = NULL, *v = NULL, *w;
3377
3378 /* Coerce the two arguments */
3379 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3380 if (u == NULL)
3381 goto onError;
3382 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3383 if (v == NULL)
3384 goto onError;
3385
3386 /* Shortcuts */
3387 if (v == unicode_empty) {
3388 Py_DECREF(v);
3389 return (PyObject *)u;
3390 }
3391 if (u == unicode_empty) {
3392 Py_DECREF(u);
3393 return (PyObject *)v;
3394 }
3395
3396 /* Concat the two Unicode strings */
3397 w = _PyUnicode_New(u->length + v->length);
3398 if (w == NULL)
3399 goto onError;
3400 Py_UNICODE_COPY(w->str, u->str, u->length);
3401 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3402
3403 Py_DECREF(u);
3404 Py_DECREF(v);
3405 return (PyObject *)w;
3406
3407onError:
3408 Py_XDECREF(u);
3409 Py_XDECREF(v);
3410 return NULL;
3411}
3412
3413static char count__doc__[] =
3414"S.count(sub[, start[, end]]) -> int\n\
3415\n\
3416Return the number of occurrences of substring sub in Unicode string\n\
3417S[start:end]. Optional arguments start and end are\n\
3418interpreted as in slice notation.";
3419
3420static PyObject *
3421unicode_count(PyUnicodeObject *self, PyObject *args)
3422{
3423 PyUnicodeObject *substring;
3424 int start = 0;
3425 int end = INT_MAX;
3426 PyObject *result;
3427
Guido van Rossumb8872e62000-05-09 14:14:27 +00003428 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3429 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 return NULL;
3431
3432 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3433 (PyObject *)substring);
3434 if (substring == NULL)
3435 return NULL;
3436
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 if (start < 0)
3438 start += self->length;
3439 if (start < 0)
3440 start = 0;
3441 if (end > self->length)
3442 end = self->length;
3443 if (end < 0)
3444 end += self->length;
3445 if (end < 0)
3446 end = 0;
3447
3448 result = PyInt_FromLong((long) count(self, start, end, substring));
3449
3450 Py_DECREF(substring);
3451 return result;
3452}
3453
3454static char encode__doc__[] =
3455"S.encode([encoding[,errors]]) -> string\n\
3456\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003457Return an encoded string version of S. Default encoding is the current\n\
3458default string encoding. errors may be given to set a different error\n\
3459handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3460a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461
3462static PyObject *
3463unicode_encode(PyUnicodeObject *self, PyObject *args)
3464{
3465 char *encoding = NULL;
3466 char *errors = NULL;
3467 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3468 return NULL;
3469 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3470}
3471
3472static char expandtabs__doc__[] =
3473"S.expandtabs([tabsize]) -> unicode\n\
3474\n\
3475Return a copy of S where all tab characters are expanded using spaces.\n\
3476If tabsize is not given, a tab size of 8 characters is assumed.";
3477
3478static PyObject*
3479unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3480{
3481 Py_UNICODE *e;
3482 Py_UNICODE *p;
3483 Py_UNICODE *q;
3484 int i, j;
3485 PyUnicodeObject *u;
3486 int tabsize = 8;
3487
3488 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3489 return NULL;
3490
Thomas Wouters7e474022000-07-16 12:04:32 +00003491 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 i = j = 0;
3493 e = self->str + self->length;
3494 for (p = self->str; p < e; p++)
3495 if (*p == '\t') {
3496 if (tabsize > 0)
3497 j += tabsize - (j % tabsize);
3498 }
3499 else {
3500 j++;
3501 if (*p == '\n' || *p == '\r') {
3502 i += j;
3503 j = 0;
3504 }
3505 }
3506
3507 /* Second pass: create output string and fill it */
3508 u = _PyUnicode_New(i + j);
3509 if (!u)
3510 return NULL;
3511
3512 j = 0;
3513 q = u->str;
3514
3515 for (p = self->str; p < e; p++)
3516 if (*p == '\t') {
3517 if (tabsize > 0) {
3518 i = tabsize - (j % tabsize);
3519 j += i;
3520 while (i--)
3521 *q++ = ' ';
3522 }
3523 }
3524 else {
3525 j++;
3526 *q++ = *p;
3527 if (*p == '\n' || *p == '\r')
3528 j = 0;
3529 }
3530
3531 return (PyObject*) u;
3532}
3533
3534static char find__doc__[] =
3535"S.find(sub [,start [,end]]) -> int\n\
3536\n\
3537Return the lowest index in S where substring sub is found,\n\
3538such that sub is contained within s[start,end]. Optional\n\
3539arguments start and end are interpreted as in slice notation.\n\
3540\n\
3541Return -1 on failure.";
3542
3543static PyObject *
3544unicode_find(PyUnicodeObject *self, PyObject *args)
3545{
3546 PyUnicodeObject *substring;
3547 int start = 0;
3548 int end = INT_MAX;
3549 PyObject *result;
3550
Guido van Rossumb8872e62000-05-09 14:14:27 +00003551 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3552 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 return NULL;
3554 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3555 (PyObject *)substring);
3556 if (substring == NULL)
3557 return NULL;
3558
3559 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3560
3561 Py_DECREF(substring);
3562 return result;
3563}
3564
3565static PyObject *
3566unicode_getitem(PyUnicodeObject *self, int index)
3567{
3568 if (index < 0 || index >= self->length) {
3569 PyErr_SetString(PyExc_IndexError, "string index out of range");
3570 return NULL;
3571 }
3572
3573 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3574}
3575
3576static long
3577unicode_hash(PyUnicodeObject *self)
3578{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003579 /* Since Unicode objects compare equal to their ASCII string
3580 counterparts, they should use the individual character values
3581 as basis for their hash value. This is needed to assure that
3582 strings and Unicode objects behave in the same way as
3583 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584
Fredrik Lundhdde61642000-07-10 18:27:47 +00003585 register int len;
3586 register Py_UNICODE *p;
3587 register long x;
3588
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 if (self->hash != -1)
3590 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003591 len = PyUnicode_GET_SIZE(self);
3592 p = PyUnicode_AS_UNICODE(self);
3593 x = *p << 7;
3594 while (--len >= 0)
3595 x = (1000003*x) ^ *p++;
3596 x ^= PyUnicode_GET_SIZE(self);
3597 if (x == -1)
3598 x = -2;
3599 self->hash = x;
3600 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601}
3602
3603static char index__doc__[] =
3604"S.index(sub [,start [,end]]) -> int\n\
3605\n\
3606Like S.find() but raise ValueError when the substring is not found.";
3607
3608static PyObject *
3609unicode_index(PyUnicodeObject *self, PyObject *args)
3610{
3611 int result;
3612 PyUnicodeObject *substring;
3613 int start = 0;
3614 int end = INT_MAX;
3615
Guido van Rossumb8872e62000-05-09 14:14:27 +00003616 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3617 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 return NULL;
3619
3620 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3621 (PyObject *)substring);
3622 if (substring == NULL)
3623 return NULL;
3624
3625 result = findstring(self, substring, start, end, 1);
3626
3627 Py_DECREF(substring);
3628 if (result < 0) {
3629 PyErr_SetString(PyExc_ValueError, "substring not found");
3630 return NULL;
3631 }
3632 return PyInt_FromLong(result);
3633}
3634
3635static char islower__doc__[] =
3636"S.islower() -> int\n\
3637\n\
3638Return 1 if all cased characters in S are lowercase and there is\n\
3639at least one cased character in S, 0 otherwise.";
3640
3641static PyObject*
3642unicode_islower(PyUnicodeObject *self, PyObject *args)
3643{
3644 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3645 register const Py_UNICODE *e;
3646 int cased;
3647
3648 if (!PyArg_NoArgs(args))
3649 return NULL;
3650
3651 /* Shortcut for single character strings */
3652 if (PyUnicode_GET_SIZE(self) == 1)
3653 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3654
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003655 /* Special case for empty strings */
3656 if (PyString_GET_SIZE(self) == 0)
3657 return PyInt_FromLong(0);
3658
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 e = p + PyUnicode_GET_SIZE(self);
3660 cased = 0;
3661 for (; p < e; p++) {
3662 register const Py_UNICODE ch = *p;
3663
3664 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3665 return PyInt_FromLong(0);
3666 else if (!cased && Py_UNICODE_ISLOWER(ch))
3667 cased = 1;
3668 }
3669 return PyInt_FromLong(cased);
3670}
3671
3672static char isupper__doc__[] =
3673"S.isupper() -> int\n\
3674\n\
3675Return 1 if all cased characters in S are uppercase and there is\n\
3676at least one cased character in S, 0 otherwise.";
3677
3678static PyObject*
3679unicode_isupper(PyUnicodeObject *self, PyObject *args)
3680{
3681 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3682 register const Py_UNICODE *e;
3683 int cased;
3684
3685 if (!PyArg_NoArgs(args))
3686 return NULL;
3687
3688 /* Shortcut for single character strings */
3689 if (PyUnicode_GET_SIZE(self) == 1)
3690 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3691
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003692 /* Special case for empty strings */
3693 if (PyString_GET_SIZE(self) == 0)
3694 return PyInt_FromLong(0);
3695
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 e = p + PyUnicode_GET_SIZE(self);
3697 cased = 0;
3698 for (; p < e; p++) {
3699 register const Py_UNICODE ch = *p;
3700
3701 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3702 return PyInt_FromLong(0);
3703 else if (!cased && Py_UNICODE_ISUPPER(ch))
3704 cased = 1;
3705 }
3706 return PyInt_FromLong(cased);
3707}
3708
3709static char istitle__doc__[] =
3710"S.istitle() -> int\n\
3711\n\
3712Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3713may only follow uncased characters and lowercase characters only cased\n\
3714ones. Return 0 otherwise.";
3715
3716static PyObject*
3717unicode_istitle(PyUnicodeObject *self, PyObject *args)
3718{
3719 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3720 register const Py_UNICODE *e;
3721 int cased, previous_is_cased;
3722
3723 if (!PyArg_NoArgs(args))
3724 return NULL;
3725
3726 /* Shortcut for single character strings */
3727 if (PyUnicode_GET_SIZE(self) == 1)
3728 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3729 (Py_UNICODE_ISUPPER(*p) != 0));
3730
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003731 /* Special case for empty strings */
3732 if (PyString_GET_SIZE(self) == 0)
3733 return PyInt_FromLong(0);
3734
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 e = p + PyUnicode_GET_SIZE(self);
3736 cased = 0;
3737 previous_is_cased = 0;
3738 for (; p < e; p++) {
3739 register const Py_UNICODE ch = *p;
3740
3741 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3742 if (previous_is_cased)
3743 return PyInt_FromLong(0);
3744 previous_is_cased = 1;
3745 cased = 1;
3746 }
3747 else if (Py_UNICODE_ISLOWER(ch)) {
3748 if (!previous_is_cased)
3749 return PyInt_FromLong(0);
3750 previous_is_cased = 1;
3751 cased = 1;
3752 }
3753 else
3754 previous_is_cased = 0;
3755 }
3756 return PyInt_FromLong(cased);
3757}
3758
3759static char isspace__doc__[] =
3760"S.isspace() -> int\n\
3761\n\
3762Return 1 if there are only whitespace characters in S,\n\
37630 otherwise.";
3764
3765static PyObject*
3766unicode_isspace(PyUnicodeObject *self, PyObject *args)
3767{
3768 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3769 register const Py_UNICODE *e;
3770
3771 if (!PyArg_NoArgs(args))
3772 return NULL;
3773
3774 /* Shortcut for single character strings */
3775 if (PyUnicode_GET_SIZE(self) == 1 &&
3776 Py_UNICODE_ISSPACE(*p))
3777 return PyInt_FromLong(1);
3778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003779 /* Special case for empty strings */
3780 if (PyString_GET_SIZE(self) == 0)
3781 return PyInt_FromLong(0);
3782
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 e = p + PyUnicode_GET_SIZE(self);
3784 for (; p < e; p++) {
3785 if (!Py_UNICODE_ISSPACE(*p))
3786 return PyInt_FromLong(0);
3787 }
3788 return PyInt_FromLong(1);
3789}
3790
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003791static char isalpha__doc__[] =
3792"S.isalpha() -> int\n\
3793\n\
3794Return 1 if all characters in S are alphabetic\n\
3795and there is at least one character in S, 0 otherwise.";
3796
3797static PyObject*
3798unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3799{
3800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3801 register const Py_UNICODE *e;
3802
3803 if (!PyArg_NoArgs(args))
3804 return NULL;
3805
3806 /* Shortcut for single character strings */
3807 if (PyUnicode_GET_SIZE(self) == 1 &&
3808 Py_UNICODE_ISALPHA(*p))
3809 return PyInt_FromLong(1);
3810
3811 /* Special case for empty strings */
3812 if (PyString_GET_SIZE(self) == 0)
3813 return PyInt_FromLong(0);
3814
3815 e = p + PyUnicode_GET_SIZE(self);
3816 for (; p < e; p++) {
3817 if (!Py_UNICODE_ISALPHA(*p))
3818 return PyInt_FromLong(0);
3819 }
3820 return PyInt_FromLong(1);
3821}
3822
3823static char isalnum__doc__[] =
3824"S.isalnum() -> int\n\
3825\n\
3826Return 1 if all characters in S are alphanumeric\n\
3827and there is at least one character in S, 0 otherwise.";
3828
3829static PyObject*
3830unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3831{
3832 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3833 register const Py_UNICODE *e;
3834
3835 if (!PyArg_NoArgs(args))
3836 return NULL;
3837
3838 /* Shortcut for single character strings */
3839 if (PyUnicode_GET_SIZE(self) == 1 &&
3840 Py_UNICODE_ISALNUM(*p))
3841 return PyInt_FromLong(1);
3842
3843 /* Special case for empty strings */
3844 if (PyString_GET_SIZE(self) == 0)
3845 return PyInt_FromLong(0);
3846
3847 e = p + PyUnicode_GET_SIZE(self);
3848 for (; p < e; p++) {
3849 if (!Py_UNICODE_ISALNUM(*p))
3850 return PyInt_FromLong(0);
3851 }
3852 return PyInt_FromLong(1);
3853}
3854
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855static char isdecimal__doc__[] =
3856"S.isdecimal() -> int\n\
3857\n\
3858Return 1 if there are only decimal characters in S,\n\
38590 otherwise.";
3860
3861static PyObject*
3862unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3863{
3864 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3865 register const Py_UNICODE *e;
3866
3867 if (!PyArg_NoArgs(args))
3868 return NULL;
3869
3870 /* Shortcut for single character strings */
3871 if (PyUnicode_GET_SIZE(self) == 1 &&
3872 Py_UNICODE_ISDECIMAL(*p))
3873 return PyInt_FromLong(1);
3874
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003875 /* Special case for empty strings */
3876 if (PyString_GET_SIZE(self) == 0)
3877 return PyInt_FromLong(0);
3878
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879 e = p + PyUnicode_GET_SIZE(self);
3880 for (; p < e; p++) {
3881 if (!Py_UNICODE_ISDECIMAL(*p))
3882 return PyInt_FromLong(0);
3883 }
3884 return PyInt_FromLong(1);
3885}
3886
3887static char isdigit__doc__[] =
3888"S.isdigit() -> int\n\
3889\n\
3890Return 1 if there are only digit characters in S,\n\
38910 otherwise.";
3892
3893static PyObject*
3894unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3895{
3896 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3897 register const Py_UNICODE *e;
3898
3899 if (!PyArg_NoArgs(args))
3900 return NULL;
3901
3902 /* Shortcut for single character strings */
3903 if (PyUnicode_GET_SIZE(self) == 1 &&
3904 Py_UNICODE_ISDIGIT(*p))
3905 return PyInt_FromLong(1);
3906
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003907 /* Special case for empty strings */
3908 if (PyString_GET_SIZE(self) == 0)
3909 return PyInt_FromLong(0);
3910
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 e = p + PyUnicode_GET_SIZE(self);
3912 for (; p < e; p++) {
3913 if (!Py_UNICODE_ISDIGIT(*p))
3914 return PyInt_FromLong(0);
3915 }
3916 return PyInt_FromLong(1);
3917}
3918
3919static char isnumeric__doc__[] =
3920"S.isnumeric() -> int\n\
3921\n\
3922Return 1 if there are only numeric characters in S,\n\
39230 otherwise.";
3924
3925static PyObject*
3926unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3927{
3928 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3929 register const Py_UNICODE *e;
3930
3931 if (!PyArg_NoArgs(args))
3932 return NULL;
3933
3934 /* Shortcut for single character strings */
3935 if (PyUnicode_GET_SIZE(self) == 1 &&
3936 Py_UNICODE_ISNUMERIC(*p))
3937 return PyInt_FromLong(1);
3938
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003939 /* Special case for empty strings */
3940 if (PyString_GET_SIZE(self) == 0)
3941 return PyInt_FromLong(0);
3942
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 e = p + PyUnicode_GET_SIZE(self);
3944 for (; p < e; p++) {
3945 if (!Py_UNICODE_ISNUMERIC(*p))
3946 return PyInt_FromLong(0);
3947 }
3948 return PyInt_FromLong(1);
3949}
3950
3951static char join__doc__[] =
3952"S.join(sequence) -> unicode\n\
3953\n\
3954Return a string which is the concatenation of the strings in the\n\
3955sequence. The separator between elements is S.";
3956
3957static PyObject*
3958unicode_join(PyUnicodeObject *self, PyObject *args)
3959{
3960 PyObject *data;
3961 if (!PyArg_ParseTuple(args, "O:join", &data))
3962 return NULL;
3963
3964 return PyUnicode_Join((PyObject *)self, data);
3965}
3966
3967static int
3968unicode_length(PyUnicodeObject *self)
3969{
3970 return self->length;
3971}
3972
3973static char ljust__doc__[] =
3974"S.ljust(width) -> unicode\n\
3975\n\
3976Return S left justified in a Unicode string of length width. Padding is\n\
3977done using spaces.";
3978
3979static PyObject *
3980unicode_ljust(PyUnicodeObject *self, PyObject *args)
3981{
3982 int width;
3983 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3984 return NULL;
3985
3986 if (self->length >= width) {
3987 Py_INCREF(self);
3988 return (PyObject*) self;
3989 }
3990
3991 return (PyObject*) pad(self, 0, width - self->length, ' ');
3992}
3993
3994static char lower__doc__[] =
3995"S.lower() -> unicode\n\
3996\n\
3997Return a copy of the string S converted to lowercase.";
3998
3999static PyObject*
4000unicode_lower(PyUnicodeObject *self, PyObject *args)
4001{
4002 if (!PyArg_NoArgs(args))
4003 return NULL;
4004 return fixup(self, fixlower);
4005}
4006
4007static char lstrip__doc__[] =
4008"S.lstrip() -> unicode\n\
4009\n\
4010Return a copy of the string S with leading whitespace removed.";
4011
4012static PyObject *
4013unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4014{
4015 if (!PyArg_NoArgs(args))
4016 return NULL;
4017 return strip(self, 1, 0);
4018}
4019
4020static PyObject*
4021unicode_repeat(PyUnicodeObject *str, int len)
4022{
4023 PyUnicodeObject *u;
4024 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004025 int nchars;
4026 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027
4028 if (len < 0)
4029 len = 0;
4030
4031 if (len == 1) {
4032 /* no repeat, return original string */
4033 Py_INCREF(str);
4034 return (PyObject*) str;
4035 }
Tim Peters8f422462000-09-09 06:13:41 +00004036
4037 /* ensure # of chars needed doesn't overflow int and # of bytes
4038 * needed doesn't overflow size_t
4039 */
4040 nchars = len * str->length;
4041 if (len && nchars / len != str->length) {
4042 PyErr_SetString(PyExc_OverflowError,
4043 "repeated string is too long");
4044 return NULL;
4045 }
4046 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4047 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4048 PyErr_SetString(PyExc_OverflowError,
4049 "repeated string is too long");
4050 return NULL;
4051 }
4052 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 if (!u)
4054 return NULL;
4055
4056 p = u->str;
4057
4058 while (len-- > 0) {
4059 Py_UNICODE_COPY(p, str->str, str->length);
4060 p += str->length;
4061 }
4062
4063 return (PyObject*) u;
4064}
4065
4066PyObject *PyUnicode_Replace(PyObject *obj,
4067 PyObject *subobj,
4068 PyObject *replobj,
4069 int maxcount)
4070{
4071 PyObject *self;
4072 PyObject *str1;
4073 PyObject *str2;
4074 PyObject *result;
4075
4076 self = PyUnicode_FromObject(obj);
4077 if (self == NULL)
4078 return NULL;
4079 str1 = PyUnicode_FromObject(subobj);
4080 if (str1 == NULL) {
4081 Py_DECREF(self);
4082 return NULL;
4083 }
4084 str2 = PyUnicode_FromObject(replobj);
4085 if (str2 == NULL) {
4086 Py_DECREF(self);
4087 Py_DECREF(str1);
4088 return NULL;
4089 }
4090 result = replace((PyUnicodeObject *)self,
4091 (PyUnicodeObject *)str1,
4092 (PyUnicodeObject *)str2,
4093 maxcount);
4094 Py_DECREF(self);
4095 Py_DECREF(str1);
4096 Py_DECREF(str2);
4097 return result;
4098}
4099
4100static char replace__doc__[] =
4101"S.replace (old, new[, maxsplit]) -> unicode\n\
4102\n\
4103Return a copy of S with all occurrences of substring\n\
4104old replaced by new. If the optional argument maxsplit is\n\
4105given, only the first maxsplit occurrences are replaced.";
4106
4107static PyObject*
4108unicode_replace(PyUnicodeObject *self, PyObject *args)
4109{
4110 PyUnicodeObject *str1;
4111 PyUnicodeObject *str2;
4112 int maxcount = -1;
4113 PyObject *result;
4114
4115 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4116 return NULL;
4117 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4118 if (str1 == NULL)
4119 return NULL;
4120 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4121 if (str2 == NULL)
4122 return NULL;
4123
4124 result = replace(self, str1, str2, maxcount);
4125
4126 Py_DECREF(str1);
4127 Py_DECREF(str2);
4128 return result;
4129}
4130
4131static
4132PyObject *unicode_repr(PyObject *unicode)
4133{
4134 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4135 PyUnicode_GET_SIZE(unicode),
4136 1);
4137}
4138
4139static char rfind__doc__[] =
4140"S.rfind(sub [,start [,end]]) -> int\n\
4141\n\
4142Return the highest index in S where substring sub is found,\n\
4143such that sub is contained within s[start,end]. Optional\n\
4144arguments start and end are interpreted as in slice notation.\n\
4145\n\
4146Return -1 on failure.";
4147
4148static PyObject *
4149unicode_rfind(PyUnicodeObject *self, PyObject *args)
4150{
4151 PyUnicodeObject *substring;
4152 int start = 0;
4153 int end = INT_MAX;
4154 PyObject *result;
4155
Guido van Rossumb8872e62000-05-09 14:14:27 +00004156 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4157 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158 return NULL;
4159 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4160 (PyObject *)substring);
4161 if (substring == NULL)
4162 return NULL;
4163
4164 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4165
4166 Py_DECREF(substring);
4167 return result;
4168}
4169
4170static char rindex__doc__[] =
4171"S.rindex(sub [,start [,end]]) -> int\n\
4172\n\
4173Like S.rfind() but raise ValueError when the substring is not found.";
4174
4175static PyObject *
4176unicode_rindex(PyUnicodeObject *self, PyObject *args)
4177{
4178 int result;
4179 PyUnicodeObject *substring;
4180 int start = 0;
4181 int end = INT_MAX;
4182
Guido van Rossumb8872e62000-05-09 14:14:27 +00004183 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4184 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 return NULL;
4186 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4187 (PyObject *)substring);
4188 if (substring == NULL)
4189 return NULL;
4190
4191 result = findstring(self, substring, start, end, -1);
4192
4193 Py_DECREF(substring);
4194 if (result < 0) {
4195 PyErr_SetString(PyExc_ValueError, "substring not found");
4196 return NULL;
4197 }
4198 return PyInt_FromLong(result);
4199}
4200
4201static char rjust__doc__[] =
4202"S.rjust(width) -> unicode\n\
4203\n\
4204Return S right justified in a Unicode string of length width. Padding is\n\
4205done using spaces.";
4206
4207static PyObject *
4208unicode_rjust(PyUnicodeObject *self, PyObject *args)
4209{
4210 int width;
4211 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4212 return NULL;
4213
4214 if (self->length >= width) {
4215 Py_INCREF(self);
4216 return (PyObject*) self;
4217 }
4218
4219 return (PyObject*) pad(self, width - self->length, 0, ' ');
4220}
4221
4222static char rstrip__doc__[] =
4223"S.rstrip() -> unicode\n\
4224\n\
4225Return a copy of the string S with trailing whitespace removed.";
4226
4227static PyObject *
4228unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4229{
4230 if (!PyArg_NoArgs(args))
4231 return NULL;
4232 return strip(self, 0, 1);
4233}
4234
4235static PyObject*
4236unicode_slice(PyUnicodeObject *self, int start, int end)
4237{
4238 /* standard clamping */
4239 if (start < 0)
4240 start = 0;
4241 if (end < 0)
4242 end = 0;
4243 if (end > self->length)
4244 end = self->length;
4245 if (start == 0 && end == self->length) {
4246 /* full slice, return original string */
4247 Py_INCREF(self);
4248 return (PyObject*) self;
4249 }
4250 if (start > end)
4251 start = end;
4252 /* copy slice */
4253 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4254 end - start);
4255}
4256
4257PyObject *PyUnicode_Split(PyObject *s,
4258 PyObject *sep,
4259 int maxsplit)
4260{
4261 PyObject *result;
4262
4263 s = PyUnicode_FromObject(s);
4264 if (s == NULL)
4265 return NULL;
4266 if (sep != NULL) {
4267 sep = PyUnicode_FromObject(sep);
4268 if (sep == NULL) {
4269 Py_DECREF(s);
4270 return NULL;
4271 }
4272 }
4273
4274 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4275
4276 Py_DECREF(s);
4277 Py_XDECREF(sep);
4278 return result;
4279}
4280
4281static char split__doc__[] =
4282"S.split([sep [,maxsplit]]) -> list of strings\n\
4283\n\
4284Return a list of the words in S, using sep as the\n\
4285delimiter string. If maxsplit is given, at most maxsplit\n\
4286splits are done. If sep is not specified, any whitespace string\n\
4287is a separator.";
4288
4289static PyObject*
4290unicode_split(PyUnicodeObject *self, PyObject *args)
4291{
4292 PyObject *substring = Py_None;
4293 int maxcount = -1;
4294
4295 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4296 return NULL;
4297
4298 if (substring == Py_None)
4299 return split(self, NULL, maxcount);
4300 else if (PyUnicode_Check(substring))
4301 return split(self, (PyUnicodeObject *)substring, maxcount);
4302 else
4303 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4304}
4305
/* Doc string registered for the "splitlines" entry of unicode_methods.
   The signature line shows a single optional argument, keepends, matching
   the "|i:splitlines" format parsed by unicode_splitlines(). */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312
4313static PyObject*
4314unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4315{
Guido van Rossum86662912000-04-11 15:38:46 +00004316 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317
Guido van Rossum86662912000-04-11 15:38:46 +00004318 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319 return NULL;
4320
Guido van Rossum86662912000-04-11 15:38:46 +00004321 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322}
4323
4324static
4325PyObject *unicode_str(PyUnicodeObject *self)
4326{
Fred Drakee4315f52000-05-09 19:53:39 +00004327 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328}
4329
/* Doc string registered for the "strip" entry of unicode_methods. */
static char strip__doc__[] =
    "S.strip() -> unicode\n"
    "\n"
    "Return a copy of S with leading and trailing whitespace removed.";
4334
4335static PyObject *
4336unicode_strip(PyUnicodeObject *self, PyObject *args)
4337{
4338 if (!PyArg_NoArgs(args))
4339 return NULL;
4340 return strip(self, 1, 1);
4341}
4342
/* Doc string registered for the "swapcase" entry of unicode_methods. */
static char swapcase__doc__[] =
    "S.swapcase() -> unicode\n"
    "\n"
    "Return a copy of S with uppercase characters converted to lowercase\n"
    "and vice versa.";
4348
4349static PyObject*
4350unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4351{
4352 if (!PyArg_NoArgs(args))
4353 return NULL;
4354 return fixup(self, fixswapcase);
4355}
4356
/* Doc string registered for the "translate" entry of unicode_methods. */
static char translate__doc__[] =
    "S.translate(table) -> unicode\n"
    "\n"
    "Return a copy of the string S, where all characters have been mapped\n"
    "through the given translation table, which must be a mapping of\n"
    "Unicode ordinals to Unicode ordinals or None. Unmapped characters\n"
    "are left untouched. Characters mapped to None are deleted.";
4364
4365static PyObject*
4366unicode_translate(PyUnicodeObject *self, PyObject *args)
4367{
4368 PyObject *table;
4369
4370 if (!PyArg_ParseTuple(args, "O:translate", &table))
4371 return NULL;
4372 return PyUnicode_TranslateCharmap(self->str,
4373 self->length,
4374 table,
4375 "ignore");
4376}
4377
/* Doc string registered for the "upper" entry of unicode_methods. */
static char upper__doc__[] =
    "S.upper() -> unicode\n"
    "\n"
    "Return a copy of S converted to uppercase.";
4382
4383static PyObject*
4384unicode_upper(PyUnicodeObject *self, PyObject *args)
4385{
4386 if (!PyArg_NoArgs(args))
4387 return NULL;
4388 return fixup(self, fixupper);
4389}
4390
#if 0
/* Disabled (compiled out): S.zfill(width). */
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Left-pad self with '0' characters up to `width`.

   If self is already at least `width` long, a new reference to self is
   returned unchanged.  Otherwise pad() builds the padded copy; when the
   first original character is a sign ('+' or '-') it is moved in front of
   the zeros.  Returns NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() allocates a new object; do not touch u->str if that failed. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4426
#if 0
/* Disabled debugging helper: accepts no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4436
/* Doc string registered for the "startswith" entry of unicode_methods. */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S starts with the specified prefix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4443
4444static PyObject *
4445unicode_startswith(PyUnicodeObject *self,
4446 PyObject *args)
4447{
4448 PyUnicodeObject *substring;
4449 int start = 0;
4450 int end = INT_MAX;
4451 PyObject *result;
4452
Guido van Rossumb8872e62000-05-09 14:14:27 +00004453 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4454 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 return NULL;
4456 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4457 (PyObject *)substring);
4458 if (substring == NULL)
4459 return NULL;
4460
4461 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4462
4463 Py_DECREF(substring);
4464 return result;
4465}
4466
4467
/* Doc string registered for the "endswith" entry of unicode_methods. */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S ends with the specified suffix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4474
4475static PyObject *
4476unicode_endswith(PyUnicodeObject *self,
4477 PyObject *args)
4478{
4479 PyUnicodeObject *substring;
4480 int start = 0;
4481 int end = INT_MAX;
4482 PyObject *result;
4483
Guido van Rossumb8872e62000-05-09 14:14:27 +00004484 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4485 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 return NULL;
4487 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4488 (PyObject *)substring);
4489 if (substring == NULL)
4490 return NULL;
4491
4492 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4493
4494 Py_DECREF(substring);
4495 return result;
4496}
4497
4498
/* Method table for unicode objects; unicode_getattr() passes it to
   Py_FindMethod() to resolve attribute names.

   Each entry is {name, C handler, flag, doc string}.  Among the handlers
   defined in this part of the file, those that parse their arguments with
   PyArg_ParseTuple() are registered with flag 1 and those that use
   PyArg_NoArgs() with flag 0.  The final {NULL, NULL} entry closes the
   table. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled together with their (compiled-out) implementations. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4551
4552static PyObject *
4553unicode_getattr(PyUnicodeObject *self, char *name)
4554{
4555 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4556}
4557
/* Sequence slots for unicode objects, installed as tp_as_sequence of
   PyUnicode_Type.  Both assignment slots (sq_ass_item, sq_ass_slice) are
   left as 0, i.e. no handler is provided for item or slice assignment. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4568
4569static int
4570unicode_buffer_getreadbuf(PyUnicodeObject *self,
4571 int index,
4572 const void **ptr)
4573{
4574 if (index != 0) {
4575 PyErr_SetString(PyExc_SystemError,
4576 "accessing non-existent unicode segment");
4577 return -1;
4578 }
4579 *ptr = (void *) self->str;
4580 return PyUnicode_GET_DATA_SIZE(self);
4581}
4582
4583static int
4584unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4585 const void **ptr)
4586{
4587 PyErr_SetString(PyExc_TypeError,
4588 "cannot use unicode as modifyable buffer");
4589 return -1;
4590}
4591
4592static int
4593unicode_buffer_getsegcount(PyUnicodeObject *self,
4594 int *lenp)
4595{
4596 if (lenp)
4597 *lenp = PyUnicode_GET_DATA_SIZE(self);
4598 return 1;
4599}
4600
4601static int
4602unicode_buffer_getcharbuf(PyUnicodeObject *self,
4603 int index,
4604 const void **ptr)
4605{
4606 PyObject *str;
4607
4608 if (index != 0) {
4609 PyErr_SetString(PyExc_SystemError,
4610 "accessing non-existent unicode segment");
4611 return -1;
4612 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004613 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614 if (str == NULL)
4615 return -1;
4616 *ptr = (void *) PyString_AS_STRING(str);
4617 return PyString_GET_SIZE(str);
4618}
4619
4620/* Helpers for PyUnicode_Format() */
4621
4622static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004623getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624{
4625 int argidx = *p_argidx;
4626 if (argidx < arglen) {
4627 (*p_argidx)++;
4628 if (arglen < 0)
4629 return args;
4630 else
4631 return PyTuple_GetItem(args, argidx);
4632 }
4633 PyErr_SetString(PyExc_TypeError,
4634 "not enough arguments for format string");
4635 return NULL;
4636}
4637
/* Flag bits accumulated in `flags` while PyUnicode_Format() scans the
   characters that follow a '%'. */
#define F_LJUST (1<<0) /* '-' seen (also set for a negative '*' width) */
#define F_SIGN (1<<1) /* '+' seen */
#define F_BLANK (1<<2) /* ' ' seen */
#define F_ALT (1<<3) /* '#' seen */
#define F_ZERO (1<<4) /* '0' seen */
4643
4644static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646{
4647 register int i;
4648 int len;
4649 va_list va;
4650 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652
4653 /* First, format the string as char array, then expand to Py_UNICODE
4654 array. */
4655 charbuffer = (char *)buffer;
4656 len = vsprintf(charbuffer, format, va);
4657 for (i = len - 1; i >= 0; i--)
4658 buffer[i] = (Py_UNICODE) charbuffer[i];
4659
4660 va_end(va);
4661 return len;
4662}
4663
4664static int
4665formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004666 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 int flags,
4668 int prec,
4669 int type,
4670 PyObject *v)
4671{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004672 /* fmt = '%#.' + `prec` + `type`
4673 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 char fmt[20];
4675 double x;
4676
4677 x = PyFloat_AsDouble(v);
4678 if (x == -1.0 && PyErr_Occurred())
4679 return -1;
4680 if (prec < 0)
4681 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4683 type = 'g';
4684 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004685 /* worst case length calc to ensure no buffer overrun:
4686 fmt = %#.<prec>g
4687 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4688 for any double rep.)
4689 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4690 If prec=0 the effective precision is 1 (the leading digit is
4691 always given), therefore increase by one to 10+prec. */
4692 if (buflen <= (size_t)10 + (size_t)prec) {
4693 PyErr_SetString(PyExc_OverflowError,
4694 "formatted float is too long (precision too long?)");
4695 return -1;
4696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 return usprintf(buf, fmt, x);
4698}
4699
Tim Peters38fd5b62000-09-21 05:43:11 +00004700static PyObject*
4701formatlong(PyObject *val, int flags, int prec, int type)
4702{
4703 char *buf;
4704 int i, len;
4705 PyObject *str; /* temporary string object. */
4706 PyUnicodeObject *result;
4707
4708 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4709 if (!str)
4710 return NULL;
4711 result = _PyUnicode_New(len);
4712 for (i = 0; i < len; i++)
4713 result->str[i] = buf[i];
4714 result->str[len] = 0;
4715 Py_DECREF(str);
4716 return (PyObject*)result;
4717}
4718
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719static int
4720formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004721 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 int flags,
4723 int prec,
4724 int type,
4725 PyObject *v)
4726{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004727 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004728 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4729 + 1 + 1 = 24*/
4730 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 long x;
4732
4733 x = PyInt_AsLong(v);
4734 if (x == -1 && PyErr_Occurred())
4735 return -1;
4736 if (prec < 0)
4737 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004738 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4739 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4740 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4741 PyErr_SetString(PyExc_OverflowError,
4742 "formatted integer is too long (precision too long?)");
4743 return -1;
4744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4746 return usprintf(buf, fmt, x);
4747}
4748
4749static int
4750formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004751 size_t buflen,
4752 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004754 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004755 if (PyUnicode_Check(v)) {
4756 if (PyUnicode_GET_SIZE(v) != 1)
4757 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004761 else if (PyString_Check(v)) {
4762 if (PyString_GET_SIZE(v) != 1)
4763 goto onError;
4764 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766
4767 else {
4768 /* Integer input truncated to a character */
4769 long x;
4770 x = PyInt_AsLong(v);
4771 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004772 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 buf[0] = (char) x;
4774 }
4775 buf[1] = '\0';
4776 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004777
4778 onError:
4779 PyErr_SetString(PyExc_TypeError,
4780 "%c requires int or char");
4781 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782}
4783
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, counted in Py_UNICODE elements, of the
   buffer in which the floats, ints, & chars are formatted.  XXX This is
   a magic number.  Each formatting routine does bounds checking to
   ensure no overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
4793
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794PyObject *PyUnicode_Format(PyObject *format,
4795 PyObject *args)
4796{
4797 Py_UNICODE *fmt, *res;
4798 int fmtcnt, rescnt, reslen, arglen, argidx;
4799 int args_owned = 0;
4800 PyUnicodeObject *result = NULL;
4801 PyObject *dict = NULL;
4802 PyObject *uformat;
4803
4804 if (format == NULL || args == NULL) {
4805 PyErr_BadInternalCall();
4806 return NULL;
4807 }
4808 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004809 if (uformat == NULL)
4810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 fmt = PyUnicode_AS_UNICODE(uformat);
4812 fmtcnt = PyUnicode_GET_SIZE(uformat);
4813
4814 reslen = rescnt = fmtcnt + 100;
4815 result = _PyUnicode_New(reslen);
4816 if (result == NULL)
4817 goto onError;
4818 res = PyUnicode_AS_UNICODE(result);
4819
4820 if (PyTuple_Check(args)) {
4821 arglen = PyTuple_Size(args);
4822 argidx = 0;
4823 }
4824 else {
4825 arglen = -1;
4826 argidx = -2;
4827 }
4828 if (args->ob_type->tp_as_mapping)
4829 dict = args;
4830
4831 while (--fmtcnt >= 0) {
4832 if (*fmt != '%') {
4833 if (--rescnt < 0) {
4834 rescnt = fmtcnt + 100;
4835 reslen += rescnt;
4836 if (_PyUnicode_Resize(result, reslen) < 0)
4837 return NULL;
4838 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4839 --rescnt;
4840 }
4841 *res++ = *fmt++;
4842 }
4843 else {
4844 /* Got a format specifier */
4845 int flags = 0;
4846 int width = -1;
4847 int prec = -1;
4848 int size = 0;
4849 Py_UNICODE c = '\0';
4850 Py_UNICODE fill;
4851 PyObject *v = NULL;
4852 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004853 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 Py_UNICODE sign;
4855 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004856 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
4858 fmt++;
4859 if (*fmt == '(') {
4860 Py_UNICODE *keystart;
4861 int keylen;
4862 PyObject *key;
4863 int pcount = 1;
4864
4865 if (dict == NULL) {
4866 PyErr_SetString(PyExc_TypeError,
4867 "format requires a mapping");
4868 goto onError;
4869 }
4870 ++fmt;
4871 --fmtcnt;
4872 keystart = fmt;
4873 /* Skip over balanced parentheses */
4874 while (pcount > 0 && --fmtcnt >= 0) {
4875 if (*fmt == ')')
4876 --pcount;
4877 else if (*fmt == '(')
4878 ++pcount;
4879 fmt++;
4880 }
4881 keylen = fmt - keystart - 1;
4882 if (fmtcnt < 0 || pcount > 0) {
4883 PyErr_SetString(PyExc_ValueError,
4884 "incomplete format key");
4885 goto onError;
4886 }
Fred Drakee4315f52000-05-09 19:53:39 +00004887 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 then looked up since Python uses strings to hold
4889 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004890 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 key = PyUnicode_EncodeUTF8(keystart,
4892 keylen,
4893 NULL);
4894 if (key == NULL)
4895 goto onError;
4896 if (args_owned) {
4897 Py_DECREF(args);
4898 args_owned = 0;
4899 }
4900 args = PyObject_GetItem(dict, key);
4901 Py_DECREF(key);
4902 if (args == NULL) {
4903 goto onError;
4904 }
4905 args_owned = 1;
4906 arglen = -1;
4907 argidx = -2;
4908 }
4909 while (--fmtcnt >= 0) {
4910 switch (c = *fmt++) {
4911 case '-': flags |= F_LJUST; continue;
4912 case '+': flags |= F_SIGN; continue;
4913 case ' ': flags |= F_BLANK; continue;
4914 case '#': flags |= F_ALT; continue;
4915 case '0': flags |= F_ZERO; continue;
4916 }
4917 break;
4918 }
4919 if (c == '*') {
4920 v = getnextarg(args, arglen, &argidx);
4921 if (v == NULL)
4922 goto onError;
4923 if (!PyInt_Check(v)) {
4924 PyErr_SetString(PyExc_TypeError,
4925 "* wants int");
4926 goto onError;
4927 }
4928 width = PyInt_AsLong(v);
4929 if (width < 0) {
4930 flags |= F_LJUST;
4931 width = -width;
4932 }
4933 if (--fmtcnt >= 0)
4934 c = *fmt++;
4935 }
4936 else if (c >= '0' && c <= '9') {
4937 width = c - '0';
4938 while (--fmtcnt >= 0) {
4939 c = *fmt++;
4940 if (c < '0' || c > '9')
4941 break;
4942 if ((width*10) / 10 != width) {
4943 PyErr_SetString(PyExc_ValueError,
4944 "width too big");
4945 goto onError;
4946 }
4947 width = width*10 + (c - '0');
4948 }
4949 }
4950 if (c == '.') {
4951 prec = 0;
4952 if (--fmtcnt >= 0)
4953 c = *fmt++;
4954 if (c == '*') {
4955 v = getnextarg(args, arglen, &argidx);
4956 if (v == NULL)
4957 goto onError;
4958 if (!PyInt_Check(v)) {
4959 PyErr_SetString(PyExc_TypeError,
4960 "* wants int");
4961 goto onError;
4962 }
4963 prec = PyInt_AsLong(v);
4964 if (prec < 0)
4965 prec = 0;
4966 if (--fmtcnt >= 0)
4967 c = *fmt++;
4968 }
4969 else if (c >= '0' && c <= '9') {
4970 prec = c - '0';
4971 while (--fmtcnt >= 0) {
4972 c = Py_CHARMASK(*fmt++);
4973 if (c < '0' || c > '9')
4974 break;
4975 if ((prec*10) / 10 != prec) {
4976 PyErr_SetString(PyExc_ValueError,
4977 "prec too big");
4978 goto onError;
4979 }
4980 prec = prec*10 + (c - '0');
4981 }
4982 }
4983 } /* prec */
4984 if (fmtcnt >= 0) {
4985 if (c == 'h' || c == 'l' || c == 'L') {
4986 size = c;
4987 if (--fmtcnt >= 0)
4988 c = *fmt++;
4989 }
4990 }
4991 if (fmtcnt < 0) {
4992 PyErr_SetString(PyExc_ValueError,
4993 "incomplete format");
4994 goto onError;
4995 }
4996 if (c != '%') {
4997 v = getnextarg(args, arglen, &argidx);
4998 if (v == NULL)
4999 goto onError;
5000 }
5001 sign = 0;
5002 fill = ' ';
5003 switch (c) {
5004
5005 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005006 pbuf = formatbuf;
5007 /* presume that buffer length is at least 1 */
5008 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 len = 1;
5010 break;
5011
5012 case 's':
5013 case 'r':
5014 if (PyUnicode_Check(v) && c == 's') {
5015 temp = v;
5016 Py_INCREF(temp);
5017 }
5018 else {
5019 PyObject *unicode;
5020 if (c == 's')
5021 temp = PyObject_Str(v);
5022 else
5023 temp = PyObject_Repr(v);
5024 if (temp == NULL)
5025 goto onError;
5026 if (!PyString_Check(temp)) {
5027 /* XXX Note: this should never happen, since
5028 PyObject_Repr() and PyObject_Str() assure
5029 this */
5030 Py_DECREF(temp);
5031 PyErr_SetString(PyExc_TypeError,
5032 "%s argument has non-string str()");
5033 goto onError;
5034 }
Fred Drakee4315f52000-05-09 19:53:39 +00005035 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005037 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038 "strict");
5039 Py_DECREF(temp);
5040 temp = unicode;
5041 if (temp == NULL)
5042 goto onError;
5043 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005044 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045 len = PyUnicode_GET_SIZE(temp);
5046 if (prec >= 0 && len > prec)
5047 len = prec;
5048 break;
5049
5050 case 'i':
5051 case 'd':
5052 case 'u':
5053 case 'o':
5054 case 'x':
5055 case 'X':
5056 if (c == 'i')
5057 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005058 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005059 temp = formatlong(v, flags, prec, c);
5060 if (!temp)
5061 goto onError;
5062 pbuf = PyUnicode_AS_UNICODE(temp);
5063 len = PyUnicode_GET_SIZE(temp);
5064 /* unbounded ints can always produce
5065 a sign character! */
5066 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005068 else {
5069 pbuf = formatbuf;
5070 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5071 flags, prec, c, v);
5072 if (len < 0)
5073 goto onError;
5074 /* only d conversion is signed */
5075 sign = c == 'd';
5076 }
5077 if (flags & F_ZERO)
5078 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 break;
5080
5081 case 'e':
5082 case 'E':
5083 case 'f':
5084 case 'g':
5085 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005086 pbuf = formatbuf;
5087 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5088 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 if (len < 0)
5090 goto onError;
5091 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005092 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 fill = '0';
5094 break;
5095
5096 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005097 pbuf = formatbuf;
5098 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 if (len < 0)
5100 goto onError;
5101 break;
5102
5103 default:
5104 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005105 "unsupported format character '%c' (0x%x) "
5106 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005107 (31<=c && c<=126) ? c : '?',
5108 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 goto onError;
5110 }
5111 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005112 if (*pbuf == '-' || *pbuf == '+') {
5113 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 len--;
5115 }
5116 else if (flags & F_SIGN)
5117 sign = '+';
5118 else if (flags & F_BLANK)
5119 sign = ' ';
5120 else
5121 sign = 0;
5122 }
5123 if (width < len)
5124 width = len;
5125 if (rescnt < width + (sign != 0)) {
5126 reslen -= rescnt;
5127 rescnt = width + fmtcnt + 100;
5128 reslen += rescnt;
5129 if (_PyUnicode_Resize(result, reslen) < 0)
5130 return NULL;
5131 res = PyUnicode_AS_UNICODE(result)
5132 + reslen - rescnt;
5133 }
5134 if (sign) {
5135 if (fill != ' ')
5136 *res++ = sign;
5137 rescnt--;
5138 if (width > len)
5139 width--;
5140 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005141 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5142 assert(pbuf[0] == '0');
5143 assert(pbuf[1] == c);
5144 if (fill != ' ') {
5145 *res++ = *pbuf++;
5146 *res++ = *pbuf++;
5147 }
5148 rescnt -= 2;
5149 width -= 2;
5150 if (width < 0)
5151 width = 0;
5152 len -= 2;
5153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 if (width > len && !(flags & F_LJUST)) {
5155 do {
5156 --rescnt;
5157 *res++ = fill;
5158 } while (--width > len);
5159 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005160 if (fill == ' ') {
5161 if (sign)
5162 *res++ = sign;
5163 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5164 assert(pbuf[0] == '0');
5165 assert(pbuf[1] == c);
5166 *res++ = *pbuf++;
5167 *res++ = *pbuf++;
5168 }
5169 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005170 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 res += len;
5172 rescnt -= len;
5173 while (--width >= len) {
5174 --rescnt;
5175 *res++ = ' ';
5176 }
5177 if (dict && (argidx < arglen) && c != '%') {
5178 PyErr_SetString(PyExc_TypeError,
5179 "not all arguments converted");
5180 goto onError;
5181 }
5182 Py_XDECREF(temp);
5183 } /* '%' */
5184 } /* until end */
5185 if (argidx < arglen && !dict) {
5186 PyErr_SetString(PyExc_TypeError,
5187 "not all arguments converted");
5188 goto onError;
5189 }
5190
5191 if (args_owned) {
5192 Py_DECREF(args);
5193 }
5194 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005195 if (_PyUnicode_Resize(result, reslen - rescnt))
5196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 return (PyObject *)result;
5198
5199 onError:
5200 Py_XDECREF(result);
5201 Py_DECREF(uformat);
5202 if (args_owned) {
5203 Py_DECREF(args);
5204 }
5205 return NULL;
5206}
5207
/* Buffer interface slots for unicode objects, installed as tp_as_buffer
   of PyUnicode_Type.  The entries are, in order, the read-buffer,
   write-buffer, segment-count and char-buffer handlers defined above. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5214
/* Type object for unicode strings.  It wires the slots to the
   implementations in this file: attribute lookup goes through
   unicode_getattr(), sequence behaviour through unicode_as_sequence and
   the buffer interface through unicode_as_buffer.  Slots given as 0 or
   NULL have no unicode-specific implementation. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, /* ob_size */
    "unicode", /* tp_name */
    sizeof(PyUnicodeObject), /* tp_size */
    0, /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)unicode_getattr, /* tp_getattr */
    0, /* tp_setattr */
    (cmpfunc) unicode_compare, /* tp_compare */
    (reprfunc) unicode_repr, /* tp_repr */
    0, /* tp_as_number */
    &unicode_as_sequence, /* tp_as_sequence */
    0, /* tp_as_mapping */
    (hashfunc) unicode_hash, /* tp_hash*/
    0, /* tp_call*/
    (reprfunc) unicode_str, /* tp_str */
    (getattrofunc) NULL, /* tp_getattro */
    (setattrofunc) NULL, /* tp_setattro */
    &unicode_as_buffer, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
};
5239
5240/* Initialize the Unicode implementation */
5241
Thomas Wouters78890102000-07-22 19:25:51 +00005242void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243{
5244 /* Doublecheck the configuration... */
5245 if (sizeof(Py_UNICODE) != 2)
5246 Py_FatalError("Unicode configuration error: "
5247 "sizeof(Py_UNICODE) != 2 bytes");
5248
Fred Drakee4315f52000-05-09 19:53:39 +00005249 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005250 unicode_freelist = NULL;
5251 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005253 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254}
5255
5256/* Finalize the Unicode implementation */
5257
5258void
Thomas Wouters78890102000-07-22 19:25:51 +00005259_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005261 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005263 Py_XDECREF(unicode_empty);
5264 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005265
5266 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 PyUnicodeObject *v = u;
5268 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005269 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005270 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005271 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005272 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005274 unicode_freelist = NULL;
5275 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276}