blob: f4dc9bfe7e60ea822edd5a1f8a2fb7c97f192ab0 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
#if defined(HAVE_LIMITS_H)
#include <limits.h>
#elif !defined(INT_MAX)
/* No <limits.h> available: assume a 32-bit int, unless another header
   has already provided INT_MAX (avoids a conflicting redefinition). */
#define INT_MAX 2147483647
#endif

#ifdef MS_WIN32
#include <windows.h>
#endif

/* Limit for the Unicode object free list */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
110
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000111/* --- Globals ------------------------------------------------------------
112
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
115
116*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
118/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000119static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000120
121/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124
Fred Drakee4315f52000-05-09 19:53:39 +0000125/* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
127
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
140{
141 void *oldstr;
142
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000143 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
152 }
153
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
162 }
163 unicode->str[length] = 0;
164 unicode->length = length;
165
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000166 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000168 if (unicode->defenc) {
169 Py_DECREF(unicode->defenc);
170 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000171 }
172 unicode->hash = -1;
173
174 return 0;
175}
176
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000177int PyUnicode_Resize(PyObject **unicode,
178 int length)
179{
180 PyUnicodeObject *v;
181
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
190 }
191 return _PyUnicode_Resize(v, length);
192}
193
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194/* We allocate one more byte to make sure the string is
195 Ux0000 terminated -- XXX is this needed ?
196
197 XXX This allocator could further be enhanced by assuring that the
198 free list never reduces its size below 1.
199
200*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205 register PyUnicodeObject *unicode;
206
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
211 }
212
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000216 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000219 /* Keep-Alive optimization: we only upsize the buffer,
220 never downsize it. */
221 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000223 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000224 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000227 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000229 }
230 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 }
232 else {
233 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
234 if (unicode == NULL)
235 return NULL;
236 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
237 }
238
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000239 if (!unicode->str) {
240 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000241 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 unicode->str[length] = 0;
244 unicode->length = length;
245 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000246 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000248
249 onError:
250 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000251 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253}
254
255static
256void _PyUnicode_Free(register PyUnicodeObject *unicode)
257{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000259 /* Keep-Alive optimization */
260 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = NULL;
263 unicode->length = 0;
264 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000265 if (unicode->defenc) {
266 Py_DECREF(unicode->defenc);
267 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000268 }
269 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 *(PyUnicodeObject **)unicode = unicode_freelist;
271 unicode_freelist = unicode;
272 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 }
274 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000275 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000277 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279}
280
281PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
282 int size)
283{
284 PyUnicodeObject *unicode;
285
286 unicode = _PyUnicode_New(size);
287 if (!unicode)
288 return NULL;
289
290 /* Copy the Unicode data into the new object */
291 if (u != NULL)
292 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
293
294 return (PyObject *)unicode;
295}
296
297#ifdef HAVE_WCHAR_H
298
299PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
300 int size)
301{
302 PyUnicodeObject *unicode;
303
304 if (w == NULL) {
305 PyErr_BadInternalCall();
306 return NULL;
307 }
308
309 unicode = _PyUnicode_New(size);
310 if (!unicode)
311 return NULL;
312
313 /* Copy the wchar_t data into the new object */
314#ifdef HAVE_USABLE_WCHAR_T
315 memcpy(unicode->str, w, size * sizeof(wchar_t));
316#else
317 {
318 register Py_UNICODE *u;
319 register int i;
320 u = PyUnicode_AS_UNICODE(unicode);
321 for (i = size; i >= 0; i--)
322 *u++ = *w++;
323 }
324#endif
325
326 return (PyObject *)unicode;
327}
328
329int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
330 register wchar_t *w,
331 int size)
332{
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
336 }
337 if (size > PyUnicode_GET_SIZE(unicode))
338 size = PyUnicode_GET_SIZE(unicode);
339#ifdef HAVE_USABLE_WCHAR_T
340 memcpy(w, unicode->str, size * sizeof(wchar_t));
341#else
342 {
343 register Py_UNICODE *u;
344 register int i;
345 u = PyUnicode_AS_UNICODE(unicode);
346 for (i = size; i >= 0; i--)
347 *w++ = *u++;
348 }
349#endif
350
351 return size;
352}
353
354#endif
355
356PyObject *PyUnicode_FromObject(register PyObject *obj)
357{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000358 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
359}
360
361PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
362 const char *encoding,
363 const char *errors)
364{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 const char *s;
366 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000367 int owned = 0;
368 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370 if (obj == NULL) {
371 PyErr_BadInternalCall();
372 return NULL;
373 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000374
375 /* Coerce object */
376 if (PyInstance_Check(obj)) {
377 PyObject *func;
378 func = PyObject_GetAttrString(obj, "__str__");
379 if (func == NULL) {
380 PyErr_SetString(PyExc_TypeError,
381 "coercing to Unicode: instance doesn't define __str__");
382 return NULL;
383 }
384 obj = PyEval_CallObject(func, NULL);
385 Py_DECREF(func);
386 if (obj == NULL)
387 return NULL;
388 owned = 1;
389 }
390 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000392 v = obj;
393 if (encoding) {
394 PyErr_SetString(PyExc_TypeError,
395 "decoding Unicode is not supported");
396 return NULL;
397 }
398 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400 else if (PyString_Check(obj)) {
401 s = PyString_AS_STRING(obj);
402 len = PyString_GET_SIZE(obj);
403 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000404 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
405 /* Overwrite the error message with something more useful in
406 case of a TypeError. */
407 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000408 PyErr_Format(PyExc_TypeError,
409 "coercing to Unicode: need string or buffer, "
410 "%.80s found",
411 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414
415 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 if (len == 0) {
417 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 else
421 v = PyUnicode_Decode(s, len, encoding, errors);
422 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return v;
427
428 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000429 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000430 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000431 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433}
434
435PyObject *PyUnicode_Decode(const char *s,
436 int size,
437 const char *encoding,
438 const char *errors)
439{
440 PyObject *buffer = NULL, *unicode;
441
Fred Drakee4315f52000-05-09 19:53:39 +0000442 if (encoding == NULL)
443 encoding = PyUnicode_GetDefaultEncoding();
444
445 /* Shortcuts for common default encodings */
446 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000448 else if (strcmp(encoding, "latin-1") == 0)
449 return PyUnicode_DecodeLatin1(s, size, errors);
450 else if (strcmp(encoding, "ascii") == 0)
451 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452
453 /* Decode via the codec registry */
454 buffer = PyBuffer_FromMemory((void *)s, size);
455 if (buffer == NULL)
456 goto onError;
457 unicode = PyCodec_Decode(buffer, encoding, errors);
458 if (unicode == NULL)
459 goto onError;
460 if (!PyUnicode_Check(unicode)) {
461 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000462 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 unicode->ob_type->tp_name);
464 Py_DECREF(unicode);
465 goto onError;
466 }
467 Py_DECREF(buffer);
468 return unicode;
469
470 onError:
471 Py_XDECREF(buffer);
472 return NULL;
473}
474
475PyObject *PyUnicode_Encode(const Py_UNICODE *s,
476 int size,
477 const char *encoding,
478 const char *errors)
479{
480 PyObject *v, *unicode;
481
482 unicode = PyUnicode_FromUnicode(s, size);
483 if (unicode == NULL)
484 return NULL;
485 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
486 Py_DECREF(unicode);
487 return v;
488}
489
490PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
491 const char *encoding,
492 const char *errors)
493{
494 PyObject *v;
495
496 if (!PyUnicode_Check(unicode)) {
497 PyErr_BadArgument();
498 goto onError;
499 }
Fred Drakee4315f52000-05-09 19:53:39 +0000500
501 if (encoding == NULL)
502 encoding = PyUnicode_GetDefaultEncoding();
503
504 /* Shortcuts for common default encodings */
505 if (errors == NULL) {
506 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000508 else if (strcmp(encoding, "latin-1") == 0)
509 return PyUnicode_AsLatin1String(unicode);
510 else if (strcmp(encoding, "ascii") == 0)
511 return PyUnicode_AsASCIIString(unicode);
512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 /* Encode via the codec registry */
515 v = PyCodec_Encode(unicode, encoding, errors);
516 if (v == NULL)
517 goto onError;
518 /* XXX Should we really enforce this ? */
519 if (!PyString_Check(v)) {
520 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000521 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522 v->ob_type->tp_name);
523 Py_DECREF(v);
524 goto onError;
525 }
526 return v;
527
528 onError:
529 return NULL;
530}
531
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000532/* Return a Python string holding the default encoded value of the
533 Unicode object.
534
535 The resulting string is cached in the Unicode object for subsequent
536 usage by this function. The cached version is needed to implement
537 the character buffer interface and will live (at least) as long as
538 the Unicode object itself.
539
540 The refcount of the string is *not* incremented.
541
542 *** Exported for internal use by the interpreter only !!! ***
543
544*/
545
546PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
547 const char *errors)
548{
549 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
550
551 if (v)
552 return v;
553 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
554 if (v && errors == NULL)
555 ((PyUnicodeObject *)unicode)->defenc = v;
556 return v;
557}
558
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
560{
561 if (!PyUnicode_Check(unicode)) {
562 PyErr_BadArgument();
563 goto onError;
564 }
565 return PyUnicode_AS_UNICODE(unicode);
566
567 onError:
568 return NULL;
569}
570
571int PyUnicode_GetSize(PyObject *unicode)
572{
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
576 }
577 return PyUnicode_GET_SIZE(unicode);
578
579 onError:
580 return -1;
581}
582
Thomas Wouters78890102000-07-22 19:25:51 +0000583const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000584{
585 return unicode_default_encoding;
586}
587
588int PyUnicode_SetDefaultEncoding(const char *encoding)
589{
590 PyObject *v;
591
592 /* Make sure the encoding is valid. As side effect, this also
593 loads the encoding into the codec registry cache. */
594 v = _PyCodec_Lookup(encoding);
595 if (v == NULL)
596 goto onError;
597 Py_DECREF(v);
598 strncpy(unicode_default_encoding,
599 encoding,
600 sizeof(unicode_default_encoding));
601 return 0;
602
603 onError:
604 return -1;
605}
606
/* --- UTF-8 Codec -------------------------------------------------------- */
608
/* Map a UTF-8 encoded prefix (first) byte to the length of the
   sequence it starts.  Zero means illegal prefix.  See RFC 2279 for
   details.  The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
630
631static
632int utf8_decoding_error(const char **source,
633 Py_UNICODE **dest,
634 const char *errors,
635 const char *details)
636{
637 if ((errors == NULL) ||
638 (strcmp(errors,"strict") == 0)) {
639 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000640 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641 details);
642 return -1;
643 }
644 else if (strcmp(errors,"ignore") == 0) {
645 (*source)++;
646 return 0;
647 }
648 else if (strcmp(errors,"replace") == 0) {
649 (*source)++;
650 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
651 (*dest)++;
652 return 0;
653 }
654 else {
655 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000656 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 errors);
658 return -1;
659 }
660}
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662PyObject *PyUnicode_DecodeUTF8(const char *s,
663 int size,
664 const char *errors)
665{
666 int n;
667 const char *e;
668 PyUnicodeObject *unicode;
669 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000670 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671
672 /* Note: size will always be longer than the resulting Unicode
673 character count */
674 unicode = _PyUnicode_New(size);
675 if (!unicode)
676 return NULL;
677 if (size == 0)
678 return (PyObject *)unicode;
679
680 /* Unpack UTF-8 encoded data */
681 p = unicode->str;
682 e = s + size;
683
684 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000685 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686
687 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000688 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 s++;
690 continue;
691 }
692
693 n = utf8_code_length[ch];
694
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 if (s + n > e) {
696 errmsg = "unexpected end of data";
697 goto utf8Error;
698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699
700 switch (n) {
701
702 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000703 errmsg = "unexpected code byte";
704 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705 break;
706
707 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000708 errmsg = "internal error";
709 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000710 break;
711
712 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 if ((s[1] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 if (ch < 0x80) {
719 errmsg = "illegal encoding";
720 goto utf8Error;
721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000723 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724 break;
725
726 case 3:
727 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 (s[2] & 0xc0) != 0x80) {
729 errmsg = "invalid data";
730 goto utf8Error;
731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000733 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
734 errmsg = "illegal encoding";
735 goto utf8Error;
736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000738 *p++ = (Py_UNICODE)ch;
739 break;
740
741 case 4:
742 if ((s[1] & 0xc0) != 0x80 ||
743 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 (s[3] & 0xc0) != 0x80) {
745 errmsg = "invalid data";
746 goto utf8Error;
747 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
750 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 if ((ch < 0x10000) || /* minimum value allowed for 4
752 byte encoding */
753 (ch > 0x10ffff)) { /* maximum value allowed for
754 UTF-16 */
755 errmsg = "illegal encoding";
756 goto utf8Error;
757 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000758 /* compute and append the two surrogates: */
759
760 /* translate from 10000..10FFFF to 0..FFFF */
761 ch -= 0x10000;
762
763 /* high surrogate = top 10 bits added to D800 */
764 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
765
766 /* low surrogate = bottom 10 bits added to DC00 */
767 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 break;
769
770 default:
771 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000772 errmsg = "unsupported Unicode code range";
773 goto utf8Error;
774 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775 }
776 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000777 continue;
778
779 utf8Error:
780 if (utf8_decoding_error(&s, &p, errors, errmsg))
781 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 }
783
784 /* Adjust length */
785 if (_PyUnicode_Resize(unicode, p - unicode->str))
786 goto onError;
787
788 return (PyObject *)unicode;
789
790onError:
791 Py_DECREF(unicode);
792 return NULL;
793}
794
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Handle a UTF-8 encoding error according to `errors`.

   NULL / "strict": raise UnicodeError with `details`, return -1.
   "ignore":        do nothing, return 0.
   "replace":       store '?' in the output, return 0.
   anything else:   raise ValueError, return -1. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828
829PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
830 int size,
831 const char *errors)
832{
833 PyObject *v;
834 char *p;
835 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000836 Py_UCS4 ch2;
837 unsigned int cbAllocated = 3 * size;
838 unsigned int cbWritten = 0;
839 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000842 if (v == NULL)
843 return NULL;
844 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000845 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000846
847 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 while (i < size) {
849 Py_UCS4 ch = s[i++];
850 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000852 cbWritten++;
853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 else if (ch < 0x0800) {
855 *p++ = 0xc0 | (ch >> 6);
856 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000857 cbWritten += 2;
858 }
859 else {
860 /* Check for high surrogate */
861 if (0xD800 <= ch && ch <= 0xDBFF) {
862 if (i != size) {
863 ch2 = s[i];
864 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
865
866 if (cbWritten >= (cbAllocated - 4)) {
867 /* Provide enough room for some more
868 surrogates */
869 cbAllocated += 4*10;
870 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000871 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000872 }
873
874 /* combine the two values */
875 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
876
877 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000878 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 i++;
880 cbWritten += 4;
881 }
882 }
883 }
884 else {
885 *p++ = (char)(0xe0 | (ch >> 12));
886 cbWritten += 3;
887 }
888 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
889 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 }
891 }
892 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000893 if (_PyString_Resize(&v, p - q))
894 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895 return v;
896
897 onError:
898 Py_DECREF(v);
899 return NULL;
900}
901
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
903{
904 PyObject *str;
905
906 if (!PyUnicode_Check(unicode)) {
907 PyErr_BadArgument();
908 return NULL;
909 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000910 str = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
911 PyUnicode_GET_SIZE(unicode),
912 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000913 if (str == NULL)
914 return NULL;
915 Py_INCREF(str);
916 return str;
917}
918
919/* --- UTF-16 Codec ------------------------------------------------------- */
920
921static
922int utf16_decoding_error(const Py_UNICODE **source,
923 Py_UNICODE **dest,
924 const char *errors,
925 const char *details)
926{
927 if ((errors == NULL) ||
928 (strcmp(errors,"strict") == 0)) {
929 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000930 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000931 details);
932 return -1;
933 }
934 else if (strcmp(errors,"ignore") == 0) {
935 return 0;
936 }
937 else if (strcmp(errors,"replace") == 0) {
938 if (dest) {
939 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
940 (*dest)++;
941 }
942 return 0;
943 }
944 else {
945 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000946 "UTF-16 decoding error; "
947 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948 errors);
949 return -1;
950 }
951}
952
Guido van Rossumd57fd912000-03-10 22:53:23 +0000953PyObject *PyUnicode_DecodeUTF16(const char *s,
954 int size,
955 const char *errors,
956 int *byteorder)
957{
958 PyUnicodeObject *unicode;
959 Py_UNICODE *p;
960 const Py_UNICODE *q, *e;
961 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000962 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963
964 /* size should be an even number */
965 if (size % sizeof(Py_UNICODE) != 0) {
966 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
967 return NULL;
968 /* The remaining input chars are ignored if we fall through
969 here... */
970 }
971
972 /* Note: size will always be longer than the resulting Unicode
973 character count */
974 unicode = _PyUnicode_New(size);
975 if (!unicode)
976 return NULL;
977 if (size == 0)
978 return (PyObject *)unicode;
979
980 /* Unpack UTF-16 encoded data */
981 p = unicode->str;
982 q = (Py_UNICODE *)s;
983 e = q + (size / sizeof(Py_UNICODE));
984
985 if (byteorder)
986 bo = *byteorder;
987
988 while (q < e) {
989 register Py_UNICODE ch = *q++;
990
991 /* Check for BOM marks (U+FEFF) in the input and adjust
992 current byte order setting accordingly. Swap input
993 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
994 !) */
995#ifdef BYTEORDER_IS_LITTLE_ENDIAN
996 if (ch == 0xFEFF) {
997 bo = -1;
998 continue;
999 } else if (ch == 0xFFFE) {
1000 bo = 1;
1001 continue;
1002 }
1003 if (bo == 1)
1004 ch = (ch >> 8) | (ch << 8);
1005#else
1006 if (ch == 0xFEFF) {
1007 bo = 1;
1008 continue;
1009 } else if (ch == 0xFFFE) {
1010 bo = -1;
1011 continue;
1012 }
1013 if (bo == -1)
1014 ch = (ch >> 8) | (ch << 8);
1015#endif
1016 if (ch < 0xD800 || ch > 0xDFFF) {
1017 *p++ = ch;
1018 continue;
1019 }
1020
1021 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001022 if (q >= e) {
1023 errmsg = "unexpected end of data";
1024 goto utf16Error;
1025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 if (0xDC00 <= *q && *q <= 0xDFFF) {
1027 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001028 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001029 /* This is valid data (a UTF-16 surrogate pair), but
1030 we are not able to store this information since our
1031 Py_UNICODE type only has 16 bits... this might
1032 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001033 errmsg = "code pairs are not supported";
1034 goto utf16Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 else
1037 continue;
1038 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001039 errmsg = "illegal encoding";
1040 /* Fall through to report the error */
1041
1042 utf16Error:
1043 if (utf16_decoding_error(&q, &p, errors, errmsg))
1044 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 }
1046
1047 if (byteorder)
1048 *byteorder = bo;
1049
1050 /* Adjust length */
1051 if (_PyUnicode_Resize(unicode, p - unicode->str))
1052 goto onError;
1053
1054 return (PyObject *)unicode;
1055
1056onError:
1057 Py_DECREF(unicode);
1058 return NULL;
1059}
1060
1061#undef UTF16_ERROR
1062
1063PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1064 int size,
1065 const char *errors,
1066 int byteorder)
1067{
1068 PyObject *v;
1069 Py_UNICODE *p;
1070 char *q;
1071
1072 /* We don't create UTF-16 pairs... */
1073 v = PyString_FromStringAndSize(NULL,
1074 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1075 if (v == NULL)
1076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077
1078 q = PyString_AS_STRING(v);
1079 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 if (byteorder == 0)
1081 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001082 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001083 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084 if (byteorder == 0 ||
1085#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1086 byteorder == -1
1087#else
1088 byteorder == 1
1089#endif
1090 )
1091 memcpy(p, s, size * sizeof(Py_UNICODE));
1092 else
1093 while (size-- > 0) {
1094 Py_UNICODE ch = *s++;
1095 *p++ = (ch >> 8) | (ch << 8);
1096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 return v;
1098}
1099
1100PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1101{
1102 if (!PyUnicode_Check(unicode)) {
1103 PyErr_BadArgument();
1104 return NULL;
1105 }
1106 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1107 PyUnicode_GET_SIZE(unicode),
1108 NULL,
1109 0);
1110}
1111
1112/* --- Unicode Escape Codec ----------------------------------------------- */
1113
1114static
1115int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001116 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 const char *errors,
1118 const char *details)
1119{
1120 if ((errors == NULL) ||
1121 (strcmp(errors,"strict") == 0)) {
1122 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001123 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124 details);
1125 return -1;
1126 }
1127 else if (strcmp(errors,"ignore") == 0) {
1128 return 0;
1129 }
1130 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001131 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 return 0;
1133 }
1134 else {
1135 PyErr_Format(PyExc_ValueError,
1136 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001137 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 errors);
1139 return -1;
1140 }
1141}
1142
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001143static _Py_UCNHashAPI *pucnHash = NULL;
1144
/* Case-insensitive comparison of at most `count` characters of s1 and
   s2.

   Returns 0 when the compared prefixes are equal (or count is 0), and a
   non-zero value otherwise; the sign is that of the difference between
   the first pair of lower-cased characters that differ, taken as
   unsigned char values.

   Comparison stops at the first difference, after `count` characters,
   or when both strings end (NUL) at the same position, so the function
   never reads beyond a terminator shared by both inputs. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1 = 0, c2 = 0;

    while (count-- > 0) {
        /* <ctype.h> functions require a value representable as
           unsigned char; a plain (possibly negative) char is not. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        if (c1 != c2 || c1 == '\0')
            break;
    }
    return c1 - c2;
}
1164
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1166 int size,
1167 const char *errors)
1168{
1169 PyUnicodeObject *v;
1170 Py_UNICODE *p = NULL, *buf = NULL;
1171 const char *end;
1172
1173 /* Escaped strings will always be longer than the resulting
1174 Unicode string, so we start with size here and then reduce the
1175 length after conversion to the true value. */
1176 v = _PyUnicode_New(size);
1177 if (v == NULL)
1178 goto onError;
1179 if (size == 0)
1180 return (PyObject *)v;
1181 p = buf = PyUnicode_AS_UNICODE(v);
1182 end = s + size;
1183 while (s < end) {
1184 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001185 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186 int i;
1187
1188 /* Non-escape characters are interpreted as Unicode ordinals */
1189 if (*s != '\\') {
1190 *p++ = (unsigned char)*s++;
1191 continue;
1192 }
1193
1194 /* \ - Escapes */
1195 s++;
1196 switch (*s++) {
1197
1198 /* \x escapes */
1199 case '\n': break;
1200 case '\\': *p++ = '\\'; break;
1201 case '\'': *p++ = '\''; break;
1202 case '\"': *p++ = '\"'; break;
1203 case 'b': *p++ = '\b'; break;
1204 case 'f': *p++ = '\014'; break; /* FF */
1205 case 't': *p++ = '\t'; break;
1206 case 'n': *p++ = '\n'; break;
1207 case 'r': *p++ = '\r'; break;
1208 case 'v': *p++ = '\013'; break; /* VT */
1209 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1210
1211 /* \OOO (octal) escapes */
1212 case '0': case '1': case '2': case '3':
1213 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001214 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001216 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001218 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001220 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 break;
1222
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001223 /* \xXXXX escape with 1-n hex digits. for compatibility
1224 with 8-bit strings, this code ignores all but the last
1225 two digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 case 'x':
1227 x = 0;
1228 c = (unsigned char)*s;
1229 if (isxdigit(c)) {
1230 do {
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001231 x = (x<<4) & 0xF0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 if ('0' <= c && c <= '9')
1233 x += c - '0';
1234 else if ('a' <= c && c <= 'f')
1235 x += 10 + c - 'a';
1236 else
1237 x += 10 + c - 'A';
1238 c = (unsigned char)*++s;
1239 } while (isxdigit(c));
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001240 *p++ = (unsigned char) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241 } else {
1242 *p++ = '\\';
1243 *p++ = (unsigned char)s[-1];
1244 }
1245 break;
1246
1247 /* \uXXXX with 4 hex digits */
1248 case 'u':
1249 for (x = 0, i = 0; i < 4; i++) {
1250 c = (unsigned char)s[i];
1251 if (!isxdigit(c)) {
1252 if (unicodeescape_decoding_error(&s, &x, errors,
1253 "truncated \\uXXXX"))
1254 goto onError;
1255 i++;
1256 break;
1257 }
1258 x = (x<<4) & ~0xF;
1259 if (c >= '0' && c <= '9')
1260 x += c - '0';
1261 else if (c >= 'a' && c <= 'f')
1262 x += 10 + c - 'a';
1263 else
1264 x += 10 + c - 'A';
1265 }
1266 s += i;
1267 *p++ = x;
1268 break;
1269
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001270 case 'N':
1271 /* Ok, we need to deal with Unicode Character Names now,
1272 * make sure we've imported the hash table data...
1273 */
1274 if (pucnHash == NULL)
1275 {
1276 PyObject *mod = 0, *v = 0;
1277
1278 mod = PyImport_ImportModule("ucnhash");
1279 if (mod == NULL)
1280 goto onError;
1281 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1282 Py_DECREF(mod);
1283 if (v == NULL)
1284 {
1285 goto onError;
1286 }
1287 pucnHash = PyCObject_AsVoidPtr(v);
1288 Py_DECREF(v);
1289 if (pucnHash == NULL)
1290 {
1291 goto onError;
1292 }
1293 }
1294
1295 if (*s == '{')
1296 {
1297 const char *start = s + 1;
1298 const char *endBrace = start;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001299 Py_UCS4 value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001300 unsigned long j;
1301
1302 /* look for either the closing brace, or we
1303 * exceed the maximum length of the unicode character names
1304 */
1305 while (*endBrace != '}' &&
1306 (unsigned int)(endBrace - start) <=
1307 pucnHash->cchMax &&
1308 endBrace < end)
1309 {
1310 endBrace++;
1311 }
1312 if (endBrace != end && *endBrace == '}')
1313 {
1314 j = pucnHash->hash(start, endBrace - start);
1315 if (j > pucnHash->cKeys ||
1316 mystrnicmp(
1317 start,
1318 ((_Py_UnicodeCharacterName *)
1319 (pucnHash->getValue(j)))->pszUCN,
1320 (int)(endBrace - start)) != 0)
1321 {
1322 if (unicodeescape_decoding_error(
1323 &s, &x, errors,
1324 "Invalid Unicode Character Name"))
1325 {
1326 goto onError;
1327 }
1328 goto ucnFallthrough;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 value = ((_Py_UnicodeCharacterName *)
1331 (pucnHash->getValue(j)))->value;
1332 if (value < 1<<16)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001333 {
1334 /* In UCS-2 range, easy solution.. */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001335 *p++ = value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001336 }
1337 else
1338 {
1339 /* Oops, its in UCS-4 space, */
1340 /* compute and append the two surrogates: */
1341 /* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001342 value -= 0x10000;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001343
1344 /* high surrogate = top 10 bits added to D800 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001345 *p++ = 0xD800 + (value >> 10);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001346
1347 /* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 *p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001349 }
1350 s = endBrace + 1;
1351 }
1352 else
1353 {
1354 if (unicodeescape_decoding_error(
1355 &s, &x, errors,
1356 "Unicode name missing closing brace"))
1357 goto onError;
1358 goto ucnFallthrough;
1359 }
1360 break;
1361 }
1362 if (unicodeescape_decoding_error(
1363 &s, &x, errors,
1364 "Missing opening brace for Unicode Character Name escape"))
1365 goto onError;
1366ucnFallthrough:
1367 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001368 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369 *p++ = '\\';
1370 *p++ = (unsigned char)s[-1];
1371 break;
1372 }
1373 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001374 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001375 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 return (PyObject *)v;
1377
1378 onError:
1379 Py_XDECREF(v);
1380 return NULL;
1381}
1382
1383/* Return a Unicode-Escape string version of the Unicode object.
1384
1385 If quotes is true, the string is enclosed in u"" or u'' quotes as
1386 appropriate.
1387
1388*/
1389
Barry Warsaw51ac5802000-03-20 16:36:48 +00001390static const Py_UNICODE *findchar(const Py_UNICODE *s,
1391 int size,
1392 Py_UNICODE ch);
1393
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394static
1395PyObject *unicodeescape_string(const Py_UNICODE *s,
1396 int size,
1397 int quotes)
1398{
1399 PyObject *repr;
1400 char *p;
1401 char *q;
1402
1403 static const char *hexdigit = "0123456789ABCDEF";
1404
1405 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1406 if (repr == NULL)
1407 return NULL;
1408
1409 p = q = PyString_AS_STRING(repr);
1410
1411 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 *p++ = 'u';
1413 *p++ = (findchar(s, size, '\'') &&
1414 !findchar(s, size, '"')) ? '"' : '\'';
1415 }
1416 while (size-- > 0) {
1417 Py_UNICODE ch = *s++;
1418 /* Escape quotes */
1419 if (quotes && (ch == q[1] || ch == '\\')) {
1420 *p++ = '\\';
1421 *p++ = (char) ch;
1422 }
1423 /* Map 16-bit characters to '\uxxxx' */
1424 else if (ch >= 256) {
1425 *p++ = '\\';
1426 *p++ = 'u';
1427 *p++ = hexdigit[(ch >> 12) & 0xf];
1428 *p++ = hexdigit[(ch >> 8) & 0xf];
1429 *p++ = hexdigit[(ch >> 4) & 0xf];
1430 *p++ = hexdigit[ch & 15];
1431 }
1432 /* Map non-printable US ASCII to '\ooo' */
1433 else if (ch < ' ' || ch >= 128) {
1434 *p++ = '\\';
1435 *p++ = hexdigit[(ch >> 6) & 7];
1436 *p++ = hexdigit[(ch >> 3) & 7];
1437 *p++ = hexdigit[ch & 7];
1438 }
1439 /* Copy everything else as-is */
1440 else
1441 *p++ = (char) ch;
1442 }
1443 if (quotes)
1444 *p++ = q[1];
1445
1446 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001447 if (_PyString_Resize(&repr, p - q))
1448 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001449
1450 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001451
1452 onError:
1453 Py_DECREF(repr);
1454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455}
1456
1457PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1458 int size)
1459{
1460 return unicodeescape_string(s, size, 0);
1461}
1462
1463PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1464{
1465 if (!PyUnicode_Check(unicode)) {
1466 PyErr_BadArgument();
1467 return NULL;
1468 }
1469 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1470 PyUnicode_GET_SIZE(unicode));
1471}
1472
1473/* --- Raw Unicode Escape Codec ------------------------------------------- */
1474
1475PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1476 int size,
1477 const char *errors)
1478{
1479 PyUnicodeObject *v;
1480 Py_UNICODE *p, *buf;
1481 const char *end;
1482 const char *bs;
1483
1484 /* Escaped strings will always be longer than the resulting
1485 Unicode string, so we start with size here and then reduce the
1486 length after conversion to the true value. */
1487 v = _PyUnicode_New(size);
1488 if (v == NULL)
1489 goto onError;
1490 if (size == 0)
1491 return (PyObject *)v;
1492 p = buf = PyUnicode_AS_UNICODE(v);
1493 end = s + size;
1494 while (s < end) {
1495 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001496 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497 int i;
1498
1499 /* Non-escape characters are interpreted as Unicode ordinals */
1500 if (*s != '\\') {
1501 *p++ = (unsigned char)*s++;
1502 continue;
1503 }
1504
1505 /* \u-escapes are only interpreted iff the number of leading
1506 backslashes if odd */
1507 bs = s;
1508 for (;s < end;) {
1509 if (*s != '\\')
1510 break;
1511 *p++ = (unsigned char)*s++;
1512 }
1513 if (((s - bs) & 1) == 0 ||
1514 s >= end ||
1515 *s != 'u') {
1516 continue;
1517 }
1518 p--;
1519 s++;
1520
1521 /* \uXXXX with 4 hex digits */
1522 for (x = 0, i = 0; i < 4; i++) {
1523 c = (unsigned char)s[i];
1524 if (!isxdigit(c)) {
1525 if (unicodeescape_decoding_error(&s, &x, errors,
1526 "truncated \\uXXXX"))
1527 goto onError;
1528 i++;
1529 break;
1530 }
1531 x = (x<<4) & ~0xF;
1532 if (c >= '0' && c <= '9')
1533 x += c - '0';
1534 else if (c >= 'a' && c <= 'f')
1535 x += 10 + c - 'a';
1536 else
1537 x += 10 + c - 'A';
1538 }
1539 s += i;
1540 *p++ = x;
1541 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001542 if (_PyUnicode_Resize(v, (int)(p - buf)))
1543 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544 return (PyObject *)v;
1545
1546 onError:
1547 Py_XDECREF(v);
1548 return NULL;
1549}
1550
1551PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1552 int size)
1553{
1554 PyObject *repr;
1555 char *p;
1556 char *q;
1557
1558 static const char *hexdigit = "0123456789ABCDEF";
1559
1560 repr = PyString_FromStringAndSize(NULL, 6 * size);
1561 if (repr == NULL)
1562 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001563 if (size == 0)
1564 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565
1566 p = q = PyString_AS_STRING(repr);
1567 while (size-- > 0) {
1568 Py_UNICODE ch = *s++;
1569 /* Map 16-bit characters to '\uxxxx' */
1570 if (ch >= 256) {
1571 *p++ = '\\';
1572 *p++ = 'u';
1573 *p++ = hexdigit[(ch >> 12) & 0xf];
1574 *p++ = hexdigit[(ch >> 8) & 0xf];
1575 *p++ = hexdigit[(ch >> 4) & 0xf];
1576 *p++ = hexdigit[ch & 15];
1577 }
1578 /* Copy everything else as-is */
1579 else
1580 *p++ = (char) ch;
1581 }
1582 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001583 if (_PyString_Resize(&repr, p - q))
1584 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585
1586 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001587
1588 onError:
1589 Py_DECREF(repr);
1590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591}
1592
1593PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1594{
1595 if (!PyUnicode_Check(unicode)) {
1596 PyErr_BadArgument();
1597 return NULL;
1598 }
1599 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1600 PyUnicode_GET_SIZE(unicode));
1601}
1602
1603/* --- Latin-1 Codec ------------------------------------------------------ */
1604
1605PyObject *PyUnicode_DecodeLatin1(const char *s,
1606 int size,
1607 const char *errors)
1608{
1609 PyUnicodeObject *v;
1610 Py_UNICODE *p;
1611
1612 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1613 v = _PyUnicode_New(size);
1614 if (v == NULL)
1615 goto onError;
1616 if (size == 0)
1617 return (PyObject *)v;
1618 p = PyUnicode_AS_UNICODE(v);
1619 while (size-- > 0)
1620 *p++ = (unsigned char)*s++;
1621 return (PyObject *)v;
1622
1623 onError:
1624 Py_XDECREF(v);
1625 return NULL;
1626}
1627
1628static
1629int latin1_encoding_error(const Py_UNICODE **source,
1630 char **dest,
1631 const char *errors,
1632 const char *details)
1633{
1634 if ((errors == NULL) ||
1635 (strcmp(errors,"strict") == 0)) {
1636 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001637 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 details);
1639 return -1;
1640 }
1641 else if (strcmp(errors,"ignore") == 0) {
1642 return 0;
1643 }
1644 else if (strcmp(errors,"replace") == 0) {
1645 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001646 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 return 0;
1648 }
1649 else {
1650 PyErr_Format(PyExc_ValueError,
1651 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001652 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 errors);
1654 return -1;
1655 }
1656}
1657
1658PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1659 int size,
1660 const char *errors)
1661{
1662 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001663 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001664
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 repr = PyString_FromStringAndSize(NULL, size);
1666 if (repr == NULL)
1667 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001668 if (size == 0)
1669 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670
1671 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001672 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 while (size-- > 0) {
1674 Py_UNICODE ch = *p++;
1675 if (ch >= 256) {
1676 if (latin1_encoding_error(&p, &s, errors,
1677 "ordinal not in range(256)"))
1678 goto onError;
1679 }
1680 else
1681 *s++ = (char)ch;
1682 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001683 /* Resize if error handling skipped some characters */
1684 if (s - start < PyString_GET_SIZE(repr))
1685 if (_PyString_Resize(&repr, s - start))
1686 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 return repr;
1688
1689 onError:
1690 Py_DECREF(repr);
1691 return NULL;
1692}
1693
1694PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1695{
1696 if (!PyUnicode_Check(unicode)) {
1697 PyErr_BadArgument();
1698 return NULL;
1699 }
1700 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 NULL);
1703}
1704
1705/* --- 7-bit ASCII Codec -------------------------------------------------- */
1706
1707static
1708int ascii_decoding_error(const char **source,
1709 Py_UNICODE **dest,
1710 const char *errors,
1711 const char *details)
1712{
1713 if ((errors == NULL) ||
1714 (strcmp(errors,"strict") == 0)) {
1715 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001716 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 details);
1718 return -1;
1719 }
1720 else if (strcmp(errors,"ignore") == 0) {
1721 return 0;
1722 }
1723 else if (strcmp(errors,"replace") == 0) {
1724 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1725 (*dest)++;
1726 return 0;
1727 }
1728 else {
1729 PyErr_Format(PyExc_ValueError,
1730 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001731 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 errors);
1733 return -1;
1734 }
1735}
1736
1737PyObject *PyUnicode_DecodeASCII(const char *s,
1738 int size,
1739 const char *errors)
1740{
1741 PyUnicodeObject *v;
1742 Py_UNICODE *p;
1743
1744 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1745 v = _PyUnicode_New(size);
1746 if (v == NULL)
1747 goto onError;
1748 if (size == 0)
1749 return (PyObject *)v;
1750 p = PyUnicode_AS_UNICODE(v);
1751 while (size-- > 0) {
1752 register unsigned char c;
1753
1754 c = (unsigned char)*s++;
1755 if (c < 128)
1756 *p++ = c;
1757 else if (ascii_decoding_error(&s, &p, errors,
1758 "ordinal not in range(128)"))
1759 goto onError;
1760 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001761 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1762 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1763 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 return (PyObject *)v;
1765
1766 onError:
1767 Py_XDECREF(v);
1768 return NULL;
1769}
1770
1771static
1772int ascii_encoding_error(const Py_UNICODE **source,
1773 char **dest,
1774 const char *errors,
1775 const char *details)
1776{
1777 if ((errors == NULL) ||
1778 (strcmp(errors,"strict") == 0)) {
1779 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001780 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781 details);
1782 return -1;
1783 }
1784 else if (strcmp(errors,"ignore") == 0) {
1785 return 0;
1786 }
1787 else if (strcmp(errors,"replace") == 0) {
1788 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001789 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 return 0;
1791 }
1792 else {
1793 PyErr_Format(PyExc_ValueError,
1794 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001795 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 errors);
1797 return -1;
1798 }
1799}
1800
1801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1802 int size,
1803 const char *errors)
1804{
1805 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001806 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001807
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 repr = PyString_FromStringAndSize(NULL, size);
1809 if (repr == NULL)
1810 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001811 if (size == 0)
1812 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813
1814 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001815 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 while (size-- > 0) {
1817 Py_UNICODE ch = *p++;
1818 if (ch >= 128) {
1819 if (ascii_encoding_error(&p, &s, errors,
1820 "ordinal not in range(128)"))
1821 goto onError;
1822 }
1823 else
1824 *s++ = (char)ch;
1825 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001826 /* Resize if error handling skipped some characters */
1827 if (s - start < PyString_GET_SIZE(repr))
1828 if (_PyString_Resize(&repr, s - start))
1829 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 return repr;
1831
1832 onError:
1833 Py_DECREF(repr);
1834 return NULL;
1835}
1836
1837PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1838{
1839 if (!PyUnicode_Check(unicode)) {
1840 PyErr_BadArgument();
1841 return NULL;
1842 }
1843 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1844 PyUnicode_GET_SIZE(unicode),
1845 NULL);
1846}
1847
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001848#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001849
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001850/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001851
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001852PyObject *PyUnicode_DecodeMBCS(const char *s,
1853 int size,
1854 const char *errors)
1855{
1856 PyUnicodeObject *v;
1857 Py_UNICODE *p;
1858
1859 /* First get the size of the result */
1860 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001861 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001862 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1863
1864 v = _PyUnicode_New(usize);
1865 if (v == NULL)
1866 return NULL;
1867 if (usize == 0)
1868 return (PyObject *)v;
1869 p = PyUnicode_AS_UNICODE(v);
1870 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1871 Py_DECREF(v);
1872 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1873 }
1874
1875 return (PyObject *)v;
1876}
1877
1878PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1879 int size,
1880 const char *errors)
1881{
1882 PyObject *repr;
1883 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001884 DWORD mbcssize;
1885
1886 /* If there are no characters, bail now! */
1887 if (size==0)
1888 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001889
1890 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001891 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001892 if (mbcssize==0)
1893 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1894
1895 repr = PyString_FromStringAndSize(NULL, mbcssize);
1896 if (repr == NULL)
1897 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001898 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001899 return repr;
1900
1901 /* Do the conversion */
1902 s = PyString_AS_STRING(repr);
1903 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1904 Py_DECREF(repr);
1905 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1906 }
1907 return repr;
1908}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001909
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001910#endif /* MS_WIN32 */
1911
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912/* --- Character Mapping Codec -------------------------------------------- */
1913
1914static
1915int charmap_decoding_error(const char **source,
1916 Py_UNICODE **dest,
1917 const char *errors,
1918 const char *details)
1919{
1920 if ((errors == NULL) ||
1921 (strcmp(errors,"strict") == 0)) {
1922 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001923 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 details);
1925 return -1;
1926 }
1927 else if (strcmp(errors,"ignore") == 0) {
1928 return 0;
1929 }
1930 else if (strcmp(errors,"replace") == 0) {
1931 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1932 (*dest)++;
1933 return 0;
1934 }
1935 else {
1936 PyErr_Format(PyExc_ValueError,
1937 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001938 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 errors);
1940 return -1;
1941 }
1942}
1943
1944PyObject *PyUnicode_DecodeCharmap(const char *s,
1945 int size,
1946 PyObject *mapping,
1947 const char *errors)
1948{
1949 PyUnicodeObject *v;
1950 Py_UNICODE *p;
1951
1952 /* Default to Latin-1 */
1953 if (mapping == NULL)
1954 return PyUnicode_DecodeLatin1(s, size, errors);
1955
1956 v = _PyUnicode_New(size);
1957 if (v == NULL)
1958 goto onError;
1959 if (size == 0)
1960 return (PyObject *)v;
1961 p = PyUnicode_AS_UNICODE(v);
1962 while (size-- > 0) {
1963 unsigned char ch = *s++;
1964 PyObject *w, *x;
1965
1966 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1967 w = PyInt_FromLong((long)ch);
1968 if (w == NULL)
1969 goto onError;
1970 x = PyObject_GetItem(mapping, w);
1971 Py_DECREF(w);
1972 if (x == NULL) {
1973 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1974 /* No mapping found: default to Latin-1 mapping */
1975 PyErr_Clear();
1976 *p++ = (Py_UNICODE)ch;
1977 continue;
1978 }
1979 goto onError;
1980 }
1981
1982 /* Apply mapping */
1983 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001984 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 if (value < 0 || value > 65535) {
1986 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001987 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 Py_DECREF(x);
1989 goto onError;
1990 }
1991 *p++ = (Py_UNICODE)value;
1992 }
1993 else if (x == Py_None) {
1994 /* undefined mapping */
1995 if (charmap_decoding_error(&s, &p, errors,
1996 "character maps to <undefined>")) {
1997 Py_DECREF(x);
1998 goto onError;
1999 }
2000 }
2001 else if (PyUnicode_Check(x)) {
2002 if (PyUnicode_GET_SIZE(x) != 1) {
2003 /* 1-n mapping */
2004 PyErr_SetString(PyExc_NotImplementedError,
2005 "1-n mappings are currently not implemented");
2006 Py_DECREF(x);
2007 goto onError;
2008 }
2009 *p++ = *PyUnicode_AS_UNICODE(x);
2010 }
2011 else {
2012 /* wrong return value */
2013 PyErr_SetString(PyExc_TypeError,
2014 "character mapping must return integer, None or unicode");
2015 Py_DECREF(x);
2016 goto onError;
2017 }
2018 Py_DECREF(x);
2019 }
2020 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2021 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2022 goto onError;
2023 return (PyObject *)v;
2024
2025 onError:
2026 Py_XDECREF(v);
2027 return NULL;
2028}
2029
2030static
2031int charmap_encoding_error(const Py_UNICODE **source,
2032 char **dest,
2033 const char *errors,
2034 const char *details)
2035{
2036 if ((errors == NULL) ||
2037 (strcmp(errors,"strict") == 0)) {
2038 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002039 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 details);
2041 return -1;
2042 }
2043 else if (strcmp(errors,"ignore") == 0) {
2044 return 0;
2045 }
2046 else if (strcmp(errors,"replace") == 0) {
2047 **dest = '?';
2048 (*dest)++;
2049 return 0;
2050 }
2051 else {
2052 PyErr_Format(PyExc_ValueError,
2053 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002054 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 errors);
2056 return -1;
2057 }
2058}
2059
2060PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2061 int size,
2062 PyObject *mapping,
2063 const char *errors)
2064{
2065 PyObject *v;
2066 char *s;
2067
2068 /* Default to Latin-1 */
2069 if (mapping == NULL)
2070 return PyUnicode_EncodeLatin1(p, size, errors);
2071
2072 v = PyString_FromStringAndSize(NULL, size);
2073 if (v == NULL)
2074 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002075 if (size == 0)
2076 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 s = PyString_AS_STRING(v);
2078 while (size-- > 0) {
2079 Py_UNICODE ch = *p++;
2080 PyObject *w, *x;
2081
2082 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2083 w = PyInt_FromLong((long)ch);
2084 if (w == NULL)
2085 goto onError;
2086 x = PyObject_GetItem(mapping, w);
2087 Py_DECREF(w);
2088 if (x == NULL) {
2089 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2090 /* No mapping found: default to Latin-1 mapping if possible */
2091 PyErr_Clear();
2092 if (ch < 256) {
2093 *s++ = (char)ch;
2094 continue;
2095 }
2096 else if (!charmap_encoding_error(&p, &s, errors,
2097 "missing character mapping"))
2098 continue;
2099 }
2100 goto onError;
2101 }
2102
2103 /* Apply mapping */
2104 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002105 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106 if (value < 0 || value > 255) {
2107 PyErr_SetString(PyExc_TypeError,
2108 "character mapping must be in range(256)");
2109 Py_DECREF(x);
2110 goto onError;
2111 }
2112 *s++ = (char)value;
2113 }
2114 else if (x == Py_None) {
2115 /* undefined mapping */
2116 if (charmap_encoding_error(&p, &s, errors,
2117 "character maps to <undefined>")) {
2118 Py_DECREF(x);
2119 goto onError;
2120 }
2121 }
2122 else if (PyString_Check(x)) {
2123 if (PyString_GET_SIZE(x) != 1) {
2124 /* 1-n mapping */
2125 PyErr_SetString(PyExc_NotImplementedError,
2126 "1-n mappings are currently not implemented");
2127 Py_DECREF(x);
2128 goto onError;
2129 }
2130 *s++ = *PyString_AS_STRING(x);
2131 }
2132 else {
2133 /* wrong return value */
2134 PyErr_SetString(PyExc_TypeError,
2135 "character mapping must return integer, None or unicode");
2136 Py_DECREF(x);
2137 goto onError;
2138 }
2139 Py_DECREF(x);
2140 }
2141 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2142 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2143 goto onError;
2144 return v;
2145
2146 onError:
2147 Py_DECREF(v);
2148 return NULL;
2149}
2150
2151PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2152 PyObject *mapping)
2153{
2154 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2155 PyErr_BadArgument();
2156 return NULL;
2157 }
2158 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2159 PyUnicode_GET_SIZE(unicode),
2160 mapping,
2161 NULL);
2162}
2163
2164static
2165int translate_error(const Py_UNICODE **source,
2166 Py_UNICODE **dest,
2167 const char *errors,
2168 const char *details)
2169{
2170 if ((errors == NULL) ||
2171 (strcmp(errors,"strict") == 0)) {
2172 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002173 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 details);
2175 return -1;
2176 }
2177 else if (strcmp(errors,"ignore") == 0) {
2178 return 0;
2179 }
2180 else if (strcmp(errors,"replace") == 0) {
2181 **dest = '?';
2182 (*dest)++;
2183 return 0;
2184 }
2185 else {
2186 PyErr_Format(PyExc_ValueError,
2187 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002188 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 errors);
2190 return -1;
2191 }
2192}
2193
2194PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2195 int size,
2196 PyObject *mapping,
2197 const char *errors)
2198{
2199 PyUnicodeObject *v;
2200 Py_UNICODE *p;
2201
2202 if (mapping == NULL) {
2203 PyErr_BadArgument();
2204 return NULL;
2205 }
2206
2207 /* Output will never be longer than input */
2208 v = _PyUnicode_New(size);
2209 if (v == NULL)
2210 goto onError;
2211 if (size == 0)
2212 goto done;
2213 p = PyUnicode_AS_UNICODE(v);
2214 while (size-- > 0) {
2215 Py_UNICODE ch = *s++;
2216 PyObject *w, *x;
2217
2218 /* Get mapping */
2219 w = PyInt_FromLong(ch);
2220 if (w == NULL)
2221 goto onError;
2222 x = PyObject_GetItem(mapping, w);
2223 Py_DECREF(w);
2224 if (x == NULL) {
2225 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2226 /* No mapping found: default to 1-1 mapping */
2227 PyErr_Clear();
2228 *p++ = ch;
2229 continue;
2230 }
2231 goto onError;
2232 }
2233
2234 /* Apply mapping */
2235 if (PyInt_Check(x))
2236 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2237 else if (x == Py_None) {
2238 /* undefined mapping */
2239 if (translate_error(&s, &p, errors,
2240 "character maps to <undefined>")) {
2241 Py_DECREF(x);
2242 goto onError;
2243 }
2244 }
2245 else if (PyUnicode_Check(x)) {
2246 if (PyUnicode_GET_SIZE(x) != 1) {
2247 /* 1-n mapping */
2248 PyErr_SetString(PyExc_NotImplementedError,
2249 "1-n mappings are currently not implemented");
2250 Py_DECREF(x);
2251 goto onError;
2252 }
2253 *p++ = *PyUnicode_AS_UNICODE(x);
2254 }
2255 else {
2256 /* wrong return value */
2257 PyErr_SetString(PyExc_TypeError,
2258 "translate mapping must return integer, None or unicode");
2259 Py_DECREF(x);
2260 goto onError;
2261 }
2262 Py_DECREF(x);
2263 }
2264 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002265 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2266 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267
2268 done:
2269 return (PyObject *)v;
2270
2271 onError:
2272 Py_XDECREF(v);
2273 return NULL;
2274}
2275
2276PyObject *PyUnicode_Translate(PyObject *str,
2277 PyObject *mapping,
2278 const char *errors)
2279{
2280 PyObject *result;
2281
2282 str = PyUnicode_FromObject(str);
2283 if (str == NULL)
2284 goto onError;
2285 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2286 PyUnicode_GET_SIZE(str),
2287 mapping,
2288 errors);
2289 Py_DECREF(str);
2290 return result;
2291
2292 onError:
2293 Py_XDECREF(str);
2294 return NULL;
2295}
2296
Guido van Rossum9e896b32000-04-05 20:11:21 +00002297/* --- Decimal Encoder ---------------------------------------------------- */
2298
2299int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2300 int length,
2301 char *output,
2302 const char *errors)
2303{
2304 Py_UNICODE *p, *end;
2305
2306 if (output == NULL) {
2307 PyErr_BadArgument();
2308 return -1;
2309 }
2310
2311 p = s;
2312 end = s + length;
2313 while (p < end) {
2314 register Py_UNICODE ch = *p++;
2315 int decimal;
2316
2317 if (Py_UNICODE_ISSPACE(ch)) {
2318 *output++ = ' ';
2319 continue;
2320 }
2321 decimal = Py_UNICODE_TODECIMAL(ch);
2322 if (decimal >= 0) {
2323 *output++ = '0' + decimal;
2324 continue;
2325 }
Guido van Rossumba477042000-04-06 18:18:10 +00002326 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002327 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002328 continue;
2329 }
2330 /* All other characters are considered invalid */
2331 if (errors == NULL || strcmp(errors, "strict") == 0) {
2332 PyErr_SetString(PyExc_ValueError,
2333 "invalid decimal Unicode string");
2334 goto onError;
2335 }
2336 else if (strcmp(errors, "ignore") == 0)
2337 continue;
2338 else if (strcmp(errors, "replace") == 0) {
2339 *output++ = '?';
2340 continue;
2341 }
2342 }
2343 /* 0-terminate the output string */
2344 *output++ = '\0';
2345 return 0;
2346
2347 onError:
2348 return -1;
2349}
2350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351/* --- Helpers ------------------------------------------------------------ */
2352
2353static
2354int count(PyUnicodeObject *self,
2355 int start,
2356 int end,
2357 PyUnicodeObject *substring)
2358{
2359 int count = 0;
2360
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002361 if (substring->length == 0)
2362 return (end - start + 1);
2363
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364 end -= substring->length;
2365
2366 while (start <= end)
2367 if (Py_UNICODE_MATCH(self, start, substring)) {
2368 count++;
2369 start += substring->length;
2370 } else
2371 start++;
2372
2373 return count;
2374}
2375
2376int PyUnicode_Count(PyObject *str,
2377 PyObject *substr,
2378 int start,
2379 int end)
2380{
2381 int result;
2382
2383 str = PyUnicode_FromObject(str);
2384 if (str == NULL)
2385 return -1;
2386 substr = PyUnicode_FromObject(substr);
2387 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002388 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389 return -1;
2390 }
2391
2392 result = count((PyUnicodeObject *)str,
2393 start, end,
2394 (PyUnicodeObject *)substr);
2395
2396 Py_DECREF(str);
2397 Py_DECREF(substr);
2398 return result;
2399}
2400
2401static
2402int findstring(PyUnicodeObject *self,
2403 PyUnicodeObject *substring,
2404 int start,
2405 int end,
2406 int direction)
2407{
2408 if (start < 0)
2409 start += self->length;
2410 if (start < 0)
2411 start = 0;
2412
2413 if (substring->length == 0)
2414 return start;
2415
2416 if (end > self->length)
2417 end = self->length;
2418 if (end < 0)
2419 end += self->length;
2420 if (end < 0)
2421 end = 0;
2422
2423 end -= substring->length;
2424
2425 if (direction < 0) {
2426 for (; end >= start; end--)
2427 if (Py_UNICODE_MATCH(self, end, substring))
2428 return end;
2429 } else {
2430 for (; start <= end; start++)
2431 if (Py_UNICODE_MATCH(self, start, substring))
2432 return start;
2433 }
2434
2435 return -1;
2436}
2437
2438int PyUnicode_Find(PyObject *str,
2439 PyObject *substr,
2440 int start,
2441 int end,
2442 int direction)
2443{
2444 int result;
2445
2446 str = PyUnicode_FromObject(str);
2447 if (str == NULL)
2448 return -1;
2449 substr = PyUnicode_FromObject(substr);
2450 if (substr == NULL) {
2451 Py_DECREF(substr);
2452 return -1;
2453 }
2454
2455 result = findstring((PyUnicodeObject *)str,
2456 (PyUnicodeObject *)substr,
2457 start, end, direction);
2458 Py_DECREF(str);
2459 Py_DECREF(substr);
2460 return result;
2461}
2462
2463static
2464int tailmatch(PyUnicodeObject *self,
2465 PyUnicodeObject *substring,
2466 int start,
2467 int end,
2468 int direction)
2469{
2470 if (start < 0)
2471 start += self->length;
2472 if (start < 0)
2473 start = 0;
2474
2475 if (substring->length == 0)
2476 return 1;
2477
2478 if (end > self->length)
2479 end = self->length;
2480 if (end < 0)
2481 end += self->length;
2482 if (end < 0)
2483 end = 0;
2484
2485 end -= substring->length;
2486 if (end < start)
2487 return 0;
2488
2489 if (direction > 0) {
2490 if (Py_UNICODE_MATCH(self, end, substring))
2491 return 1;
2492 } else {
2493 if (Py_UNICODE_MATCH(self, start, substring))
2494 return 1;
2495 }
2496
2497 return 0;
2498}
2499
2500int PyUnicode_Tailmatch(PyObject *str,
2501 PyObject *substr,
2502 int start,
2503 int end,
2504 int direction)
2505{
2506 int result;
2507
2508 str = PyUnicode_FromObject(str);
2509 if (str == NULL)
2510 return -1;
2511 substr = PyUnicode_FromObject(substr);
2512 if (substr == NULL) {
2513 Py_DECREF(substr);
2514 return -1;
2515 }
2516
2517 result = tailmatch((PyUnicodeObject *)str,
2518 (PyUnicodeObject *)substr,
2519 start, end, direction);
2520 Py_DECREF(str);
2521 Py_DECREF(substr);
2522 return result;
2523}
2524
2525static
2526const Py_UNICODE *findchar(const Py_UNICODE *s,
2527 int size,
2528 Py_UNICODE ch)
2529{
2530 /* like wcschr, but doesn't stop at NULL characters */
2531
2532 while (size-- > 0) {
2533 if (*s == ch)
2534 return s;
2535 s++;
2536 }
2537
2538 return NULL;
2539}
2540
2541/* Apply fixfct filter to the Unicode object self and return a
2542 reference to the modified object */
2543
2544static
2545PyObject *fixup(PyUnicodeObject *self,
2546 int (*fixfct)(PyUnicodeObject *s))
2547{
2548
2549 PyUnicodeObject *u;
2550
2551 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2552 self->length);
2553 if (u == NULL)
2554 return NULL;
2555 if (!fixfct(u)) {
2556 /* fixfct should return TRUE if it modified the buffer. If
2557 FALSE, return a reference to the original buffer instead
2558 (to save space, not time) */
2559 Py_INCREF(self);
2560 Py_DECREF(u);
2561 return (PyObject*) self;
2562 }
2563 return (PyObject*) u;
2564}
2565
2566static
2567int fixupper(PyUnicodeObject *self)
2568{
2569 int len = self->length;
2570 Py_UNICODE *s = self->str;
2571 int status = 0;
2572
2573 while (len-- > 0) {
2574 register Py_UNICODE ch;
2575
2576 ch = Py_UNICODE_TOUPPER(*s);
2577 if (ch != *s) {
2578 status = 1;
2579 *s = ch;
2580 }
2581 s++;
2582 }
2583
2584 return status;
2585}
2586
2587static
2588int fixlower(PyUnicodeObject *self)
2589{
2590 int len = self->length;
2591 Py_UNICODE *s = self->str;
2592 int status = 0;
2593
2594 while (len-- > 0) {
2595 register Py_UNICODE ch;
2596
2597 ch = Py_UNICODE_TOLOWER(*s);
2598 if (ch != *s) {
2599 status = 1;
2600 *s = ch;
2601 }
2602 s++;
2603 }
2604
2605 return status;
2606}
2607
2608static
2609int fixswapcase(PyUnicodeObject *self)
2610{
2611 int len = self->length;
2612 Py_UNICODE *s = self->str;
2613 int status = 0;
2614
2615 while (len-- > 0) {
2616 if (Py_UNICODE_ISUPPER(*s)) {
2617 *s = Py_UNICODE_TOLOWER(*s);
2618 status = 1;
2619 } else if (Py_UNICODE_ISLOWER(*s)) {
2620 *s = Py_UNICODE_TOUPPER(*s);
2621 status = 1;
2622 }
2623 s++;
2624 }
2625
2626 return status;
2627}
2628
2629static
2630int fixcapitalize(PyUnicodeObject *self)
2631{
2632 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2633 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2634 return 1;
2635 }
2636 return 0;
2637}
2638
2639static
2640int fixtitle(PyUnicodeObject *self)
2641{
2642 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2643 register Py_UNICODE *e;
2644 int previous_is_cased;
2645
2646 /* Shortcut for single character strings */
2647 if (PyUnicode_GET_SIZE(self) == 1) {
2648 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2649 if (*p != ch) {
2650 *p = ch;
2651 return 1;
2652 }
2653 else
2654 return 0;
2655 }
2656
2657 e = p + PyUnicode_GET_SIZE(self);
2658 previous_is_cased = 0;
2659 for (; p < e; p++) {
2660 register const Py_UNICODE ch = *p;
2661
2662 if (previous_is_cased)
2663 *p = Py_UNICODE_TOLOWER(ch);
2664 else
2665 *p = Py_UNICODE_TOTITLE(ch);
2666
2667 if (Py_UNICODE_ISLOWER(ch) ||
2668 Py_UNICODE_ISUPPER(ch) ||
2669 Py_UNICODE_ISTITLE(ch))
2670 previous_is_cased = 1;
2671 else
2672 previous_is_cased = 0;
2673 }
2674 return 1;
2675}
2676
2677PyObject *PyUnicode_Join(PyObject *separator,
2678 PyObject *seq)
2679{
2680 Py_UNICODE *sep;
2681 int seplen;
2682 PyUnicodeObject *res = NULL;
2683 int reslen = 0;
2684 Py_UNICODE *p;
2685 int seqlen = 0;
2686 int sz = 100;
2687 int i;
2688
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002689 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 if (seqlen < 0 && PyErr_Occurred())
2691 return NULL;
2692
2693 if (separator == NULL) {
2694 Py_UNICODE blank = ' ';
2695 sep = &blank;
2696 seplen = 1;
2697 }
2698 else {
2699 separator = PyUnicode_FromObject(separator);
2700 if (separator == NULL)
2701 return NULL;
2702 sep = PyUnicode_AS_UNICODE(separator);
2703 seplen = PyUnicode_GET_SIZE(separator);
2704 }
2705
2706 res = _PyUnicode_New(sz);
2707 if (res == NULL)
2708 goto onError;
2709 p = PyUnicode_AS_UNICODE(res);
2710 reslen = 0;
2711
2712 for (i = 0; i < seqlen; i++) {
2713 int itemlen;
2714 PyObject *item;
2715
2716 item = PySequence_GetItem(seq, i);
2717 if (item == NULL)
2718 goto onError;
2719 if (!PyUnicode_Check(item)) {
2720 PyObject *v;
2721 v = PyUnicode_FromObject(item);
2722 Py_DECREF(item);
2723 item = v;
2724 if (item == NULL)
2725 goto onError;
2726 }
2727 itemlen = PyUnicode_GET_SIZE(item);
2728 while (reslen + itemlen + seplen >= sz) {
2729 if (_PyUnicode_Resize(res, sz*2))
2730 goto onError;
2731 sz *= 2;
2732 p = PyUnicode_AS_UNICODE(res) + reslen;
2733 }
2734 if (i > 0) {
2735 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2736 p += seplen;
2737 reslen += seplen;
2738 }
2739 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2740 p += itemlen;
2741 reslen += itemlen;
2742 Py_DECREF(item);
2743 }
2744 if (_PyUnicode_Resize(res, reslen))
2745 goto onError;
2746
2747 Py_XDECREF(separator);
2748 return (PyObject *)res;
2749
2750 onError:
2751 Py_XDECREF(separator);
2752 Py_DECREF(res);
2753 return NULL;
2754}
2755
2756static
2757PyUnicodeObject *pad(PyUnicodeObject *self,
2758 int left,
2759 int right,
2760 Py_UNICODE fill)
2761{
2762 PyUnicodeObject *u;
2763
2764 if (left < 0)
2765 left = 0;
2766 if (right < 0)
2767 right = 0;
2768
2769 if (left == 0 && right == 0) {
2770 Py_INCREF(self);
2771 return self;
2772 }
2773
2774 u = _PyUnicode_New(left + self->length + right);
2775 if (u) {
2776 if (left)
2777 Py_UNICODE_FILL(u->str, fill, left);
2778 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2779 if (right)
2780 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2781 }
2782
2783 return u;
2784}
2785
/* Append data[left:right] as a new Unicode object to the list being built.
   Expects the expanding function to provide a PyObject *str scratch
   variable, a PyObject *list and an onError label; jumps to onError when
   the object cannot be created or appended.  Wrapped in do/while(0) with
   parenthesised arguments so it behaves as a single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2796
2797static
2798PyObject *split_whitespace(PyUnicodeObject *self,
2799 PyObject *list,
2800 int maxcount)
2801{
2802 register int i;
2803 register int j;
2804 int len = self->length;
2805 PyObject *str;
2806
2807 for (i = j = 0; i < len; ) {
2808 /* find a token */
2809 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2810 i++;
2811 j = i;
2812 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2813 i++;
2814 if (j < i) {
2815 if (maxcount-- <= 0)
2816 break;
2817 SPLIT_APPEND(self->str, j, i);
2818 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2819 i++;
2820 j = i;
2821 }
2822 }
2823 if (j < len) {
2824 SPLIT_APPEND(self->str, j, len);
2825 }
2826 return list;
2827
2828 onError:
2829 Py_DECREF(list);
2830 return NULL;
2831}
2832
2833PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002834 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835{
2836 register int i;
2837 register int j;
2838 int len;
2839 PyObject *list;
2840 PyObject *str;
2841 Py_UNICODE *data;
2842
2843 string = PyUnicode_FromObject(string);
2844 if (string == NULL)
2845 return NULL;
2846 data = PyUnicode_AS_UNICODE(string);
2847 len = PyUnicode_GET_SIZE(string);
2848
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 list = PyList_New(0);
2850 if (!list)
2851 goto onError;
2852
2853 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002854 int eol;
2855
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 /* Find a line and append it */
2857 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2858 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859
2860 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002861 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 if (i < len) {
2863 if (data[i] == '\r' && i + 1 < len &&
2864 data[i+1] == '\n')
2865 i += 2;
2866 else
2867 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002868 if (keepends)
2869 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 }
Guido van Rossum86662912000-04-11 15:38:46 +00002871 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 j = i;
2873 }
2874 if (j < len) {
2875 SPLIT_APPEND(data, j, len);
2876 }
2877
2878 Py_DECREF(string);
2879 return list;
2880
2881 onError:
2882 Py_DECREF(list);
2883 Py_DECREF(string);
2884 return NULL;
2885}
2886
2887static
2888PyObject *split_char(PyUnicodeObject *self,
2889 PyObject *list,
2890 Py_UNICODE ch,
2891 int maxcount)
2892{
2893 register int i;
2894 register int j;
2895 int len = self->length;
2896 PyObject *str;
2897
2898 for (i = j = 0; i < len; ) {
2899 if (self->str[i] == ch) {
2900 if (maxcount-- <= 0)
2901 break;
2902 SPLIT_APPEND(self->str, j, i);
2903 i = j = i + 1;
2904 } else
2905 i++;
2906 }
2907 if (j <= len) {
2908 SPLIT_APPEND(self->str, j, len);
2909 }
2910 return list;
2911
2912 onError:
2913 Py_DECREF(list);
2914 return NULL;
2915}
2916
2917static
2918PyObject *split_substring(PyUnicodeObject *self,
2919 PyObject *list,
2920 PyUnicodeObject *substring,
2921 int maxcount)
2922{
2923 register int i;
2924 register int j;
2925 int len = self->length;
2926 int sublen = substring->length;
2927 PyObject *str;
2928
2929 for (i = j = 0; i < len - sublen; ) {
2930 if (Py_UNICODE_MATCH(self, i, substring)) {
2931 if (maxcount-- <= 0)
2932 break;
2933 SPLIT_APPEND(self->str, j, i);
2934 i = j = i + sublen;
2935 } else
2936 i++;
2937 }
2938 if (j <= len) {
2939 SPLIT_APPEND(self->str, j, len);
2940 }
2941 return list;
2942
2943 onError:
2944 Py_DECREF(list);
2945 return NULL;
2946}
2947
2948#undef SPLIT_APPEND
2949
2950static
2951PyObject *split(PyUnicodeObject *self,
2952 PyUnicodeObject *substring,
2953 int maxcount)
2954{
2955 PyObject *list;
2956
2957 if (maxcount < 0)
2958 maxcount = INT_MAX;
2959
2960 list = PyList_New(0);
2961 if (!list)
2962 return NULL;
2963
2964 if (substring == NULL)
2965 return split_whitespace(self,list,maxcount);
2966
2967 else if (substring->length == 1)
2968 return split_char(self,list,substring->str[0],maxcount);
2969
2970 else if (substring->length == 0) {
2971 Py_DECREF(list);
2972 PyErr_SetString(PyExc_ValueError, "empty separator");
2973 return NULL;
2974 }
2975 else
2976 return split_substring(self,list,substring,maxcount);
2977}
2978
2979static
2980PyObject *strip(PyUnicodeObject *self,
2981 int left,
2982 int right)
2983{
2984 Py_UNICODE *p = self->str;
2985 int start = 0;
2986 int end = self->length;
2987
2988 if (left)
2989 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2990 start++;
2991
2992 if (right)
2993 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2994 end--;
2995
2996 if (start == 0 && end == self->length) {
2997 /* couldn't strip anything off, return original string */
2998 Py_INCREF(self);
2999 return (PyObject*) self;
3000 }
3001
3002 return (PyObject*) PyUnicode_FromUnicode(
3003 self->str + start,
3004 end - start
3005 );
3006}
3007
3008static
3009PyObject *replace(PyUnicodeObject *self,
3010 PyUnicodeObject *str1,
3011 PyUnicodeObject *str2,
3012 int maxcount)
3013{
3014 PyUnicodeObject *u;
3015
3016 if (maxcount < 0)
3017 maxcount = INT_MAX;
3018
3019 if (str1->length == 1 && str2->length == 1) {
3020 int i;
3021
3022 /* replace characters */
3023 if (!findchar(self->str, self->length, str1->str[0])) {
3024 /* nothing to replace, return original string */
3025 Py_INCREF(self);
3026 u = self;
3027 } else {
3028 Py_UNICODE u1 = str1->str[0];
3029 Py_UNICODE u2 = str2->str[0];
3030
3031 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3032 self->str,
3033 self->length
3034 );
3035 if (u)
3036 for (i = 0; i < u->length; i++)
3037 if (u->str[i] == u1) {
3038 if (--maxcount < 0)
3039 break;
3040 u->str[i] = u2;
3041 }
3042 }
3043
3044 } else {
3045 int n, i;
3046 Py_UNICODE *p;
3047
3048 /* replace strings */
3049 n = count(self, 0, self->length, str1);
3050 if (n > maxcount)
3051 n = maxcount;
3052 if (n == 0) {
3053 /* nothing to replace, return original string */
3054 Py_INCREF(self);
3055 u = self;
3056 } else {
3057 u = _PyUnicode_New(
3058 self->length + n * (str2->length - str1->length));
3059 if (u) {
3060 i = 0;
3061 p = u->str;
3062 while (i <= self->length - str1->length)
3063 if (Py_UNICODE_MATCH(self, i, str1)) {
3064 /* replace string segment */
3065 Py_UNICODE_COPY(p, str2->str, str2->length);
3066 p += str2->length;
3067 i += str1->length;
3068 if (--n <= 0) {
3069 /* copy remaining part */
3070 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3071 break;
3072 }
3073 } else
3074 *p++ = self->str[i++];
3075 }
3076 }
3077 }
3078
3079 return (PyObject *) u;
3080}
3081
3082/* --- Unicode Object Methods --------------------------------------------- */
3083
3084static char title__doc__[] =
3085"S.title() -> unicode\n\
3086\n\
3087Return a titlecased version of S, i.e. words start with title case\n\
3088characters, all remaining cased characters have lower case.";
3089
3090static PyObject*
3091unicode_title(PyUnicodeObject *self, PyObject *args)
3092{
3093 if (!PyArg_NoArgs(args))
3094 return NULL;
3095 return fixup(self, fixtitle);
3096}
3097
3098static char capitalize__doc__[] =
3099"S.capitalize() -> unicode\n\
3100\n\
3101Return a capitalized version of S, i.e. make the first character\n\
3102have upper case.";
3103
3104static PyObject*
3105unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3106{
3107 if (!PyArg_NoArgs(args))
3108 return NULL;
3109 return fixup(self, fixcapitalize);
3110}
3111
3112#if 0
3113static char capwords__doc__[] =
3114"S.capwords() -> unicode\n\
3115\n\
3116Apply .capitalize() to all words in S and return the result with\n\
3117normalized whitespace (all whitespace strings are replaced by ' ').";
3118
3119static PyObject*
3120unicode_capwords(PyUnicodeObject *self, PyObject *args)
3121{
3122 PyObject *list;
3123 PyObject *item;
3124 int i;
3125
3126 if (!PyArg_NoArgs(args))
3127 return NULL;
3128
3129 /* Split into words */
3130 list = split(self, NULL, -1);
3131 if (!list)
3132 return NULL;
3133
3134 /* Capitalize each word */
3135 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3136 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3137 fixcapitalize);
3138 if (item == NULL)
3139 goto onError;
3140 Py_DECREF(PyList_GET_ITEM(list, i));
3141 PyList_SET_ITEM(list, i, item);
3142 }
3143
3144 /* Join the words to form a new string */
3145 item = PyUnicode_Join(NULL, list);
3146
3147onError:
3148 Py_DECREF(list);
3149 return (PyObject *)item;
3150}
3151#endif
3152
3153static char center__doc__[] =
3154"S.center(width) -> unicode\n\
3155\n\
3156Return S centered in a Unicode string of length width. Padding is done\n\
3157using spaces.";
3158
3159static PyObject *
3160unicode_center(PyUnicodeObject *self, PyObject *args)
3161{
3162 int marg, left;
3163 int width;
3164
3165 if (!PyArg_ParseTuple(args, "i:center", &width))
3166 return NULL;
3167
3168 if (self->length >= width) {
3169 Py_INCREF(self);
3170 return (PyObject*) self;
3171 }
3172
3173 marg = width - self->length;
3174 left = marg / 2 + (marg & width & 1);
3175
3176 return (PyObject*) pad(self, left, marg - left, ' ');
3177}
3178
Marc-André Lemburge5034372000-08-08 08:04:29 +00003179#if 0
3180
3181/* This code should go into some future Unicode collation support
3182 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003183 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003184
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003185/* speedy UTF-16 code point order comparison */
3186/* gleaned from: */
3187/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3188
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003189static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003190{
3191 0, 0, 0, 0, 0, 0, 0, 0,
3192 0, 0, 0, 0, 0, 0, 0, 0,
3193 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003194 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003195};
3196
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197static int
3198unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3199{
3200 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003201
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 Py_UNICODE *s1 = str1->str;
3203 Py_UNICODE *s2 = str2->str;
3204
3205 len1 = str1->length;
3206 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003207
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003209 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003210 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003211
3212 c1 = *s1++;
3213 c2 = *s2++;
3214 if (c1 > (1<<11) * 26)
3215 c1 += utf16Fixup[c1>>11];
3216 if (c2 > (1<<11) * 26)
3217 c2 += utf16Fixup[c2>>11];
3218
3219 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003220 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003221 if (diff)
3222 return (diff < 0) ? -1 : (diff != 0);
3223 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 }
3225
3226 return (len1 < len2) ? -1 : (len1 != len2);
3227}
3228
Marc-André Lemburge5034372000-08-08 08:04:29 +00003229#else
3230
3231static int
3232unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3233{
3234 register int len1, len2;
3235
3236 Py_UNICODE *s1 = str1->str;
3237 Py_UNICODE *s2 = str2->str;
3238
3239 len1 = str1->length;
3240 len2 = str2->length;
3241
3242 while (len1 > 0 && len2 > 0) {
3243 register long diff;
3244
3245 diff = (long)*s1++ - (long)*s2++;
3246 if (diff)
3247 return (diff < 0) ? -1 : (diff != 0);
3248 len1--; len2--;
3249 }
3250
3251 return (len1 < len2) ? -1 : (len1 != len2);
3252}
3253
3254#endif
3255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256int PyUnicode_Compare(PyObject *left,
3257 PyObject *right)
3258{
3259 PyUnicodeObject *u = NULL, *v = NULL;
3260 int result;
3261
3262 /* Coerce the two arguments */
3263 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3264 if (u == NULL)
3265 goto onError;
3266 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3267 if (v == NULL)
3268 goto onError;
3269
Thomas Wouters7e474022000-07-16 12:04:32 +00003270 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 if (v == u) {
3272 Py_DECREF(u);
3273 Py_DECREF(v);
3274 return 0;
3275 }
3276
3277 result = unicode_compare(u, v);
3278
3279 Py_DECREF(u);
3280 Py_DECREF(v);
3281 return result;
3282
3283onError:
3284 Py_XDECREF(u);
3285 Py_XDECREF(v);
3286 return -1;
3287}
3288
Guido van Rossum403d68b2000-03-13 15:55:09 +00003289int PyUnicode_Contains(PyObject *container,
3290 PyObject *element)
3291{
3292 PyUnicodeObject *u = NULL, *v = NULL;
3293 int result;
3294 register const Py_UNICODE *p, *e;
3295 register Py_UNICODE ch;
3296
3297 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003298 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003299 if (v == NULL) {
3300 PyErr_SetString(PyExc_TypeError,
3301 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003302 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003303 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003304 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3305 if (u == NULL) {
3306 Py_DECREF(v);
3307 goto onError;
3308 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003309
3310 /* Check v in u */
3311 if (PyUnicode_GET_SIZE(v) != 1) {
3312 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003313 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003314 goto onError;
3315 }
3316 ch = *PyUnicode_AS_UNICODE(v);
3317 p = PyUnicode_AS_UNICODE(u);
3318 e = p + PyUnicode_GET_SIZE(u);
3319 result = 0;
3320 while (p < e) {
3321 if (*p++ == ch) {
3322 result = 1;
3323 break;
3324 }
3325 }
3326
3327 Py_DECREF(u);
3328 Py_DECREF(v);
3329 return result;
3330
3331onError:
3332 Py_XDECREF(u);
3333 Py_XDECREF(v);
3334 return -1;
3335}
3336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337/* Concat to string or Unicode object giving a new Unicode object. */
3338
3339PyObject *PyUnicode_Concat(PyObject *left,
3340 PyObject *right)
3341{
3342 PyUnicodeObject *u = NULL, *v = NULL, *w;
3343
3344 /* Coerce the two arguments */
3345 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3346 if (u == NULL)
3347 goto onError;
3348 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3349 if (v == NULL)
3350 goto onError;
3351
3352 /* Shortcuts */
3353 if (v == unicode_empty) {
3354 Py_DECREF(v);
3355 return (PyObject *)u;
3356 }
3357 if (u == unicode_empty) {
3358 Py_DECREF(u);
3359 return (PyObject *)v;
3360 }
3361
3362 /* Concat the two Unicode strings */
3363 w = _PyUnicode_New(u->length + v->length);
3364 if (w == NULL)
3365 goto onError;
3366 Py_UNICODE_COPY(w->str, u->str, u->length);
3367 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3368
3369 Py_DECREF(u);
3370 Py_DECREF(v);
3371 return (PyObject *)w;
3372
3373onError:
3374 Py_XDECREF(u);
3375 Py_XDECREF(v);
3376 return NULL;
3377}
3378
3379static char count__doc__[] =
3380"S.count(sub[, start[, end]]) -> int\n\
3381\n\
3382Return the number of occurrences of substring sub in Unicode string\n\
3383S[start:end]. Optional arguments start and end are\n\
3384interpreted as in slice notation.";
3385
3386static PyObject *
3387unicode_count(PyUnicodeObject *self, PyObject *args)
3388{
3389 PyUnicodeObject *substring;
3390 int start = 0;
3391 int end = INT_MAX;
3392 PyObject *result;
3393
Guido van Rossumb8872e62000-05-09 14:14:27 +00003394 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3395 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 return NULL;
3397
3398 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3399 (PyObject *)substring);
3400 if (substring == NULL)
3401 return NULL;
3402
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 if (start < 0)
3404 start += self->length;
3405 if (start < 0)
3406 start = 0;
3407 if (end > self->length)
3408 end = self->length;
3409 if (end < 0)
3410 end += self->length;
3411 if (end < 0)
3412 end = 0;
3413
3414 result = PyInt_FromLong((long) count(self, start, end, substring));
3415
3416 Py_DECREF(substring);
3417 return result;
3418}
3419
3420static char encode__doc__[] =
3421"S.encode([encoding[,errors]]) -> string\n\
3422\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003423Return an encoded string version of S. Default encoding is the current\n\
3424default string encoding. errors may be given to set a different error\n\
3425handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3426a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427
3428static PyObject *
3429unicode_encode(PyUnicodeObject *self, PyObject *args)
3430{
3431 char *encoding = NULL;
3432 char *errors = NULL;
3433 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3434 return NULL;
3435 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3436}
3437
3438static char expandtabs__doc__[] =
3439"S.expandtabs([tabsize]) -> unicode\n\
3440\n\
3441Return a copy of S where all tab characters are expanded using spaces.\n\
3442If tabsize is not given, a tab size of 8 characters is assumed.";
3443
3444static PyObject*
3445unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3446{
3447 Py_UNICODE *e;
3448 Py_UNICODE *p;
3449 Py_UNICODE *q;
3450 int i, j;
3451 PyUnicodeObject *u;
3452 int tabsize = 8;
3453
3454 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3455 return NULL;
3456
Thomas Wouters7e474022000-07-16 12:04:32 +00003457 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 i = j = 0;
3459 e = self->str + self->length;
3460 for (p = self->str; p < e; p++)
3461 if (*p == '\t') {
3462 if (tabsize > 0)
3463 j += tabsize - (j % tabsize);
3464 }
3465 else {
3466 j++;
3467 if (*p == '\n' || *p == '\r') {
3468 i += j;
3469 j = 0;
3470 }
3471 }
3472
3473 /* Second pass: create output string and fill it */
3474 u = _PyUnicode_New(i + j);
3475 if (!u)
3476 return NULL;
3477
3478 j = 0;
3479 q = u->str;
3480
3481 for (p = self->str; p < e; p++)
3482 if (*p == '\t') {
3483 if (tabsize > 0) {
3484 i = tabsize - (j % tabsize);
3485 j += i;
3486 while (i--)
3487 *q++ = ' ';
3488 }
3489 }
3490 else {
3491 j++;
3492 *q++ = *p;
3493 if (*p == '\n' || *p == '\r')
3494 j = 0;
3495 }
3496
3497 return (PyObject*) u;
3498}
3499
3500static char find__doc__[] =
3501"S.find(sub [,start [,end]]) -> int\n\
3502\n\
3503Return the lowest index in S where substring sub is found,\n\
3504such that sub is contained within s[start,end]. Optional\n\
3505arguments start and end are interpreted as in slice notation.\n\
3506\n\
3507Return -1 on failure.";
3508
3509static PyObject *
3510unicode_find(PyUnicodeObject *self, PyObject *args)
3511{
3512 PyUnicodeObject *substring;
3513 int start = 0;
3514 int end = INT_MAX;
3515 PyObject *result;
3516
Guido van Rossumb8872e62000-05-09 14:14:27 +00003517 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3518 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 return NULL;
3520 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3521 (PyObject *)substring);
3522 if (substring == NULL)
3523 return NULL;
3524
3525 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3526
3527 Py_DECREF(substring);
3528 return result;
3529}
3530
3531static PyObject *
3532unicode_getitem(PyUnicodeObject *self, int index)
3533{
3534 if (index < 0 || index >= self->length) {
3535 PyErr_SetString(PyExc_IndexError, "string index out of range");
3536 return NULL;
3537 }
3538
3539 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3540}
3541
3542static long
3543unicode_hash(PyUnicodeObject *self)
3544{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003545 /* Since Unicode objects compare equal to their ASCII string
3546 counterparts, they should use the individual character values
3547 as basis for their hash value. This is needed to assure that
3548 strings and Unicode objects behave in the same way as
3549 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550
Fredrik Lundhdde61642000-07-10 18:27:47 +00003551 register int len;
3552 register Py_UNICODE *p;
3553 register long x;
3554
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 if (self->hash != -1)
3556 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003557 len = PyUnicode_GET_SIZE(self);
3558 p = PyUnicode_AS_UNICODE(self);
3559 x = *p << 7;
3560 while (--len >= 0)
3561 x = (1000003*x) ^ *p++;
3562 x ^= PyUnicode_GET_SIZE(self);
3563 if (x == -1)
3564 x = -2;
3565 self->hash = x;
3566 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567}
3568
3569static char index__doc__[] =
3570"S.index(sub [,start [,end]]) -> int\n\
3571\n\
3572Like S.find() but raise ValueError when the substring is not found.";
3573
3574static PyObject *
3575unicode_index(PyUnicodeObject *self, PyObject *args)
3576{
3577 int result;
3578 PyUnicodeObject *substring;
3579 int start = 0;
3580 int end = INT_MAX;
3581
Guido van Rossumb8872e62000-05-09 14:14:27 +00003582 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 return NULL;
3585
3586 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3587 (PyObject *)substring);
3588 if (substring == NULL)
3589 return NULL;
3590
3591 result = findstring(self, substring, start, end, 1);
3592
3593 Py_DECREF(substring);
3594 if (result < 0) {
3595 PyErr_SetString(PyExc_ValueError, "substring not found");
3596 return NULL;
3597 }
3598 return PyInt_FromLong(result);
3599}
3600
3601static char islower__doc__[] =
3602"S.islower() -> int\n\
3603\n\
3604Return 1 if all cased characters in S are lowercase and there is\n\
3605at least one cased character in S, 0 otherwise.";
3606
3607static PyObject*
3608unicode_islower(PyUnicodeObject *self, PyObject *args)
3609{
3610 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3611 register const Py_UNICODE *e;
3612 int cased;
3613
3614 if (!PyArg_NoArgs(args))
3615 return NULL;
3616
3617 /* Shortcut for single character strings */
3618 if (PyUnicode_GET_SIZE(self) == 1)
3619 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3620
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003621 /* Special case for empty strings */
3622 if (PyString_GET_SIZE(self) == 0)
3623 return PyInt_FromLong(0);
3624
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 e = p + PyUnicode_GET_SIZE(self);
3626 cased = 0;
3627 for (; p < e; p++) {
3628 register const Py_UNICODE ch = *p;
3629
3630 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3631 return PyInt_FromLong(0);
3632 else if (!cased && Py_UNICODE_ISLOWER(ch))
3633 cased = 1;
3634 }
3635 return PyInt_FromLong(cased);
3636}
3637
3638static char isupper__doc__[] =
3639"S.isupper() -> int\n\
3640\n\
3641Return 1 if all cased characters in S are uppercase and there is\n\
3642at least one cased character in S, 0 otherwise.";
3643
3644static PyObject*
3645unicode_isupper(PyUnicodeObject *self, PyObject *args)
3646{
3647 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3648 register const Py_UNICODE *e;
3649 int cased;
3650
3651 if (!PyArg_NoArgs(args))
3652 return NULL;
3653
3654 /* Shortcut for single character strings */
3655 if (PyUnicode_GET_SIZE(self) == 1)
3656 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3657
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003658 /* Special case for empty strings */
3659 if (PyString_GET_SIZE(self) == 0)
3660 return PyInt_FromLong(0);
3661
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 e = p + PyUnicode_GET_SIZE(self);
3663 cased = 0;
3664 for (; p < e; p++) {
3665 register const Py_UNICODE ch = *p;
3666
3667 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3668 return PyInt_FromLong(0);
3669 else if (!cased && Py_UNICODE_ISUPPER(ch))
3670 cased = 1;
3671 }
3672 return PyInt_FromLong(cased);
3673}
3674
3675static char istitle__doc__[] =
3676"S.istitle() -> int\n\
3677\n\
3678Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3679may only follow uncased characters and lowercase characters only cased\n\
3680ones. Return 0 otherwise.";
3681
3682static PyObject*
3683unicode_istitle(PyUnicodeObject *self, PyObject *args)
3684{
3685 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3686 register const Py_UNICODE *e;
3687 int cased, previous_is_cased;
3688
3689 if (!PyArg_NoArgs(args))
3690 return NULL;
3691
3692 /* Shortcut for single character strings */
3693 if (PyUnicode_GET_SIZE(self) == 1)
3694 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3695 (Py_UNICODE_ISUPPER(*p) != 0));
3696
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003697 /* Special case for empty strings */
3698 if (PyString_GET_SIZE(self) == 0)
3699 return PyInt_FromLong(0);
3700
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 e = p + PyUnicode_GET_SIZE(self);
3702 cased = 0;
3703 previous_is_cased = 0;
3704 for (; p < e; p++) {
3705 register const Py_UNICODE ch = *p;
3706
3707 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3708 if (previous_is_cased)
3709 return PyInt_FromLong(0);
3710 previous_is_cased = 1;
3711 cased = 1;
3712 }
3713 else if (Py_UNICODE_ISLOWER(ch)) {
3714 if (!previous_is_cased)
3715 return PyInt_FromLong(0);
3716 previous_is_cased = 1;
3717 cased = 1;
3718 }
3719 else
3720 previous_is_cased = 0;
3721 }
3722 return PyInt_FromLong(cased);
3723}
3724
3725static char isspace__doc__[] =
3726"S.isspace() -> int\n\
3727\n\
3728Return 1 if there are only whitespace characters in S,\n\
37290 otherwise.";
3730
3731static PyObject*
3732unicode_isspace(PyUnicodeObject *self, PyObject *args)
3733{
3734 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3735 register const Py_UNICODE *e;
3736
3737 if (!PyArg_NoArgs(args))
3738 return NULL;
3739
3740 /* Shortcut for single character strings */
3741 if (PyUnicode_GET_SIZE(self) == 1 &&
3742 Py_UNICODE_ISSPACE(*p))
3743 return PyInt_FromLong(1);
3744
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003745 /* Special case for empty strings */
3746 if (PyString_GET_SIZE(self) == 0)
3747 return PyInt_FromLong(0);
3748
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 e = p + PyUnicode_GET_SIZE(self);
3750 for (; p < e; p++) {
3751 if (!Py_UNICODE_ISSPACE(*p))
3752 return PyInt_FromLong(0);
3753 }
3754 return PyInt_FromLong(1);
3755}
3756
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003757static char isalpha__doc__[] =
3758"S.isalpha() -> int\n\
3759\n\
3760Return 1 if all characters in S are alphabetic\n\
3761and there is at least one character in S, 0 otherwise.";
3762
3763static PyObject*
3764unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3765{
3766 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3767 register const Py_UNICODE *e;
3768
3769 if (!PyArg_NoArgs(args))
3770 return NULL;
3771
3772 /* Shortcut for single character strings */
3773 if (PyUnicode_GET_SIZE(self) == 1 &&
3774 Py_UNICODE_ISALPHA(*p))
3775 return PyInt_FromLong(1);
3776
3777 /* Special case for empty strings */
3778 if (PyString_GET_SIZE(self) == 0)
3779 return PyInt_FromLong(0);
3780
3781 e = p + PyUnicode_GET_SIZE(self);
3782 for (; p < e; p++) {
3783 if (!Py_UNICODE_ISALPHA(*p))
3784 return PyInt_FromLong(0);
3785 }
3786 return PyInt_FromLong(1);
3787}
3788
3789static char isalnum__doc__[] =
3790"S.isalnum() -> int\n\
3791\n\
3792Return 1 if all characters in S are alphanumeric\n\
3793and there is at least one character in S, 0 otherwise.";
3794
3795static PyObject*
3796unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3797{
3798 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3799 register const Py_UNICODE *e;
3800
3801 if (!PyArg_NoArgs(args))
3802 return NULL;
3803
3804 /* Shortcut for single character strings */
3805 if (PyUnicode_GET_SIZE(self) == 1 &&
3806 Py_UNICODE_ISALNUM(*p))
3807 return PyInt_FromLong(1);
3808
3809 /* Special case for empty strings */
3810 if (PyString_GET_SIZE(self) == 0)
3811 return PyInt_FromLong(0);
3812
3813 e = p + PyUnicode_GET_SIZE(self);
3814 for (; p < e; p++) {
3815 if (!Py_UNICODE_ISALNUM(*p))
3816 return PyInt_FromLong(0);
3817 }
3818 return PyInt_FromLong(1);
3819}
3820
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821static char isdecimal__doc__[] =
3822"S.isdecimal() -> int\n\
3823\n\
3824Return 1 if there are only decimal characters in S,\n\
38250 otherwise.";
3826
3827static PyObject*
3828unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3829{
3830 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3831 register const Py_UNICODE *e;
3832
3833 if (!PyArg_NoArgs(args))
3834 return NULL;
3835
3836 /* Shortcut for single character strings */
3837 if (PyUnicode_GET_SIZE(self) == 1 &&
3838 Py_UNICODE_ISDECIMAL(*p))
3839 return PyInt_FromLong(1);
3840
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003841 /* Special case for empty strings */
3842 if (PyString_GET_SIZE(self) == 0)
3843 return PyInt_FromLong(0);
3844
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845 e = p + PyUnicode_GET_SIZE(self);
3846 for (; p < e; p++) {
3847 if (!Py_UNICODE_ISDECIMAL(*p))
3848 return PyInt_FromLong(0);
3849 }
3850 return PyInt_FromLong(1);
3851}
3852
3853static char isdigit__doc__[] =
3854"S.isdigit() -> int\n\
3855\n\
3856Return 1 if there are only digit characters in S,\n\
38570 otherwise.";
3858
3859static PyObject*
3860unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3861{
3862 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3863 register const Py_UNICODE *e;
3864
3865 if (!PyArg_NoArgs(args))
3866 return NULL;
3867
3868 /* Shortcut for single character strings */
3869 if (PyUnicode_GET_SIZE(self) == 1 &&
3870 Py_UNICODE_ISDIGIT(*p))
3871 return PyInt_FromLong(1);
3872
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003873 /* Special case for empty strings */
3874 if (PyString_GET_SIZE(self) == 0)
3875 return PyInt_FromLong(0);
3876
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 e = p + PyUnicode_GET_SIZE(self);
3878 for (; p < e; p++) {
3879 if (!Py_UNICODE_ISDIGIT(*p))
3880 return PyInt_FromLong(0);
3881 }
3882 return PyInt_FromLong(1);
3883}
3884
3885static char isnumeric__doc__[] =
3886"S.isnumeric() -> int\n\
3887\n\
3888Return 1 if there are only numeric characters in S,\n\
38890 otherwise.";
3890
3891static PyObject*
3892unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3893{
3894 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3895 register const Py_UNICODE *e;
3896
3897 if (!PyArg_NoArgs(args))
3898 return NULL;
3899
3900 /* Shortcut for single character strings */
3901 if (PyUnicode_GET_SIZE(self) == 1 &&
3902 Py_UNICODE_ISNUMERIC(*p))
3903 return PyInt_FromLong(1);
3904
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003905 /* Special case for empty strings */
3906 if (PyString_GET_SIZE(self) == 0)
3907 return PyInt_FromLong(0);
3908
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 e = p + PyUnicode_GET_SIZE(self);
3910 for (; p < e; p++) {
3911 if (!Py_UNICODE_ISNUMERIC(*p))
3912 return PyInt_FromLong(0);
3913 }
3914 return PyInt_FromLong(1);
3915}
3916
3917static char join__doc__[] =
3918"S.join(sequence) -> unicode\n\
3919\n\
3920Return a string which is the concatenation of the strings in the\n\
3921sequence. The separator between elements is S.";
3922
3923static PyObject*
3924unicode_join(PyUnicodeObject *self, PyObject *args)
3925{
3926 PyObject *data;
3927 if (!PyArg_ParseTuple(args, "O:join", &data))
3928 return NULL;
3929
3930 return PyUnicode_Join((PyObject *)self, data);
3931}
3932
3933static int
3934unicode_length(PyUnicodeObject *self)
3935{
3936 return self->length;
3937}
3938
3939static char ljust__doc__[] =
3940"S.ljust(width) -> unicode\n\
3941\n\
3942Return S left justified in a Unicode string of length width. Padding is\n\
3943done using spaces.";
3944
3945static PyObject *
3946unicode_ljust(PyUnicodeObject *self, PyObject *args)
3947{
3948 int width;
3949 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3950 return NULL;
3951
3952 if (self->length >= width) {
3953 Py_INCREF(self);
3954 return (PyObject*) self;
3955 }
3956
3957 return (PyObject*) pad(self, 0, width - self->length, ' ');
3958}
3959
3960static char lower__doc__[] =
3961"S.lower() -> unicode\n\
3962\n\
3963Return a copy of the string S converted to lowercase.";
3964
3965static PyObject*
3966unicode_lower(PyUnicodeObject *self, PyObject *args)
3967{
3968 if (!PyArg_NoArgs(args))
3969 return NULL;
3970 return fixup(self, fixlower);
3971}
3972
3973static char lstrip__doc__[] =
3974"S.lstrip() -> unicode\n\
3975\n\
3976Return a copy of the string S with leading whitespace removed.";
3977
3978static PyObject *
3979unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3980{
3981 if (!PyArg_NoArgs(args))
3982 return NULL;
3983 return strip(self, 1, 0);
3984}
3985
3986static PyObject*
3987unicode_repeat(PyUnicodeObject *str, int len)
3988{
3989 PyUnicodeObject *u;
3990 Py_UNICODE *p;
3991
3992 if (len < 0)
3993 len = 0;
3994
3995 if (len == 1) {
3996 /* no repeat, return original string */
3997 Py_INCREF(str);
3998 return (PyObject*) str;
3999 }
4000
4001 u = _PyUnicode_New(len * str->length);
4002 if (!u)
4003 return NULL;
4004
4005 p = u->str;
4006
4007 while (len-- > 0) {
4008 Py_UNICODE_COPY(p, str->str, str->length);
4009 p += str->length;
4010 }
4011
4012 return (PyObject*) u;
4013}
4014
4015PyObject *PyUnicode_Replace(PyObject *obj,
4016 PyObject *subobj,
4017 PyObject *replobj,
4018 int maxcount)
4019{
4020 PyObject *self;
4021 PyObject *str1;
4022 PyObject *str2;
4023 PyObject *result;
4024
4025 self = PyUnicode_FromObject(obj);
4026 if (self == NULL)
4027 return NULL;
4028 str1 = PyUnicode_FromObject(subobj);
4029 if (str1 == NULL) {
4030 Py_DECREF(self);
4031 return NULL;
4032 }
4033 str2 = PyUnicode_FromObject(replobj);
4034 if (str2 == NULL) {
4035 Py_DECREF(self);
4036 Py_DECREF(str1);
4037 return NULL;
4038 }
4039 result = replace((PyUnicodeObject *)self,
4040 (PyUnicodeObject *)str1,
4041 (PyUnicodeObject *)str2,
4042 maxcount);
4043 Py_DECREF(self);
4044 Py_DECREF(str1);
4045 Py_DECREF(str2);
4046 return result;
4047}
4048
4049static char replace__doc__[] =
4050"S.replace (old, new[, maxsplit]) -> unicode\n\
4051\n\
4052Return a copy of S with all occurrences of substring\n\
4053old replaced by new. If the optional argument maxsplit is\n\
4054given, only the first maxsplit occurrences are replaced.";
4055
4056static PyObject*
4057unicode_replace(PyUnicodeObject *self, PyObject *args)
4058{
4059 PyUnicodeObject *str1;
4060 PyUnicodeObject *str2;
4061 int maxcount = -1;
4062 PyObject *result;
4063
4064 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4065 return NULL;
4066 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4067 if (str1 == NULL)
4068 return NULL;
4069 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4070 if (str2 == NULL)
4071 return NULL;
4072
4073 result = replace(self, str1, str2, maxcount);
4074
4075 Py_DECREF(str1);
4076 Py_DECREF(str2);
4077 return result;
4078}
4079
4080static
4081PyObject *unicode_repr(PyObject *unicode)
4082{
4083 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4084 PyUnicode_GET_SIZE(unicode),
4085 1);
4086}
4087
4088static char rfind__doc__[] =
4089"S.rfind(sub [,start [,end]]) -> int\n\
4090\n\
4091Return the highest index in S where substring sub is found,\n\
4092such that sub is contained within s[start,end]. Optional\n\
4093arguments start and end are interpreted as in slice notation.\n\
4094\n\
4095Return -1 on failure.";
4096
4097static PyObject *
4098unicode_rfind(PyUnicodeObject *self, PyObject *args)
4099{
4100 PyUnicodeObject *substring;
4101 int start = 0;
4102 int end = INT_MAX;
4103 PyObject *result;
4104
Guido van Rossumb8872e62000-05-09 14:14:27 +00004105 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4106 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 return NULL;
4108 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4109 (PyObject *)substring);
4110 if (substring == NULL)
4111 return NULL;
4112
4113 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4114
4115 Py_DECREF(substring);
4116 return result;
4117}
4118
4119static char rindex__doc__[] =
4120"S.rindex(sub [,start [,end]]) -> int\n\
4121\n\
4122Like S.rfind() but raise ValueError when the substring is not found.";
4123
4124static PyObject *
4125unicode_rindex(PyUnicodeObject *self, PyObject *args)
4126{
4127 int result;
4128 PyUnicodeObject *substring;
4129 int start = 0;
4130 int end = INT_MAX;
4131
Guido van Rossumb8872e62000-05-09 14:14:27 +00004132 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4133 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 return NULL;
4135 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4136 (PyObject *)substring);
4137 if (substring == NULL)
4138 return NULL;
4139
4140 result = findstring(self, substring, start, end, -1);
4141
4142 Py_DECREF(substring);
4143 if (result < 0) {
4144 PyErr_SetString(PyExc_ValueError, "substring not found");
4145 return NULL;
4146 }
4147 return PyInt_FromLong(result);
4148}
4149
4150static char rjust__doc__[] =
4151"S.rjust(width) -> unicode\n\
4152\n\
4153Return S right justified in a Unicode string of length width. Padding is\n\
4154done using spaces.";
4155
4156static PyObject *
4157unicode_rjust(PyUnicodeObject *self, PyObject *args)
4158{
4159 int width;
4160 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4161 return NULL;
4162
4163 if (self->length >= width) {
4164 Py_INCREF(self);
4165 return (PyObject*) self;
4166 }
4167
4168 return (PyObject*) pad(self, width - self->length, 0, ' ');
4169}
4170
4171static char rstrip__doc__[] =
4172"S.rstrip() -> unicode\n\
4173\n\
4174Return a copy of the string S with trailing whitespace removed.";
4175
4176static PyObject *
4177unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4178{
4179 if (!PyArg_NoArgs(args))
4180 return NULL;
4181 return strip(self, 0, 1);
4182}
4183
4184static PyObject*
4185unicode_slice(PyUnicodeObject *self, int start, int end)
4186{
4187 /* standard clamping */
4188 if (start < 0)
4189 start = 0;
4190 if (end < 0)
4191 end = 0;
4192 if (end > self->length)
4193 end = self->length;
4194 if (start == 0 && end == self->length) {
4195 /* full slice, return original string */
4196 Py_INCREF(self);
4197 return (PyObject*) self;
4198 }
4199 if (start > end)
4200 start = end;
4201 /* copy slice */
4202 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4203 end - start);
4204}
4205
4206PyObject *PyUnicode_Split(PyObject *s,
4207 PyObject *sep,
4208 int maxsplit)
4209{
4210 PyObject *result;
4211
4212 s = PyUnicode_FromObject(s);
4213 if (s == NULL)
4214 return NULL;
4215 if (sep != NULL) {
4216 sep = PyUnicode_FromObject(sep);
4217 if (sep == NULL) {
4218 Py_DECREF(s);
4219 return NULL;
4220 }
4221 }
4222
4223 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4224
4225 Py_DECREF(s);
4226 Py_XDECREF(sep);
4227 return result;
4228}
4229
4230static char split__doc__[] =
4231"S.split([sep [,maxsplit]]) -> list of strings\n\
4232\n\
4233Return a list of the words in S, using sep as the\n\
4234delimiter string. If maxsplit is given, at most maxsplit\n\
4235splits are done. If sep is not specified, any whitespace string\n\
4236is a separator.";
4237
4238static PyObject*
4239unicode_split(PyUnicodeObject *self, PyObject *args)
4240{
4241 PyObject *substring = Py_None;
4242 int maxcount = -1;
4243
4244 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4245 return NULL;
4246
4247 if (substring == Py_None)
4248 return split(self, NULL, maxcount);
4249 else if (PyUnicode_Check(substring))
4250 return split(self, (PyUnicodeObject *)substring, maxcount);
4251 else
4252 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4253}
4254
4255static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004256"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257\n\
4258Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004259Line breaks are not included in the resulting list unless keepends\n\
4260is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261
4262static PyObject*
4263unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4264{
Guido van Rossum86662912000-04-11 15:38:46 +00004265 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266
Guido van Rossum86662912000-04-11 15:38:46 +00004267 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 return NULL;
4269
Guido van Rossum86662912000-04-11 15:38:46 +00004270 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271}
4272
4273static
4274PyObject *unicode_str(PyUnicodeObject *self)
4275{
Fred Drakee4315f52000-05-09 19:53:39 +00004276 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277}
4278
4279static char strip__doc__[] =
4280"S.strip() -> unicode\n\
4281\n\
4282Return a copy of S with leading and trailing whitespace removed.";
4283
4284static PyObject *
4285unicode_strip(PyUnicodeObject *self, PyObject *args)
4286{
4287 if (!PyArg_NoArgs(args))
4288 return NULL;
4289 return strip(self, 1, 1);
4290}
4291
4292static char swapcase__doc__[] =
4293"S.swapcase() -> unicode\n\
4294\n\
4295Return a copy of S with uppercase characters converted to lowercase\n\
4296and vice versa.";
4297
4298static PyObject*
4299unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4300{
4301 if (!PyArg_NoArgs(args))
4302 return NULL;
4303 return fixup(self, fixswapcase);
4304}
4305
4306static char translate__doc__[] =
4307"S.translate(table) -> unicode\n\
4308\n\
4309Return a copy of the string S, where all characters have been mapped\n\
4310through the given translation table, which must be a mapping of\n\
4311Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4312are left untouched. Characters mapped to None are deleted.";
4313
4314static PyObject*
4315unicode_translate(PyUnicodeObject *self, PyObject *args)
4316{
4317 PyObject *table;
4318
4319 if (!PyArg_ParseTuple(args, "O:translate", &table))
4320 return NULL;
4321 return PyUnicode_TranslateCharmap(self->str,
4322 self->length,
4323 table,
4324 "ignore");
4325}
4326
4327static char upper__doc__[] =
4328"S.upper() -> unicode\n\
4329\n\
4330Return a copy of S converted to uppercase.";
4331
4332static PyObject*
4333unicode_upper(PyUnicodeObject *self, PyObject *args)
4334{
4335 if (!PyArg_NoArgs(args))
4336 return NULL;
4337 return fixup(self, fixupper);
4338}
4339
4340#if 0
4341static char zfill__doc__[] =
4342"S.zfill(width) -> unicode\n\
4343\n\
4344Pad a numeric string x with zeros on the left, to fill a field\n\
4345of the specified width. The string x is never truncated.";
4346
4347static PyObject *
4348unicode_zfill(PyUnicodeObject *self, PyObject *args)
4349{
4350 int fill;
4351 PyUnicodeObject *u;
4352
4353 int width;
4354 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4355 return NULL;
4356
4357 if (self->length >= width) {
4358 Py_INCREF(self);
4359 return (PyObject*) self;
4360 }
4361
4362 fill = width - self->length;
4363
4364 u = pad(self, fill, 0, '0');
4365
4366 if (u->str[fill] == '+' || u->str[fill] == '-') {
4367 /* move sign to beginning of string */
4368 u->str[0] = u->str[fill];
4369 u->str[fill] = '0';
4370 }
4371
4372 return (PyObject*) u;
4373}
4374#endif
4375
#if 0
/* Debugging aid (compiled out): takes no arguments and returns the current
   value of unicode_freelist_size as a Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4385
/* Docstring for S.startswith(), written as adjacent string literals. */
static char startswith__doc__[] =
"S.startswith(prefix[, start[, end]]) -> int\n"
"\n"
"Return 1 if S starts with the specified prefix, otherwise return 0. With\n"
"optional start, test S beginning at that position. With optional end, stop\n"
"comparing S at that position.";
4392
4393static PyObject *
4394unicode_startswith(PyUnicodeObject *self,
4395 PyObject *args)
4396{
4397 PyUnicodeObject *substring;
4398 int start = 0;
4399 int end = INT_MAX;
4400 PyObject *result;
4401
Guido van Rossumb8872e62000-05-09 14:14:27 +00004402 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4403 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 return NULL;
4405 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4406 (PyObject *)substring);
4407 if (substring == NULL)
4408 return NULL;
4409
4410 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4411
4412 Py_DECREF(substring);
4413 return result;
4414}
4415
4416
/* Docstring for S.endswith(), written as adjacent string literals. */
static char endswith__doc__[] =
"S.endswith(suffix[, start[, end]]) -> int\n"
"\n"
"Return 1 if S ends with the specified suffix, otherwise return 0. With\n"
"optional start, test S beginning at that position. With optional end, stop\n"
"comparing S at that position.";
4423
4424static PyObject *
4425unicode_endswith(PyUnicodeObject *self,
4426 PyObject *args)
4427{
4428 PyUnicodeObject *substring;
4429 int start = 0;
4430 int end = INT_MAX;
4431 PyObject *result;
4432
Guido van Rossumb8872e62000-05-09 14:14:27 +00004433 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4434 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 return NULL;
4436 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4437 (PyObject *)substring);
4438 if (substring == NULL)
4439 return NULL;
4440
4441 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4442
4443 Py_DECREF(substring);
4444 return result;
4445}
4446
4447
4448static PyMethodDef unicode_methods[] = {
4449
4450 /* Order is according to common usage: often used methods should
4451 appear first, since lookup is done sequentially. */
4452
4453 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4454 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4455 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4456 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4457 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4458 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4459 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4460 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4461 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4462 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4463 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4464 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4465 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4466 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4467/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4468 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4469 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4470 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4471 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4472 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4473 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4474 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4475 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4476 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4477 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4478 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4479 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4480 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4481 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4482 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4483 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4484 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4485 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004486 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4487 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488#if 0
4489 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4490 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4491#endif
4492
4493#if 0
4494 /* This one is just used for debugging the implementation. */
4495 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4496#endif
4497
4498 {NULL, NULL}
4499};
4500
4501static PyObject *
4502unicode_getattr(PyUnicodeObject *self, char *name)
4503{
4504 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4505}
4506
4507static PySequenceMethods unicode_as_sequence = {
4508 (inquiry) unicode_length, /* sq_length */
4509 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4510 (intargfunc) unicode_repeat, /* sq_repeat */
4511 (intargfunc) unicode_getitem, /* sq_item */
4512 (intintargfunc) unicode_slice, /* sq_slice */
4513 0, /* sq_ass_item */
4514 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004515 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516};
4517
4518static int
4519unicode_buffer_getreadbuf(PyUnicodeObject *self,
4520 int index,
4521 const void **ptr)
4522{
4523 if (index != 0) {
4524 PyErr_SetString(PyExc_SystemError,
4525 "accessing non-existent unicode segment");
4526 return -1;
4527 }
4528 *ptr = (void *) self->str;
4529 return PyUnicode_GET_DATA_SIZE(self);
4530}
4531
4532static int
4533unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4534 const void **ptr)
4535{
4536 PyErr_SetString(PyExc_TypeError,
4537 "cannot use unicode as modifyable buffer");
4538 return -1;
4539}
4540
4541static int
4542unicode_buffer_getsegcount(PyUnicodeObject *self,
4543 int *lenp)
4544{
4545 if (lenp)
4546 *lenp = PyUnicode_GET_DATA_SIZE(self);
4547 return 1;
4548}
4549
4550static int
4551unicode_buffer_getcharbuf(PyUnicodeObject *self,
4552 int index,
4553 const void **ptr)
4554{
4555 PyObject *str;
4556
4557 if (index != 0) {
4558 PyErr_SetString(PyExc_SystemError,
4559 "accessing non-existent unicode segment");
4560 return -1;
4561 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004562 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 if (str == NULL)
4564 return -1;
4565 *ptr = (void *) PyString_AS_STRING(str);
4566 return PyString_GET_SIZE(str);
4567}
4568
4569/* Helpers for PyUnicode_Format() */
4570
4571static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004572getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573{
4574 int argidx = *p_argidx;
4575 if (argidx < arglen) {
4576 (*p_argidx)++;
4577 if (arglen < 0)
4578 return args;
4579 else
4580 return PyTuple_GetItem(args, argidx);
4581 }
4582 PyErr_SetString(PyExc_TypeError,
4583 "not enough arguments for format string");
4584 return NULL;
4585}
4586
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4592
4593static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595{
4596 register int i;
4597 int len;
4598 va_list va;
4599 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601
4602 /* First, format the string as char array, then expand to Py_UNICODE
4603 array. */
4604 charbuffer = (char *)buffer;
4605 len = vsprintf(charbuffer, format, va);
4606 for (i = len - 1; i >= 0; i--)
4607 buffer[i] = (Py_UNICODE) charbuffer[i];
4608
4609 va_end(va);
4610 return len;
4611}
4612
4613static int
4614formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004615 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 int flags,
4617 int prec,
4618 int type,
4619 PyObject *v)
4620{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004621 /* fmt = '%#.' + `prec` + `type`
4622 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 char fmt[20];
4624 double x;
4625
4626 x = PyFloat_AsDouble(v);
4627 if (x == -1.0 && PyErr_Occurred())
4628 return -1;
4629 if (prec < 0)
4630 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4632 type = 'g';
4633 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004634 /* worst case length calc to ensure no buffer overrun:
4635 fmt = %#.<prec>g
4636 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4637 for any double rep.)
4638 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4639 If prec=0 the effective precision is 1 (the leading digit is
4640 always given), therefore increase by one to 10+prec. */
4641 if (buflen <= (size_t)10 + (size_t)prec) {
4642 PyErr_SetString(PyExc_OverflowError,
4643 "formatted float is too long (precision too long?)");
4644 return -1;
4645 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 return usprintf(buf, fmt, x);
4647}
4648
4649static int
4650formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004651 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 int flags,
4653 int prec,
4654 int type,
4655 PyObject *v)
4656{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004657 /* fmt = '%#.' + `prec` + 'l' + `type`
4658 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 char fmt[20];
4660 long x;
4661
4662 x = PyInt_AsLong(v);
4663 if (x == -1 && PyErr_Occurred())
4664 return -1;
4665 if (prec < 0)
4666 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004667 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4668 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4669 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4670 PyErr_SetString(PyExc_OverflowError,
4671 "formatted integer is too long (precision too long?)");
4672 return -1;
4673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4675 return usprintf(buf, fmt, x);
4676}
4677
4678static int
4679formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004680 size_t buflen,
4681 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004683 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004684 if (PyUnicode_Check(v)) {
4685 if (PyUnicode_GET_SIZE(v) != 1)
4686 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004690 else if (PyString_Check(v)) {
4691 if (PyString_GET_SIZE(v) != 1)
4692 goto onError;
4693 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695
4696 else {
4697 /* Integer input truncated to a character */
4698 long x;
4699 x = PyInt_AsLong(v);
4700 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004701 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 buf[0] = (char) x;
4703 }
4704 buf[1] = '\0';
4705 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004706
4707 onError:
4708 PyErr_SetString(PyExc_TypeError,
4709 "%c requires int or char");
4710 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711}
4712
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004713/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4714
4715 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4716 chars are formatted. XXX This is a magic number. Each formatting
4717 routine does bounds checking to ensure no overflow, but a better
4718 solution may be to malloc a buffer of appropriate size for each
4719 format. For now, the current solution is sufficient.
4720*/
/* Number of Py_UNICODE slots in the scratch buffer used for formatting. */
#define FORMATBUFLEN ((size_t)120)
4722
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723PyObject *PyUnicode_Format(PyObject *format,
4724 PyObject *args)
4725{
4726 Py_UNICODE *fmt, *res;
4727 int fmtcnt, rescnt, reslen, arglen, argidx;
4728 int args_owned = 0;
4729 PyUnicodeObject *result = NULL;
4730 PyObject *dict = NULL;
4731 PyObject *uformat;
4732
4733 if (format == NULL || args == NULL) {
4734 PyErr_BadInternalCall();
4735 return NULL;
4736 }
4737 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004738 if (uformat == NULL)
4739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 fmt = PyUnicode_AS_UNICODE(uformat);
4741 fmtcnt = PyUnicode_GET_SIZE(uformat);
4742
4743 reslen = rescnt = fmtcnt + 100;
4744 result = _PyUnicode_New(reslen);
4745 if (result == NULL)
4746 goto onError;
4747 res = PyUnicode_AS_UNICODE(result);
4748
4749 if (PyTuple_Check(args)) {
4750 arglen = PyTuple_Size(args);
4751 argidx = 0;
4752 }
4753 else {
4754 arglen = -1;
4755 argidx = -2;
4756 }
4757 if (args->ob_type->tp_as_mapping)
4758 dict = args;
4759
4760 while (--fmtcnt >= 0) {
4761 if (*fmt != '%') {
4762 if (--rescnt < 0) {
4763 rescnt = fmtcnt + 100;
4764 reslen += rescnt;
4765 if (_PyUnicode_Resize(result, reslen) < 0)
4766 return NULL;
4767 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4768 --rescnt;
4769 }
4770 *res++ = *fmt++;
4771 }
4772 else {
4773 /* Got a format specifier */
4774 int flags = 0;
4775 int width = -1;
4776 int prec = -1;
4777 int size = 0;
4778 Py_UNICODE c = '\0';
4779 Py_UNICODE fill;
4780 PyObject *v = NULL;
4781 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004782 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 Py_UNICODE sign;
4784 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004785 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786
4787 fmt++;
4788 if (*fmt == '(') {
4789 Py_UNICODE *keystart;
4790 int keylen;
4791 PyObject *key;
4792 int pcount = 1;
4793
4794 if (dict == NULL) {
4795 PyErr_SetString(PyExc_TypeError,
4796 "format requires a mapping");
4797 goto onError;
4798 }
4799 ++fmt;
4800 --fmtcnt;
4801 keystart = fmt;
4802 /* Skip over balanced parentheses */
4803 while (pcount > 0 && --fmtcnt >= 0) {
4804 if (*fmt == ')')
4805 --pcount;
4806 else if (*fmt == '(')
4807 ++pcount;
4808 fmt++;
4809 }
4810 keylen = fmt - keystart - 1;
4811 if (fmtcnt < 0 || pcount > 0) {
4812 PyErr_SetString(PyExc_ValueError,
4813 "incomplete format key");
4814 goto onError;
4815 }
Fred Drakee4315f52000-05-09 19:53:39 +00004816 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 then looked up since Python uses strings to hold
4818 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004819 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 key = PyUnicode_EncodeUTF8(keystart,
4821 keylen,
4822 NULL);
4823 if (key == NULL)
4824 goto onError;
4825 if (args_owned) {
4826 Py_DECREF(args);
4827 args_owned = 0;
4828 }
4829 args = PyObject_GetItem(dict, key);
4830 Py_DECREF(key);
4831 if (args == NULL) {
4832 goto onError;
4833 }
4834 args_owned = 1;
4835 arglen = -1;
4836 argidx = -2;
4837 }
4838 while (--fmtcnt >= 0) {
4839 switch (c = *fmt++) {
4840 case '-': flags |= F_LJUST; continue;
4841 case '+': flags |= F_SIGN; continue;
4842 case ' ': flags |= F_BLANK; continue;
4843 case '#': flags |= F_ALT; continue;
4844 case '0': flags |= F_ZERO; continue;
4845 }
4846 break;
4847 }
4848 if (c == '*') {
4849 v = getnextarg(args, arglen, &argidx);
4850 if (v == NULL)
4851 goto onError;
4852 if (!PyInt_Check(v)) {
4853 PyErr_SetString(PyExc_TypeError,
4854 "* wants int");
4855 goto onError;
4856 }
4857 width = PyInt_AsLong(v);
4858 if (width < 0) {
4859 flags |= F_LJUST;
4860 width = -width;
4861 }
4862 if (--fmtcnt >= 0)
4863 c = *fmt++;
4864 }
4865 else if (c >= '0' && c <= '9') {
4866 width = c - '0';
4867 while (--fmtcnt >= 0) {
4868 c = *fmt++;
4869 if (c < '0' || c > '9')
4870 break;
4871 if ((width*10) / 10 != width) {
4872 PyErr_SetString(PyExc_ValueError,
4873 "width too big");
4874 goto onError;
4875 }
4876 width = width*10 + (c - '0');
4877 }
4878 }
4879 if (c == '.') {
4880 prec = 0;
4881 if (--fmtcnt >= 0)
4882 c = *fmt++;
4883 if (c == '*') {
4884 v = getnextarg(args, arglen, &argidx);
4885 if (v == NULL)
4886 goto onError;
4887 if (!PyInt_Check(v)) {
4888 PyErr_SetString(PyExc_TypeError,
4889 "* wants int");
4890 goto onError;
4891 }
4892 prec = PyInt_AsLong(v);
4893 if (prec < 0)
4894 prec = 0;
4895 if (--fmtcnt >= 0)
4896 c = *fmt++;
4897 }
4898 else if (c >= '0' && c <= '9') {
4899 prec = c - '0';
4900 while (--fmtcnt >= 0) {
4901 c = Py_CHARMASK(*fmt++);
4902 if (c < '0' || c > '9')
4903 break;
4904 if ((prec*10) / 10 != prec) {
4905 PyErr_SetString(PyExc_ValueError,
4906 "prec too big");
4907 goto onError;
4908 }
4909 prec = prec*10 + (c - '0');
4910 }
4911 }
4912 } /* prec */
4913 if (fmtcnt >= 0) {
4914 if (c == 'h' || c == 'l' || c == 'L') {
4915 size = c;
4916 if (--fmtcnt >= 0)
4917 c = *fmt++;
4918 }
4919 }
4920 if (fmtcnt < 0) {
4921 PyErr_SetString(PyExc_ValueError,
4922 "incomplete format");
4923 goto onError;
4924 }
4925 if (c != '%') {
4926 v = getnextarg(args, arglen, &argidx);
4927 if (v == NULL)
4928 goto onError;
4929 }
4930 sign = 0;
4931 fill = ' ';
4932 switch (c) {
4933
4934 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004935 pbuf = formatbuf;
4936 /* presume that buffer length is at least 1 */
4937 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 len = 1;
4939 break;
4940
4941 case 's':
4942 case 'r':
4943 if (PyUnicode_Check(v) && c == 's') {
4944 temp = v;
4945 Py_INCREF(temp);
4946 }
4947 else {
4948 PyObject *unicode;
4949 if (c == 's')
4950 temp = PyObject_Str(v);
4951 else
4952 temp = PyObject_Repr(v);
4953 if (temp == NULL)
4954 goto onError;
4955 if (!PyString_Check(temp)) {
4956 /* XXX Note: this should never happen, since
4957 PyObject_Repr() and PyObject_Str() assure
4958 this */
4959 Py_DECREF(temp);
4960 PyErr_SetString(PyExc_TypeError,
4961 "%s argument has non-string str()");
4962 goto onError;
4963 }
Fred Drakee4315f52000-05-09 19:53:39 +00004964 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004966 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 "strict");
4968 Py_DECREF(temp);
4969 temp = unicode;
4970 if (temp == NULL)
4971 goto onError;
4972 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004973 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 len = PyUnicode_GET_SIZE(temp);
4975 if (prec >= 0 && len > prec)
4976 len = prec;
4977 break;
4978
4979 case 'i':
4980 case 'd':
4981 case 'u':
4982 case 'o':
4983 case 'x':
4984 case 'X':
4985 if (c == 'i')
4986 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004987 pbuf = formatbuf;
4988 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4989 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 if (len < 0)
4991 goto onError;
4992 sign = (c == 'd');
4993 if (flags & F_ZERO) {
4994 fill = '0';
4995 if ((flags&F_ALT) &&
4996 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004997 pbuf[0] == '0' && pbuf[1] == c) {
4998 *res++ = *pbuf++;
4999 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 rescnt -= 2;
5001 len -= 2;
5002 width -= 2;
5003 if (width < 0)
5004 width = 0;
5005 }
5006 }
5007 break;
5008
5009 case 'e':
5010 case 'E':
5011 case 'f':
5012 case 'g':
5013 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005014 pbuf = formatbuf;
5015 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5016 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 if (len < 0)
5018 goto onError;
5019 sign = 1;
5020 if (flags&F_ZERO)
5021 fill = '0';
5022 break;
5023
5024 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005025 pbuf = formatbuf;
5026 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 if (len < 0)
5028 goto onError;
5029 break;
5030
5031 default:
5032 PyErr_Format(PyExc_ValueError,
5033 "unsupported format character '%c' (0x%x)",
5034 c, c);
5035 goto onError;
5036 }
5037 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005038 if (*pbuf == '-' || *pbuf == '+') {
5039 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 len--;
5041 }
5042 else if (flags & F_SIGN)
5043 sign = '+';
5044 else if (flags & F_BLANK)
5045 sign = ' ';
5046 else
5047 sign = 0;
5048 }
5049 if (width < len)
5050 width = len;
5051 if (rescnt < width + (sign != 0)) {
5052 reslen -= rescnt;
5053 rescnt = width + fmtcnt + 100;
5054 reslen += rescnt;
5055 if (_PyUnicode_Resize(result, reslen) < 0)
5056 return NULL;
5057 res = PyUnicode_AS_UNICODE(result)
5058 + reslen - rescnt;
5059 }
5060 if (sign) {
5061 if (fill != ' ')
5062 *res++ = sign;
5063 rescnt--;
5064 if (width > len)
5065 width--;
5066 }
5067 if (width > len && !(flags & F_LJUST)) {
5068 do {
5069 --rescnt;
5070 *res++ = fill;
5071 } while (--width > len);
5072 }
5073 if (sign && fill == ' ')
5074 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005075 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 res += len;
5077 rescnt -= len;
5078 while (--width >= len) {
5079 --rescnt;
5080 *res++ = ' ';
5081 }
5082 if (dict && (argidx < arglen) && c != '%') {
5083 PyErr_SetString(PyExc_TypeError,
5084 "not all arguments converted");
5085 goto onError;
5086 }
5087 Py_XDECREF(temp);
5088 } /* '%' */
5089 } /* until end */
5090 if (argidx < arglen && !dict) {
5091 PyErr_SetString(PyExc_TypeError,
5092 "not all arguments converted");
5093 goto onError;
5094 }
5095
5096 if (args_owned) {
5097 Py_DECREF(args);
5098 }
5099 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005100 if (_PyUnicode_Resize(result, reslen - rescnt))
5101 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 return (PyObject *)result;
5103
5104 onError:
5105 Py_XDECREF(result);
5106 Py_DECREF(uformat);
5107 if (args_owned) {
5108 Py_DECREF(args);
5109 }
5110 return NULL;
5111}
5112
5113static PyBufferProcs unicode_as_buffer = {
5114 (getreadbufferproc) unicode_buffer_getreadbuf,
5115 (getwritebufferproc) unicode_buffer_getwritebuf,
5116 (getsegcountproc) unicode_buffer_getsegcount,
5117 (getcharbufferproc) unicode_buffer_getcharbuf,
5118};
5119
5120PyTypeObject PyUnicode_Type = {
5121 PyObject_HEAD_INIT(&PyType_Type)
5122 0, /* ob_size */
5123 "unicode", /* tp_name */
5124 sizeof(PyUnicodeObject), /* tp_size */
5125 0, /* tp_itemsize */
5126 /* Slots */
5127 (destructor)_PyUnicode_Free, /* tp_dealloc */
5128 0, /* tp_print */
5129 (getattrfunc)unicode_getattr, /* tp_getattr */
5130 0, /* tp_setattr */
5131 (cmpfunc) unicode_compare, /* tp_compare */
5132 (reprfunc) unicode_repr, /* tp_repr */
5133 0, /* tp_as_number */
5134 &unicode_as_sequence, /* tp_as_sequence */
5135 0, /* tp_as_mapping */
5136 (hashfunc) unicode_hash, /* tp_hash*/
5137 0, /* tp_call*/
5138 (reprfunc) unicode_str, /* tp_str */
5139 (getattrofunc) NULL, /* tp_getattro */
5140 (setattrofunc) NULL, /* tp_setattro */
5141 &unicode_as_buffer, /* tp_as_buffer */
5142 Py_TPFLAGS_DEFAULT, /* tp_flags */
5143};
5144
5145/* Initialize the Unicode implementation */
5146
Thomas Wouters78890102000-07-22 19:25:51 +00005147void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148{
5149 /* Doublecheck the configuration... */
5150 if (sizeof(Py_UNICODE) != 2)
5151 Py_FatalError("Unicode configuration error: "
5152 "sizeof(Py_UNICODE) != 2 bytes");
5153
Fred Drakee4315f52000-05-09 19:53:39 +00005154 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005155 unicode_freelist = NULL;
5156 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005158 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159}
5160
5161/* Finalize the Unicode implementation */
5162
5163void
Thomas Wouters78890102000-07-22 19:25:51 +00005164_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165{
5166 PyUnicodeObject *u = unicode_freelist;
5167
5168 while (u != NULL) {
5169 PyUnicodeObject *v = u;
5170 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005171 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005172 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005173 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005174 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005176 unicode_freelist = NULL;
5177 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005179 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180}