blob: 11146ff0dd8e7bfeac7ecac731fbc9ff8dceff8e [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
84/* Limit for the Unicode object free list stay alive optimization.
85
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
89
Barry Warsaw51ac5802000-03-20 16:36:48 +000090 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000091 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000092 malloc()-overhead) bytes of unused garbage.
93
94 Setting the limit to 0 effectively turns the feature off.
95
Guido van Rossumfd4b9572000-04-10 13:51:10 +000096 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
99*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000111/* --- Globals ------------------------------------------------------------
112
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
115
116*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
118/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000119static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000120
121/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124
Fred Drakee4315f52000-05-09 19:53:39 +0000125/* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
127
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
140{
141 void *oldstr;
142
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000143 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
152 }
153
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
162 }
163 unicode->str[length] = 0;
164 unicode->length = length;
165
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000166 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000168 if (unicode->defenc) {
169 Py_DECREF(unicode->defenc);
170 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000171 }
172 unicode->hash = -1;
173
174 return 0;
175}
176
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000177int PyUnicode_Resize(PyObject **unicode,
178 int length)
179{
180 PyUnicodeObject *v;
181
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
190 }
191 return _PyUnicode_Resize(v, length);
192}
193
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194/* We allocate one more byte to make sure the string is
195 Ux0000 terminated -- XXX is this needed ?
196
197 XXX This allocator could further be enhanced by assuring that the
198 free list never reduces its size below 1.
199
200*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205 register PyUnicodeObject *unicode;
206
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
211 }
212
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000216 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000219 /* Keep-Alive optimization: we only upsize the buffer,
220 never downsize it. */
221 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000223 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000224 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000227 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000229 }
230 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 }
232 else {
233 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
234 if (unicode == NULL)
235 return NULL;
236 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
237 }
238
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000239 if (!unicode->str) {
240 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000241 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 unicode->str[length] = 0;
244 unicode->length = length;
245 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000246 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000248
249 onError:
250 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000251 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253}
254
255static
256void _PyUnicode_Free(register PyUnicodeObject *unicode)
257{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000259 /* Keep-Alive optimization */
260 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = NULL;
263 unicode->length = 0;
264 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000265 if (unicode->defenc) {
266 Py_DECREF(unicode->defenc);
267 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000268 }
269 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 *(PyUnicodeObject **)unicode = unicode_freelist;
271 unicode_freelist = unicode;
272 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 }
274 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000275 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000277 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279}
280
281PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
282 int size)
283{
284 PyUnicodeObject *unicode;
285
286 unicode = _PyUnicode_New(size);
287 if (!unicode)
288 return NULL;
289
290 /* Copy the Unicode data into the new object */
291 if (u != NULL)
292 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
293
294 return (PyObject *)unicode;
295}
296
297#ifdef HAVE_WCHAR_H
298
299PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
300 int size)
301{
302 PyUnicodeObject *unicode;
303
304 if (w == NULL) {
305 PyErr_BadInternalCall();
306 return NULL;
307 }
308
309 unicode = _PyUnicode_New(size);
310 if (!unicode)
311 return NULL;
312
313 /* Copy the wchar_t data into the new object */
314#ifdef HAVE_USABLE_WCHAR_T
315 memcpy(unicode->str, w, size * sizeof(wchar_t));
316#else
317 {
318 register Py_UNICODE *u;
319 register int i;
320 u = PyUnicode_AS_UNICODE(unicode);
321 for (i = size; i >= 0; i--)
322 *u++ = *w++;
323 }
324#endif
325
326 return (PyObject *)unicode;
327}
328
329int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
330 register wchar_t *w,
331 int size)
332{
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
336 }
337 if (size > PyUnicode_GET_SIZE(unicode))
338 size = PyUnicode_GET_SIZE(unicode);
339#ifdef HAVE_USABLE_WCHAR_T
340 memcpy(w, unicode->str, size * sizeof(wchar_t));
341#else
342 {
343 register Py_UNICODE *u;
344 register int i;
345 u = PyUnicode_AS_UNICODE(unicode);
346 for (i = size; i >= 0; i--)
347 *w++ = *u++;
348 }
349#endif
350
351 return size;
352}
353
354#endif
355
356PyObject *PyUnicode_FromObject(register PyObject *obj)
357{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000358 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
359}
360
361PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
362 const char *encoding,
363 const char *errors)
364{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 const char *s;
366 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000367 int owned = 0;
368 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370 if (obj == NULL) {
371 PyErr_BadInternalCall();
372 return NULL;
373 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000374
375 /* Coerce object */
376 if (PyInstance_Check(obj)) {
377 PyObject *func;
378 func = PyObject_GetAttrString(obj, "__str__");
379 if (func == NULL) {
380 PyErr_SetString(PyExc_TypeError,
381 "coercing to Unicode: instance doesn't define __str__");
382 return NULL;
383 }
384 obj = PyEval_CallObject(func, NULL);
385 Py_DECREF(func);
386 if (obj == NULL)
387 return NULL;
388 owned = 1;
389 }
390 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000392 v = obj;
393 if (encoding) {
394 PyErr_SetString(PyExc_TypeError,
395 "decoding Unicode is not supported");
396 return NULL;
397 }
398 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400 else if (PyString_Check(obj)) {
401 s = PyString_AS_STRING(obj);
402 len = PyString_GET_SIZE(obj);
403 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000404 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
405 /* Overwrite the error message with something more useful in
406 case of a TypeError. */
407 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000408 PyErr_Format(PyExc_TypeError,
409 "coercing to Unicode: need string or buffer, "
410 "%.80s found",
411 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414
415 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 if (len == 0) {
417 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 else
421 v = PyUnicode_Decode(s, len, encoding, errors);
422 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return v;
427
428 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000429 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000430 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000431 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433}
434
435PyObject *PyUnicode_Decode(const char *s,
436 int size,
437 const char *encoding,
438 const char *errors)
439{
440 PyObject *buffer = NULL, *unicode;
441
Fred Drakee4315f52000-05-09 19:53:39 +0000442 if (encoding == NULL)
443 encoding = PyUnicode_GetDefaultEncoding();
444
445 /* Shortcuts for common default encodings */
446 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000448 else if (strcmp(encoding, "latin-1") == 0)
449 return PyUnicode_DecodeLatin1(s, size, errors);
450 else if (strcmp(encoding, "ascii") == 0)
451 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452
453 /* Decode via the codec registry */
454 buffer = PyBuffer_FromMemory((void *)s, size);
455 if (buffer == NULL)
456 goto onError;
457 unicode = PyCodec_Decode(buffer, encoding, errors);
458 if (unicode == NULL)
459 goto onError;
460 if (!PyUnicode_Check(unicode)) {
461 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000462 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 unicode->ob_type->tp_name);
464 Py_DECREF(unicode);
465 goto onError;
466 }
467 Py_DECREF(buffer);
468 return unicode;
469
470 onError:
471 Py_XDECREF(buffer);
472 return NULL;
473}
474
475PyObject *PyUnicode_Encode(const Py_UNICODE *s,
476 int size,
477 const char *encoding,
478 const char *errors)
479{
480 PyObject *v, *unicode;
481
482 unicode = PyUnicode_FromUnicode(s, size);
483 if (unicode == NULL)
484 return NULL;
485 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
486 Py_DECREF(unicode);
487 return v;
488}
489
490PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
491 const char *encoding,
492 const char *errors)
493{
494 PyObject *v;
495
496 if (!PyUnicode_Check(unicode)) {
497 PyErr_BadArgument();
498 goto onError;
499 }
Fred Drakee4315f52000-05-09 19:53:39 +0000500
501 if (encoding == NULL)
502 encoding = PyUnicode_GetDefaultEncoding();
503
504 /* Shortcuts for common default encodings */
505 if (errors == NULL) {
506 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000508 else if (strcmp(encoding, "latin-1") == 0)
509 return PyUnicode_AsLatin1String(unicode);
510 else if (strcmp(encoding, "ascii") == 0)
511 return PyUnicode_AsASCIIString(unicode);
512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 /* Encode via the codec registry */
515 v = PyCodec_Encode(unicode, encoding, errors);
516 if (v == NULL)
517 goto onError;
518 /* XXX Should we really enforce this ? */
519 if (!PyString_Check(v)) {
520 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000521 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522 v->ob_type->tp_name);
523 Py_DECREF(v);
524 goto onError;
525 }
526 return v;
527
528 onError:
529 return NULL;
530}
531
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000532/* Return a Python string holding the default encoded value of the
533 Unicode object.
534
535 The resulting string is cached in the Unicode object for subsequent
536 usage by this function. The cached version is needed to implement
537 the character buffer interface and will live (at least) as long as
538 the Unicode object itself.
539
540 The refcount of the string is *not* incremented.
541
542 *** Exported for internal use by the interpreter only !!! ***
543
544*/
545
546PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
547 const char *errors)
548{
549 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
550
551 if (v)
552 return v;
553 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
554 if (v && errors == NULL)
555 ((PyUnicodeObject *)unicode)->defenc = v;
556 return v;
557}
558
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
560{
561 if (!PyUnicode_Check(unicode)) {
562 PyErr_BadArgument();
563 goto onError;
564 }
565 return PyUnicode_AS_UNICODE(unicode);
566
567 onError:
568 return NULL;
569}
570
571int PyUnicode_GetSize(PyObject *unicode)
572{
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
576 }
577 return PyUnicode_GET_SIZE(unicode);
578
579 onError:
580 return -1;
581}
582
Thomas Wouters78890102000-07-22 19:25:51 +0000583const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000584{
585 return unicode_default_encoding;
586}
587
588int PyUnicode_SetDefaultEncoding(const char *encoding)
589{
590 PyObject *v;
591
592 /* Make sure the encoding is valid. As side effect, this also
593 loads the encoding into the codec registry cache. */
594 v = _PyCodec_Lookup(encoding);
595 if (v == NULL)
596 goto onError;
597 Py_DECREF(v);
598 strncpy(unicode_default_encoding,
599 encoding,
600 sizeof(unicode_default_encoding));
601 return 0;
602
603 onError:
604 return -1;
605}
606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- UTF-8 Codec -------------------------------------------------------- */
608
/* Maps the first byte of a UTF-8 sequence to the total length of that
   sequence in bytes; 0 marks a byte that cannot start a sequence.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x70 */
    /* 0x80 - 0xbf: illegal as first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xa0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xb0 */
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xc0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xd0 */
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     /* 0xe0 */
    /* 0xf0 - 0xfd: four, five and six byte sequences;
       0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0      /* 0xf0 */
};
630
631static
632int utf8_decoding_error(const char **source,
633 Py_UNICODE **dest,
634 const char *errors,
635 const char *details)
636{
637 if ((errors == NULL) ||
638 (strcmp(errors,"strict") == 0)) {
639 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000640 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641 details);
642 return -1;
643 }
644 else if (strcmp(errors,"ignore") == 0) {
645 (*source)++;
646 return 0;
647 }
648 else if (strcmp(errors,"replace") == 0) {
649 (*source)++;
650 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
651 (*dest)++;
652 return 0;
653 }
654 else {
655 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000656 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 errors);
658 return -1;
659 }
660}
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662PyObject *PyUnicode_DecodeUTF8(const char *s,
663 int size,
664 const char *errors)
665{
666 int n;
667 const char *e;
668 PyUnicodeObject *unicode;
669 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000670 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671
672 /* Note: size will always be longer than the resulting Unicode
673 character count */
674 unicode = _PyUnicode_New(size);
675 if (!unicode)
676 return NULL;
677 if (size == 0)
678 return (PyObject *)unicode;
679
680 /* Unpack UTF-8 encoded data */
681 p = unicode->str;
682 e = s + size;
683
684 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000685 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686
687 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000688 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 s++;
690 continue;
691 }
692
693 n = utf8_code_length[ch];
694
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 if (s + n > e) {
696 errmsg = "unexpected end of data";
697 goto utf8Error;
698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699
700 switch (n) {
701
702 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000703 errmsg = "unexpected code byte";
704 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705 break;
706
707 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000708 errmsg = "internal error";
709 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000710 break;
711
712 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 if ((s[1] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 if (ch < 0x80) {
719 errmsg = "illegal encoding";
720 goto utf8Error;
721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000723 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724 break;
725
726 case 3:
727 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 (s[2] & 0xc0) != 0x80) {
729 errmsg = "invalid data";
730 goto utf8Error;
731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000733 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
734 errmsg = "illegal encoding";
735 goto utf8Error;
736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000738 *p++ = (Py_UNICODE)ch;
739 break;
740
741 case 4:
742 if ((s[1] & 0xc0) != 0x80 ||
743 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 (s[3] & 0xc0) != 0x80) {
745 errmsg = "invalid data";
746 goto utf8Error;
747 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
750 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 if ((ch < 0x10000) || /* minimum value allowed for 4
752 byte encoding */
753 (ch > 0x10ffff)) { /* maximum value allowed for
754 UTF-16 */
755 errmsg = "illegal encoding";
756 goto utf8Error;
757 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000758 /* compute and append the two surrogates: */
759
760 /* translate from 10000..10FFFF to 0..FFFF */
761 ch -= 0x10000;
762
763 /* high surrogate = top 10 bits added to D800 */
764 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
765
766 /* low surrogate = bottom 10 bits added to DC00 */
767 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 break;
769
770 default:
771 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000772 errmsg = "unsupported Unicode code range";
773 goto utf8Error;
774 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775 }
776 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000777 continue;
778
779 utf8Error:
780 if (utf8_decoding_error(&s, &p, errors, errmsg))
781 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 }
783
784 /* Adjust length */
785 if (_PyUnicode_Resize(unicode, p - unicode->str))
786 goto onError;
787
788 return (PyObject *)unicode;
789
790onError:
791 Py_DECREF(unicode);
792 return NULL;
793}
794
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept disabled for reference.

   NULL / "strict": raise UnicodeError with details and return -1.
   "ignore":        return 0 without producing output.
   "replace":       store '?' through *dest, advance *dest, return 0.
   anything else:   raise ValueError and return -1. */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors,"strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors,"ignore") == 0)
        return 0;

    if (strcmp(errors,"replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828
829PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
830 int size,
831 const char *errors)
832{
833 PyObject *v;
834 char *p;
835 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000836 Py_UCS4 ch2;
837 unsigned int cbAllocated = 3 * size;
838 unsigned int cbWritten = 0;
839 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000842 if (v == NULL)
843 return NULL;
844 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000845 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000846
847 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 while (i < size) {
849 Py_UCS4 ch = s[i++];
850 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000852 cbWritten++;
853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 else if (ch < 0x0800) {
855 *p++ = 0xc0 | (ch >> 6);
856 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000857 cbWritten += 2;
858 }
859 else {
860 /* Check for high surrogate */
861 if (0xD800 <= ch && ch <= 0xDBFF) {
862 if (i != size) {
863 ch2 = s[i];
864 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
865
866 if (cbWritten >= (cbAllocated - 4)) {
867 /* Provide enough room for some more
868 surrogates */
869 cbAllocated += 4*10;
870 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000871 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000872 }
873
874 /* combine the two values */
875 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
876
877 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000878 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 i++;
880 cbWritten += 4;
881 }
882 }
883 }
884 else {
885 *p++ = (char)(0xe0 | (ch >> 12));
886 cbWritten += 3;
887 }
888 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
889 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 }
891 }
892 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000893 if (_PyString_Resize(&v, p - q))
894 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895 return v;
896
897 onError:
898 Py_DECREF(v);
899 return NULL;
900}
901
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
903{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 if (!PyUnicode_Check(unicode)) {
905 PyErr_BadArgument();
906 return NULL;
907 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000908 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
909 PyUnicode_GET_SIZE(unicode),
910 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911}
912
913/* --- UTF-16 Codec ------------------------------------------------------- */
914
915static
916int utf16_decoding_error(const Py_UNICODE **source,
917 Py_UNICODE **dest,
918 const char *errors,
919 const char *details)
920{
921 if ((errors == NULL) ||
922 (strcmp(errors,"strict") == 0)) {
923 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000924 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000925 details);
926 return -1;
927 }
928 else if (strcmp(errors,"ignore") == 0) {
929 return 0;
930 }
931 else if (strcmp(errors,"replace") == 0) {
932 if (dest) {
933 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
934 (*dest)++;
935 }
936 return 0;
937 }
938 else {
939 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000940 "UTF-16 decoding error; "
941 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000942 errors);
943 return -1;
944 }
945}
946
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947PyObject *PyUnicode_DecodeUTF16(const char *s,
948 int size,
949 const char *errors,
950 int *byteorder)
951{
952 PyUnicodeObject *unicode;
953 Py_UNICODE *p;
954 const Py_UNICODE *q, *e;
955 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000956 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000957
958 /* size should be an even number */
959 if (size % sizeof(Py_UNICODE) != 0) {
960 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
961 return NULL;
962 /* The remaining input chars are ignored if we fall through
963 here... */
964 }
965
966 /* Note: size will always be longer than the resulting Unicode
967 character count */
968 unicode = _PyUnicode_New(size);
969 if (!unicode)
970 return NULL;
971 if (size == 0)
972 return (PyObject *)unicode;
973
974 /* Unpack UTF-16 encoded data */
975 p = unicode->str;
976 q = (Py_UNICODE *)s;
977 e = q + (size / sizeof(Py_UNICODE));
978
979 if (byteorder)
980 bo = *byteorder;
981
982 while (q < e) {
983 register Py_UNICODE ch = *q++;
984
985 /* Check for BOM marks (U+FEFF) in the input and adjust
986 current byte order setting accordingly. Swap input
987 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
988 !) */
989#ifdef BYTEORDER_IS_LITTLE_ENDIAN
990 if (ch == 0xFEFF) {
991 bo = -1;
992 continue;
993 } else if (ch == 0xFFFE) {
994 bo = 1;
995 continue;
996 }
997 if (bo == 1)
998 ch = (ch >> 8) | (ch << 8);
999#else
1000 if (ch == 0xFEFF) {
1001 bo = 1;
1002 continue;
1003 } else if (ch == 0xFFFE) {
1004 bo = -1;
1005 continue;
1006 }
1007 if (bo == -1)
1008 ch = (ch >> 8) | (ch << 8);
1009#endif
1010 if (ch < 0xD800 || ch > 0xDFFF) {
1011 *p++ = ch;
1012 continue;
1013 }
1014
1015 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001016 if (q >= e) {
1017 errmsg = "unexpected end of data";
1018 goto utf16Error;
1019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020 if (0xDC00 <= *q && *q <= 0xDFFF) {
1021 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001022 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 /* This is valid data (a UTF-16 surrogate pair), but
1024 we are not able to store this information since our
1025 Py_UNICODE type only has 16 bits... this might
1026 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001027 errmsg = "code pairs are not supported";
1028 goto utf16Error;
1029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030 else
1031 continue;
1032 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001033 errmsg = "illegal encoding";
1034 /* Fall through to report the error */
1035
1036 utf16Error:
1037 if (utf16_decoding_error(&q, &p, errors, errmsg))
1038 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 }
1040
1041 if (byteorder)
1042 *byteorder = bo;
1043
1044 /* Adjust length */
1045 if (_PyUnicode_Resize(unicode, p - unicode->str))
1046 goto onError;
1047
1048 return (PyObject *)unicode;
1049
1050onError:
1051 Py_DECREF(unicode);
1052 return NULL;
1053}
1054
1055#undef UTF16_ERROR
1056
1057PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1058 int size,
1059 const char *errors,
1060 int byteorder)
1061{
1062 PyObject *v;
1063 Py_UNICODE *p;
1064 char *q;
1065
1066 /* We don't create UTF-16 pairs... */
1067 v = PyString_FromStringAndSize(NULL,
1068 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1069 if (v == NULL)
1070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
1072 q = PyString_AS_STRING(v);
1073 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 if (byteorder == 0)
1075 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001076 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001077 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 if (byteorder == 0 ||
1079#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1080 byteorder == -1
1081#else
1082 byteorder == 1
1083#endif
1084 )
1085 memcpy(p, s, size * sizeof(Py_UNICODE));
1086 else
1087 while (size-- > 0) {
1088 Py_UNICODE ch = *s++;
1089 *p++ = (ch >> 8) | (ch << 8);
1090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 return v;
1092}
1093
1094PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1095{
1096 if (!PyUnicode_Check(unicode)) {
1097 PyErr_BadArgument();
1098 return NULL;
1099 }
1100 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1101 PyUnicode_GET_SIZE(unicode),
1102 NULL,
1103 0);
1104}
1105
1106/* --- Unicode Escape Codec ----------------------------------------------- */
1107
1108static
1109int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001110 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 const char *errors,
1112 const char *details)
1113{
1114 if ((errors == NULL) ||
1115 (strcmp(errors,"strict") == 0)) {
1116 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001117 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 details);
1119 return -1;
1120 }
1121 else if (strcmp(errors,"ignore") == 0) {
1122 return 0;
1123 }
1124 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001125 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126 return 0;
1127 }
1128 else {
1129 PyErr_Format(PyExc_ValueError,
1130 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001131 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 errors);
1133 return -1;
1134 }
1135}
1136
/* Pointer to the Unicode character name hash API.  It starts out as
   NULL and is filled in by PyUnicode_DecodeUnicodeEscape() the first
   time a \N escape is decoded, from the "ucnhashAPI" attribute of the
   "ucnhash" module. */
static _Py_UCNHashAPI *pucnHash = NULL;
1138
/* Compare at most `count` characters of s1 and s2, ignoring case.

   Returns 0 when the first `count` characters are equal (or when count
   is 0); otherwise the difference between the first pair of lower-cased
   characters that differ.

   Note: the comparison does not stop at a NUL byte, so both buffers must
   hold at least `count` characters unless they differ earlier. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    char c1, c2;

    if (count == 0)
        return 0;

    do {
        /* The <ctype.h> functions are only defined for values
           representable as unsigned char (or EOF); passing a negative
           plain char is undefined behavior, hence the casts. */
        c1 = (char)tolower((unsigned char)*s1++);
        c2 = (char)tolower((unsigned char)*s2++);
    } while (--count && c1 == c2);

    return c1 - c2;
}
1158
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1160 int size,
1161 const char *errors)
1162{
1163 PyUnicodeObject *v;
1164 Py_UNICODE *p = NULL, *buf = NULL;
1165 const char *end;
1166
1167 /* Escaped strings will always be longer than the resulting
1168 Unicode string, so we start with size here and then reduce the
1169 length after conversion to the true value. */
1170 v = _PyUnicode_New(size);
1171 if (v == NULL)
1172 goto onError;
1173 if (size == 0)
1174 return (PyObject *)v;
1175 p = buf = PyUnicode_AS_UNICODE(v);
1176 end = s + size;
1177 while (s < end) {
1178 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001179 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 int i;
1181
1182 /* Non-escape characters are interpreted as Unicode ordinals */
1183 if (*s != '\\') {
1184 *p++ = (unsigned char)*s++;
1185 continue;
1186 }
1187
1188 /* \ - Escapes */
1189 s++;
1190 switch (*s++) {
1191
1192 /* \x escapes */
1193 case '\n': break;
1194 case '\\': *p++ = '\\'; break;
1195 case '\'': *p++ = '\''; break;
1196 case '\"': *p++ = '\"'; break;
1197 case 'b': *p++ = '\b'; break;
1198 case 'f': *p++ = '\014'; break; /* FF */
1199 case 't': *p++ = '\t'; break;
1200 case 'n': *p++ = '\n'; break;
1201 case 'r': *p++ = '\r'; break;
1202 case 'v': *p++ = '\013'; break; /* VT */
1203 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1204
1205 /* \OOO (octal) escapes */
1206 case '0': case '1': case '2': case '3':
1207 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001208 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001210 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001212 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001214 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 break;
1216
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001217 /* \xXXXX escape with 1-n hex digits. for compatibility
1218 with 8-bit strings, this code ignores all but the last
1219 two digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 case 'x':
1221 x = 0;
1222 c = (unsigned char)*s;
1223 if (isxdigit(c)) {
1224 do {
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001225 x = (x<<4) & 0xF0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 if ('0' <= c && c <= '9')
1227 x += c - '0';
1228 else if ('a' <= c && c <= 'f')
1229 x += 10 + c - 'a';
1230 else
1231 x += 10 + c - 'A';
1232 c = (unsigned char)*++s;
1233 } while (isxdigit(c));
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001234 *p++ = (unsigned char) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 } else {
1236 *p++ = '\\';
1237 *p++ = (unsigned char)s[-1];
1238 }
1239 break;
1240
1241 /* \uXXXX with 4 hex digits */
1242 case 'u':
1243 for (x = 0, i = 0; i < 4; i++) {
1244 c = (unsigned char)s[i];
1245 if (!isxdigit(c)) {
1246 if (unicodeescape_decoding_error(&s, &x, errors,
1247 "truncated \\uXXXX"))
1248 goto onError;
1249 i++;
1250 break;
1251 }
1252 x = (x<<4) & ~0xF;
1253 if (c >= '0' && c <= '9')
1254 x += c - '0';
1255 else if (c >= 'a' && c <= 'f')
1256 x += 10 + c - 'a';
1257 else
1258 x += 10 + c - 'A';
1259 }
1260 s += i;
1261 *p++ = x;
1262 break;
1263
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001264 case 'N':
1265 /* Ok, we need to deal with Unicode Character Names now,
1266 * make sure we've imported the hash table data...
1267 */
1268 if (pucnHash == NULL)
1269 {
1270 PyObject *mod = 0, *v = 0;
1271
1272 mod = PyImport_ImportModule("ucnhash");
1273 if (mod == NULL)
1274 goto onError;
1275 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1276 Py_DECREF(mod);
1277 if (v == NULL)
1278 {
1279 goto onError;
1280 }
1281 pucnHash = PyCObject_AsVoidPtr(v);
1282 Py_DECREF(v);
1283 if (pucnHash == NULL)
1284 {
1285 goto onError;
1286 }
1287 }
1288
1289 if (*s == '{')
1290 {
1291 const char *start = s + 1;
1292 const char *endBrace = start;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 Py_UCS4 value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001294 unsigned long j;
1295
1296 /* look for either the closing brace, or we
1297 * exceed the maximum length of the unicode character names
1298 */
1299 while (*endBrace != '}' &&
1300 (unsigned int)(endBrace - start) <=
1301 pucnHash->cchMax &&
1302 endBrace < end)
1303 {
1304 endBrace++;
1305 }
1306 if (endBrace != end && *endBrace == '}')
1307 {
1308 j = pucnHash->hash(start, endBrace - start);
1309 if (j > pucnHash->cKeys ||
1310 mystrnicmp(
1311 start,
1312 ((_Py_UnicodeCharacterName *)
1313 (pucnHash->getValue(j)))->pszUCN,
1314 (int)(endBrace - start)) != 0)
1315 {
1316 if (unicodeescape_decoding_error(
1317 &s, &x, errors,
1318 "Invalid Unicode Character Name"))
1319 {
1320 goto onError;
1321 }
1322 goto ucnFallthrough;
1323 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001324 value = ((_Py_UnicodeCharacterName *)
1325 (pucnHash->getValue(j)))->value;
1326 if (value < 1<<16)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001327 {
1328 /* In UCS-2 range, easy solution.. */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001329 *p++ = value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001330 }
1331 else
1332 {
1333 /* Oops, its in UCS-4 space, */
1334 /* compute and append the two surrogates: */
1335 /* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001336 value -= 0x10000;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001337
1338 /* high surrogate = top 10 bits added to D800 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001339 *p++ = 0xD800 + (value >> 10);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001340
1341 /* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001342 *p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001343 }
1344 s = endBrace + 1;
1345 }
1346 else
1347 {
1348 if (unicodeescape_decoding_error(
1349 &s, &x, errors,
1350 "Unicode name missing closing brace"))
1351 goto onError;
1352 goto ucnFallthrough;
1353 }
1354 break;
1355 }
1356 if (unicodeescape_decoding_error(
1357 &s, &x, errors,
1358 "Missing opening brace for Unicode Character Name escape"))
1359 goto onError;
1360ucnFallthrough:
1361 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001362 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 *p++ = '\\';
1364 *p++ = (unsigned char)s[-1];
1365 break;
1366 }
1367 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001368 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001369 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 return (PyObject *)v;
1371
1372 onError:
1373 Py_XDECREF(v);
1374 return NULL;
1375}
1376
1377/* Return a Unicode-Escape string version of the Unicode object.
1378
1379 If quotes is true, the string is enclosed in u"" or u'' quotes as
1380 appropriate.
1381
1382*/
1383
/* Forward declaration of findchar(), whose definition lies outside this
   part of the file.  unicodeescape_string() below uses its result only
   as a truth value to test whether a quote character occurs in s
   (presumably it returns NULL when ch is not found -- confirm against
   the definition). */
static const Py_UNICODE *findchar(const Py_UNICODE *s,
                                  int size,
                                  Py_UNICODE ch);
1387
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388static
1389PyObject *unicodeescape_string(const Py_UNICODE *s,
1390 int size,
1391 int quotes)
1392{
1393 PyObject *repr;
1394 char *p;
1395 char *q;
1396
1397 static const char *hexdigit = "0123456789ABCDEF";
1398
1399 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1400 if (repr == NULL)
1401 return NULL;
1402
1403 p = q = PyString_AS_STRING(repr);
1404
1405 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 *p++ = 'u';
1407 *p++ = (findchar(s, size, '\'') &&
1408 !findchar(s, size, '"')) ? '"' : '\'';
1409 }
1410 while (size-- > 0) {
1411 Py_UNICODE ch = *s++;
1412 /* Escape quotes */
1413 if (quotes && (ch == q[1] || ch == '\\')) {
1414 *p++ = '\\';
1415 *p++ = (char) ch;
1416 }
1417 /* Map 16-bit characters to '\uxxxx' */
1418 else if (ch >= 256) {
1419 *p++ = '\\';
1420 *p++ = 'u';
1421 *p++ = hexdigit[(ch >> 12) & 0xf];
1422 *p++ = hexdigit[(ch >> 8) & 0xf];
1423 *p++ = hexdigit[(ch >> 4) & 0xf];
1424 *p++ = hexdigit[ch & 15];
1425 }
1426 /* Map non-printable US ASCII to '\ooo' */
1427 else if (ch < ' ' || ch >= 128) {
1428 *p++ = '\\';
1429 *p++ = hexdigit[(ch >> 6) & 7];
1430 *p++ = hexdigit[(ch >> 3) & 7];
1431 *p++ = hexdigit[ch & 7];
1432 }
1433 /* Copy everything else as-is */
1434 else
1435 *p++ = (char) ch;
1436 }
1437 if (quotes)
1438 *p++ = q[1];
1439
1440 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001441 if (_PyString_Resize(&repr, p - q))
1442 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443
1444 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001445
1446 onError:
1447 Py_DECREF(repr);
1448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001449}
1450
1451PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1452 int size)
1453{
1454 return unicodeescape_string(s, size, 0);
1455}
1456
1457PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1458{
1459 if (!PyUnicode_Check(unicode)) {
1460 PyErr_BadArgument();
1461 return NULL;
1462 }
1463 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1464 PyUnicode_GET_SIZE(unicode));
1465}
1466
1467/* --- Raw Unicode Escape Codec ------------------------------------------- */
1468
1469PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1470 int size,
1471 const char *errors)
1472{
1473 PyUnicodeObject *v;
1474 Py_UNICODE *p, *buf;
1475 const char *end;
1476 const char *bs;
1477
1478 /* Escaped strings will always be longer than the resulting
1479 Unicode string, so we start with size here and then reduce the
1480 length after conversion to the true value. */
1481 v = _PyUnicode_New(size);
1482 if (v == NULL)
1483 goto onError;
1484 if (size == 0)
1485 return (PyObject *)v;
1486 p = buf = PyUnicode_AS_UNICODE(v);
1487 end = s + size;
1488 while (s < end) {
1489 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001490 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 int i;
1492
1493 /* Non-escape characters are interpreted as Unicode ordinals */
1494 if (*s != '\\') {
1495 *p++ = (unsigned char)*s++;
1496 continue;
1497 }
1498
1499 /* \u-escapes are only interpreted iff the number of leading
1500 backslashes if odd */
1501 bs = s;
1502 for (;s < end;) {
1503 if (*s != '\\')
1504 break;
1505 *p++ = (unsigned char)*s++;
1506 }
1507 if (((s - bs) & 1) == 0 ||
1508 s >= end ||
1509 *s != 'u') {
1510 continue;
1511 }
1512 p--;
1513 s++;
1514
1515 /* \uXXXX with 4 hex digits */
1516 for (x = 0, i = 0; i < 4; i++) {
1517 c = (unsigned char)s[i];
1518 if (!isxdigit(c)) {
1519 if (unicodeescape_decoding_error(&s, &x, errors,
1520 "truncated \\uXXXX"))
1521 goto onError;
1522 i++;
1523 break;
1524 }
1525 x = (x<<4) & ~0xF;
1526 if (c >= '0' && c <= '9')
1527 x += c - '0';
1528 else if (c >= 'a' && c <= 'f')
1529 x += 10 + c - 'a';
1530 else
1531 x += 10 + c - 'A';
1532 }
1533 s += i;
1534 *p++ = x;
1535 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001536 if (_PyUnicode_Resize(v, (int)(p - buf)))
1537 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 return (PyObject *)v;
1539
1540 onError:
1541 Py_XDECREF(v);
1542 return NULL;
1543}
1544
1545PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1546 int size)
1547{
1548 PyObject *repr;
1549 char *p;
1550 char *q;
1551
1552 static const char *hexdigit = "0123456789ABCDEF";
1553
1554 repr = PyString_FromStringAndSize(NULL, 6 * size);
1555 if (repr == NULL)
1556 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001557 if (size == 0)
1558 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559
1560 p = q = PyString_AS_STRING(repr);
1561 while (size-- > 0) {
1562 Py_UNICODE ch = *s++;
1563 /* Map 16-bit characters to '\uxxxx' */
1564 if (ch >= 256) {
1565 *p++ = '\\';
1566 *p++ = 'u';
1567 *p++ = hexdigit[(ch >> 12) & 0xf];
1568 *p++ = hexdigit[(ch >> 8) & 0xf];
1569 *p++ = hexdigit[(ch >> 4) & 0xf];
1570 *p++ = hexdigit[ch & 15];
1571 }
1572 /* Copy everything else as-is */
1573 else
1574 *p++ = (char) ch;
1575 }
1576 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001577 if (_PyString_Resize(&repr, p - q))
1578 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579
1580 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001581
1582 onError:
1583 Py_DECREF(repr);
1584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585}
1586
1587PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1588{
1589 if (!PyUnicode_Check(unicode)) {
1590 PyErr_BadArgument();
1591 return NULL;
1592 }
1593 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1594 PyUnicode_GET_SIZE(unicode));
1595}
1596
1597/* --- Latin-1 Codec ------------------------------------------------------ */
1598
1599PyObject *PyUnicode_DecodeLatin1(const char *s,
1600 int size,
1601 const char *errors)
1602{
1603 PyUnicodeObject *v;
1604 Py_UNICODE *p;
1605
1606 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1607 v = _PyUnicode_New(size);
1608 if (v == NULL)
1609 goto onError;
1610 if (size == 0)
1611 return (PyObject *)v;
1612 p = PyUnicode_AS_UNICODE(v);
1613 while (size-- > 0)
1614 *p++ = (unsigned char)*s++;
1615 return (PyObject *)v;
1616
1617 onError:
1618 Py_XDECREF(v);
1619 return NULL;
1620}
1621
1622static
1623int latin1_encoding_error(const Py_UNICODE **source,
1624 char **dest,
1625 const char *errors,
1626 const char *details)
1627{
1628 if ((errors == NULL) ||
1629 (strcmp(errors,"strict") == 0)) {
1630 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001631 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632 details);
1633 return -1;
1634 }
1635 else if (strcmp(errors,"ignore") == 0) {
1636 return 0;
1637 }
1638 else if (strcmp(errors,"replace") == 0) {
1639 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001640 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 return 0;
1642 }
1643 else {
1644 PyErr_Format(PyExc_ValueError,
1645 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001646 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 errors);
1648 return -1;
1649 }
1650}
1651
1652PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1653 int size,
1654 const char *errors)
1655{
1656 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001657 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001658
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 repr = PyString_FromStringAndSize(NULL, size);
1660 if (repr == NULL)
1661 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001662 if (size == 0)
1663 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664
1665 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001666 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667 while (size-- > 0) {
1668 Py_UNICODE ch = *p++;
1669 if (ch >= 256) {
1670 if (latin1_encoding_error(&p, &s, errors,
1671 "ordinal not in range(256)"))
1672 goto onError;
1673 }
1674 else
1675 *s++ = (char)ch;
1676 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001677 /* Resize if error handling skipped some characters */
1678 if (s - start < PyString_GET_SIZE(repr))
1679 if (_PyString_Resize(&repr, s - start))
1680 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return repr;
1682
1683 onError:
1684 Py_DECREF(repr);
1685 return NULL;
1686}
1687
1688PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1689{
1690 if (!PyUnicode_Check(unicode)) {
1691 PyErr_BadArgument();
1692 return NULL;
1693 }
1694 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1695 PyUnicode_GET_SIZE(unicode),
1696 NULL);
1697}
1698
1699/* --- 7-bit ASCII Codec -------------------------------------------------- */
1700
1701static
1702int ascii_decoding_error(const char **source,
1703 Py_UNICODE **dest,
1704 const char *errors,
1705 const char *details)
1706{
1707 if ((errors == NULL) ||
1708 (strcmp(errors,"strict") == 0)) {
1709 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001710 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 details);
1712 return -1;
1713 }
1714 else if (strcmp(errors,"ignore") == 0) {
1715 return 0;
1716 }
1717 else if (strcmp(errors,"replace") == 0) {
1718 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1719 (*dest)++;
1720 return 0;
1721 }
1722 else {
1723 PyErr_Format(PyExc_ValueError,
1724 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001725 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 errors);
1727 return -1;
1728 }
1729}
1730
1731PyObject *PyUnicode_DecodeASCII(const char *s,
1732 int size,
1733 const char *errors)
1734{
1735 PyUnicodeObject *v;
1736 Py_UNICODE *p;
1737
1738 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1739 v = _PyUnicode_New(size);
1740 if (v == NULL)
1741 goto onError;
1742 if (size == 0)
1743 return (PyObject *)v;
1744 p = PyUnicode_AS_UNICODE(v);
1745 while (size-- > 0) {
1746 register unsigned char c;
1747
1748 c = (unsigned char)*s++;
1749 if (c < 128)
1750 *p++ = c;
1751 else if (ascii_decoding_error(&s, &p, errors,
1752 "ordinal not in range(128)"))
1753 goto onError;
1754 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001755 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1756 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1757 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 return (PyObject *)v;
1759
1760 onError:
1761 Py_XDECREF(v);
1762 return NULL;
1763}
1764
1765static
1766int ascii_encoding_error(const Py_UNICODE **source,
1767 char **dest,
1768 const char *errors,
1769 const char *details)
1770{
1771 if ((errors == NULL) ||
1772 (strcmp(errors,"strict") == 0)) {
1773 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001774 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 details);
1776 return -1;
1777 }
1778 else if (strcmp(errors,"ignore") == 0) {
1779 return 0;
1780 }
1781 else if (strcmp(errors,"replace") == 0) {
1782 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001783 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 return 0;
1785 }
1786 else {
1787 PyErr_Format(PyExc_ValueError,
1788 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001789 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 errors);
1791 return -1;
1792 }
1793}
1794
1795PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1796 int size,
1797 const char *errors)
1798{
1799 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001800 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 repr = PyString_FromStringAndSize(NULL, size);
1803 if (repr == NULL)
1804 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001805 if (size == 0)
1806 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807
1808 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001809 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 while (size-- > 0) {
1811 Py_UNICODE ch = *p++;
1812 if (ch >= 128) {
1813 if (ascii_encoding_error(&p, &s, errors,
1814 "ordinal not in range(128)"))
1815 goto onError;
1816 }
1817 else
1818 *s++ = (char)ch;
1819 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001820 /* Resize if error handling skipped some characters */
1821 if (s - start < PyString_GET_SIZE(repr))
1822 if (_PyString_Resize(&repr, s - start))
1823 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824 return repr;
1825
1826 onError:
1827 Py_DECREF(repr);
1828 return NULL;
1829}
1830
1831PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1832{
1833 if (!PyUnicode_Check(unicode)) {
1834 PyErr_BadArgument();
1835 return NULL;
1836 }
1837 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1838 PyUnicode_GET_SIZE(unicode),
1839 NULL);
1840}
1841
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001842#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001843
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001844/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001845
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001846PyObject *PyUnicode_DecodeMBCS(const char *s,
1847 int size,
1848 const char *errors)
1849{
1850 PyUnicodeObject *v;
1851 Py_UNICODE *p;
1852
1853 /* First get the size of the result */
1854 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001855 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001856 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1857
1858 v = _PyUnicode_New(usize);
1859 if (v == NULL)
1860 return NULL;
1861 if (usize == 0)
1862 return (PyObject *)v;
1863 p = PyUnicode_AS_UNICODE(v);
1864 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1865 Py_DECREF(v);
1866 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1867 }
1868
1869 return (PyObject *)v;
1870}
1871
1872PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1873 int size,
1874 const char *errors)
1875{
1876 PyObject *repr;
1877 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001878 DWORD mbcssize;
1879
1880 /* If there are no characters, bail now! */
1881 if (size==0)
1882 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001883
1884 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001885 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001886 if (mbcssize==0)
1887 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1888
1889 repr = PyString_FromStringAndSize(NULL, mbcssize);
1890 if (repr == NULL)
1891 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001892 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001893 return repr;
1894
1895 /* Do the conversion */
1896 s = PyString_AS_STRING(repr);
1897 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1898 Py_DECREF(repr);
1899 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1900 }
1901 return repr;
1902}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001903
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001904#endif /* MS_WIN32 */
1905
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906/* --- Character Mapping Codec -------------------------------------------- */
1907
1908static
1909int charmap_decoding_error(const char **source,
1910 Py_UNICODE **dest,
1911 const char *errors,
1912 const char *details)
1913{
1914 if ((errors == NULL) ||
1915 (strcmp(errors,"strict") == 0)) {
1916 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001917 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001918 details);
1919 return -1;
1920 }
1921 else if (strcmp(errors,"ignore") == 0) {
1922 return 0;
1923 }
1924 else if (strcmp(errors,"replace") == 0) {
1925 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1926 (*dest)++;
1927 return 0;
1928 }
1929 else {
1930 PyErr_Format(PyExc_ValueError,
1931 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001932 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 errors);
1934 return -1;
1935 }
1936}
1937
1938PyObject *PyUnicode_DecodeCharmap(const char *s,
1939 int size,
1940 PyObject *mapping,
1941 const char *errors)
1942{
1943 PyUnicodeObject *v;
1944 Py_UNICODE *p;
1945
1946 /* Default to Latin-1 */
1947 if (mapping == NULL)
1948 return PyUnicode_DecodeLatin1(s, size, errors);
1949
1950 v = _PyUnicode_New(size);
1951 if (v == NULL)
1952 goto onError;
1953 if (size == 0)
1954 return (PyObject *)v;
1955 p = PyUnicode_AS_UNICODE(v);
1956 while (size-- > 0) {
1957 unsigned char ch = *s++;
1958 PyObject *w, *x;
1959
1960 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1961 w = PyInt_FromLong((long)ch);
1962 if (w == NULL)
1963 goto onError;
1964 x = PyObject_GetItem(mapping, w);
1965 Py_DECREF(w);
1966 if (x == NULL) {
1967 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1968 /* No mapping found: default to Latin-1 mapping */
1969 PyErr_Clear();
1970 *p++ = (Py_UNICODE)ch;
1971 continue;
1972 }
1973 goto onError;
1974 }
1975
1976 /* Apply mapping */
1977 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001978 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 if (value < 0 || value > 65535) {
1980 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001981 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 Py_DECREF(x);
1983 goto onError;
1984 }
1985 *p++ = (Py_UNICODE)value;
1986 }
1987 else if (x == Py_None) {
1988 /* undefined mapping */
1989 if (charmap_decoding_error(&s, &p, errors,
1990 "character maps to <undefined>")) {
1991 Py_DECREF(x);
1992 goto onError;
1993 }
1994 }
1995 else if (PyUnicode_Check(x)) {
1996 if (PyUnicode_GET_SIZE(x) != 1) {
1997 /* 1-n mapping */
1998 PyErr_SetString(PyExc_NotImplementedError,
1999 "1-n mappings are currently not implemented");
2000 Py_DECREF(x);
2001 goto onError;
2002 }
2003 *p++ = *PyUnicode_AS_UNICODE(x);
2004 }
2005 else {
2006 /* wrong return value */
2007 PyErr_SetString(PyExc_TypeError,
2008 "character mapping must return integer, None or unicode");
2009 Py_DECREF(x);
2010 goto onError;
2011 }
2012 Py_DECREF(x);
2013 }
2014 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2015 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2016 goto onError;
2017 return (PyObject *)v;
2018
2019 onError:
2020 Py_XDECREF(v);
2021 return NULL;
2022}
2023
2024static
2025int charmap_encoding_error(const Py_UNICODE **source,
2026 char **dest,
2027 const char *errors,
2028 const char *details)
2029{
2030 if ((errors == NULL) ||
2031 (strcmp(errors,"strict") == 0)) {
2032 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002033 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034 details);
2035 return -1;
2036 }
2037 else if (strcmp(errors,"ignore") == 0) {
2038 return 0;
2039 }
2040 else if (strcmp(errors,"replace") == 0) {
2041 **dest = '?';
2042 (*dest)++;
2043 return 0;
2044 }
2045 else {
2046 PyErr_Format(PyExc_ValueError,
2047 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002048 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 errors);
2050 return -1;
2051 }
2052}
2053
2054PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2055 int size,
2056 PyObject *mapping,
2057 const char *errors)
2058{
2059 PyObject *v;
2060 char *s;
2061
2062 /* Default to Latin-1 */
2063 if (mapping == NULL)
2064 return PyUnicode_EncodeLatin1(p, size, errors);
2065
2066 v = PyString_FromStringAndSize(NULL, size);
2067 if (v == NULL)
2068 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002069 if (size == 0)
2070 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 s = PyString_AS_STRING(v);
2072 while (size-- > 0) {
2073 Py_UNICODE ch = *p++;
2074 PyObject *w, *x;
2075
2076 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2077 w = PyInt_FromLong((long)ch);
2078 if (w == NULL)
2079 goto onError;
2080 x = PyObject_GetItem(mapping, w);
2081 Py_DECREF(w);
2082 if (x == NULL) {
2083 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2084 /* No mapping found: default to Latin-1 mapping if possible */
2085 PyErr_Clear();
2086 if (ch < 256) {
2087 *s++ = (char)ch;
2088 continue;
2089 }
2090 else if (!charmap_encoding_error(&p, &s, errors,
2091 "missing character mapping"))
2092 continue;
2093 }
2094 goto onError;
2095 }
2096
2097 /* Apply mapping */
2098 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002099 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 if (value < 0 || value > 255) {
2101 PyErr_SetString(PyExc_TypeError,
2102 "character mapping must be in range(256)");
2103 Py_DECREF(x);
2104 goto onError;
2105 }
2106 *s++ = (char)value;
2107 }
2108 else if (x == Py_None) {
2109 /* undefined mapping */
2110 if (charmap_encoding_error(&p, &s, errors,
2111 "character maps to <undefined>")) {
2112 Py_DECREF(x);
2113 goto onError;
2114 }
2115 }
2116 else if (PyString_Check(x)) {
2117 if (PyString_GET_SIZE(x) != 1) {
2118 /* 1-n mapping */
2119 PyErr_SetString(PyExc_NotImplementedError,
2120 "1-n mappings are currently not implemented");
2121 Py_DECREF(x);
2122 goto onError;
2123 }
2124 *s++ = *PyString_AS_STRING(x);
2125 }
2126 else {
2127 /* wrong return value */
2128 PyErr_SetString(PyExc_TypeError,
2129 "character mapping must return integer, None or unicode");
2130 Py_DECREF(x);
2131 goto onError;
2132 }
2133 Py_DECREF(x);
2134 }
2135 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2136 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2137 goto onError;
2138 return v;
2139
2140 onError:
2141 Py_DECREF(v);
2142 return NULL;
2143}
2144
2145PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2146 PyObject *mapping)
2147{
2148 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2149 PyErr_BadArgument();
2150 return NULL;
2151 }
2152 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2153 PyUnicode_GET_SIZE(unicode),
2154 mapping,
2155 NULL);
2156}
2157
2158static
2159int translate_error(const Py_UNICODE **source,
2160 Py_UNICODE **dest,
2161 const char *errors,
2162 const char *details)
2163{
2164 if ((errors == NULL) ||
2165 (strcmp(errors,"strict") == 0)) {
2166 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002167 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 details);
2169 return -1;
2170 }
2171 else if (strcmp(errors,"ignore") == 0) {
2172 return 0;
2173 }
2174 else if (strcmp(errors,"replace") == 0) {
2175 **dest = '?';
2176 (*dest)++;
2177 return 0;
2178 }
2179 else {
2180 PyErr_Format(PyExc_ValueError,
2181 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002182 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 errors);
2184 return -1;
2185 }
2186}
2187
2188PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2189 int size,
2190 PyObject *mapping,
2191 const char *errors)
2192{
2193 PyUnicodeObject *v;
2194 Py_UNICODE *p;
2195
2196 if (mapping == NULL) {
2197 PyErr_BadArgument();
2198 return NULL;
2199 }
2200
2201 /* Output will never be longer than input */
2202 v = _PyUnicode_New(size);
2203 if (v == NULL)
2204 goto onError;
2205 if (size == 0)
2206 goto done;
2207 p = PyUnicode_AS_UNICODE(v);
2208 while (size-- > 0) {
2209 Py_UNICODE ch = *s++;
2210 PyObject *w, *x;
2211
2212 /* Get mapping */
2213 w = PyInt_FromLong(ch);
2214 if (w == NULL)
2215 goto onError;
2216 x = PyObject_GetItem(mapping, w);
2217 Py_DECREF(w);
2218 if (x == NULL) {
2219 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2220 /* No mapping found: default to 1-1 mapping */
2221 PyErr_Clear();
2222 *p++ = ch;
2223 continue;
2224 }
2225 goto onError;
2226 }
2227
2228 /* Apply mapping */
2229 if (PyInt_Check(x))
2230 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2231 else if (x == Py_None) {
2232 /* undefined mapping */
2233 if (translate_error(&s, &p, errors,
2234 "character maps to <undefined>")) {
2235 Py_DECREF(x);
2236 goto onError;
2237 }
2238 }
2239 else if (PyUnicode_Check(x)) {
2240 if (PyUnicode_GET_SIZE(x) != 1) {
2241 /* 1-n mapping */
2242 PyErr_SetString(PyExc_NotImplementedError,
2243 "1-n mappings are currently not implemented");
2244 Py_DECREF(x);
2245 goto onError;
2246 }
2247 *p++ = *PyUnicode_AS_UNICODE(x);
2248 }
2249 else {
2250 /* wrong return value */
2251 PyErr_SetString(PyExc_TypeError,
2252 "translate mapping must return integer, None or unicode");
2253 Py_DECREF(x);
2254 goto onError;
2255 }
2256 Py_DECREF(x);
2257 }
2258 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002259 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261
2262 done:
2263 return (PyObject *)v;
2264
2265 onError:
2266 Py_XDECREF(v);
2267 return NULL;
2268}
2269
2270PyObject *PyUnicode_Translate(PyObject *str,
2271 PyObject *mapping,
2272 const char *errors)
2273{
2274 PyObject *result;
2275
2276 str = PyUnicode_FromObject(str);
2277 if (str == NULL)
2278 goto onError;
2279 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2280 PyUnicode_GET_SIZE(str),
2281 mapping,
2282 errors);
2283 Py_DECREF(str);
2284 return result;
2285
2286 onError:
2287 Py_XDECREF(str);
2288 return NULL;
2289}
2290
Guido van Rossum9e896b32000-04-05 20:11:21 +00002291/* --- Decimal Encoder ---------------------------------------------------- */
2292
2293int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2294 int length,
2295 char *output,
2296 const char *errors)
2297{
2298 Py_UNICODE *p, *end;
2299
2300 if (output == NULL) {
2301 PyErr_BadArgument();
2302 return -1;
2303 }
2304
2305 p = s;
2306 end = s + length;
2307 while (p < end) {
2308 register Py_UNICODE ch = *p++;
2309 int decimal;
2310
2311 if (Py_UNICODE_ISSPACE(ch)) {
2312 *output++ = ' ';
2313 continue;
2314 }
2315 decimal = Py_UNICODE_TODECIMAL(ch);
2316 if (decimal >= 0) {
2317 *output++ = '0' + decimal;
2318 continue;
2319 }
Guido van Rossumba477042000-04-06 18:18:10 +00002320 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002321 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002322 continue;
2323 }
2324 /* All other characters are considered invalid */
2325 if (errors == NULL || strcmp(errors, "strict") == 0) {
2326 PyErr_SetString(PyExc_ValueError,
2327 "invalid decimal Unicode string");
2328 goto onError;
2329 }
2330 else if (strcmp(errors, "ignore") == 0)
2331 continue;
2332 else if (strcmp(errors, "replace") == 0) {
2333 *output++ = '?';
2334 continue;
2335 }
2336 }
2337 /* 0-terminate the output string */
2338 *output++ = '\0';
2339 return 0;
2340
2341 onError:
2342 return -1;
2343}
2344
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345/* --- Helpers ------------------------------------------------------------ */
2346
2347static
2348int count(PyUnicodeObject *self,
2349 int start,
2350 int end,
2351 PyUnicodeObject *substring)
2352{
2353 int count = 0;
2354
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002355 if (substring->length == 0)
2356 return (end - start + 1);
2357
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358 end -= substring->length;
2359
2360 while (start <= end)
2361 if (Py_UNICODE_MATCH(self, start, substring)) {
2362 count++;
2363 start += substring->length;
2364 } else
2365 start++;
2366
2367 return count;
2368}
2369
2370int PyUnicode_Count(PyObject *str,
2371 PyObject *substr,
2372 int start,
2373 int end)
2374{
2375 int result;
2376
2377 str = PyUnicode_FromObject(str);
2378 if (str == NULL)
2379 return -1;
2380 substr = PyUnicode_FromObject(substr);
2381 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002382 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383 return -1;
2384 }
2385
2386 result = count((PyUnicodeObject *)str,
2387 start, end,
2388 (PyUnicodeObject *)substr);
2389
2390 Py_DECREF(str);
2391 Py_DECREF(substr);
2392 return result;
2393}
2394
2395static
2396int findstring(PyUnicodeObject *self,
2397 PyUnicodeObject *substring,
2398 int start,
2399 int end,
2400 int direction)
2401{
2402 if (start < 0)
2403 start += self->length;
2404 if (start < 0)
2405 start = 0;
2406
2407 if (substring->length == 0)
2408 return start;
2409
2410 if (end > self->length)
2411 end = self->length;
2412 if (end < 0)
2413 end += self->length;
2414 if (end < 0)
2415 end = 0;
2416
2417 end -= substring->length;
2418
2419 if (direction < 0) {
2420 for (; end >= start; end--)
2421 if (Py_UNICODE_MATCH(self, end, substring))
2422 return end;
2423 } else {
2424 for (; start <= end; start++)
2425 if (Py_UNICODE_MATCH(self, start, substring))
2426 return start;
2427 }
2428
2429 return -1;
2430}
2431
2432int PyUnicode_Find(PyObject *str,
2433 PyObject *substr,
2434 int start,
2435 int end,
2436 int direction)
2437{
2438 int result;
2439
2440 str = PyUnicode_FromObject(str);
2441 if (str == NULL)
2442 return -1;
2443 substr = PyUnicode_FromObject(substr);
2444 if (substr == NULL) {
2445 Py_DECREF(substr);
2446 return -1;
2447 }
2448
2449 result = findstring((PyUnicodeObject *)str,
2450 (PyUnicodeObject *)substr,
2451 start, end, direction);
2452 Py_DECREF(str);
2453 Py_DECREF(substr);
2454 return result;
2455}
2456
2457static
2458int tailmatch(PyUnicodeObject *self,
2459 PyUnicodeObject *substring,
2460 int start,
2461 int end,
2462 int direction)
2463{
2464 if (start < 0)
2465 start += self->length;
2466 if (start < 0)
2467 start = 0;
2468
2469 if (substring->length == 0)
2470 return 1;
2471
2472 if (end > self->length)
2473 end = self->length;
2474 if (end < 0)
2475 end += self->length;
2476 if (end < 0)
2477 end = 0;
2478
2479 end -= substring->length;
2480 if (end < start)
2481 return 0;
2482
2483 if (direction > 0) {
2484 if (Py_UNICODE_MATCH(self, end, substring))
2485 return 1;
2486 } else {
2487 if (Py_UNICODE_MATCH(self, start, substring))
2488 return 1;
2489 }
2490
2491 return 0;
2492}
2493
2494int PyUnicode_Tailmatch(PyObject *str,
2495 PyObject *substr,
2496 int start,
2497 int end,
2498 int direction)
2499{
2500 int result;
2501
2502 str = PyUnicode_FromObject(str);
2503 if (str == NULL)
2504 return -1;
2505 substr = PyUnicode_FromObject(substr);
2506 if (substr == NULL) {
2507 Py_DECREF(substr);
2508 return -1;
2509 }
2510
2511 result = tailmatch((PyUnicodeObject *)str,
2512 (PyUnicodeObject *)substr,
2513 start, end, direction);
2514 Py_DECREF(str);
2515 Py_DECREF(substr);
2516 return result;
2517}
2518
2519static
2520const Py_UNICODE *findchar(const Py_UNICODE *s,
2521 int size,
2522 Py_UNICODE ch)
2523{
2524 /* like wcschr, but doesn't stop at NULL characters */
2525
2526 while (size-- > 0) {
2527 if (*s == ch)
2528 return s;
2529 s++;
2530 }
2531
2532 return NULL;
2533}
2534
2535/* Apply fixfct filter to the Unicode object self and return a
2536 reference to the modified object */
2537
2538static
2539PyObject *fixup(PyUnicodeObject *self,
2540 int (*fixfct)(PyUnicodeObject *s))
2541{
2542
2543 PyUnicodeObject *u;
2544
2545 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2546 self->length);
2547 if (u == NULL)
2548 return NULL;
2549 if (!fixfct(u)) {
2550 /* fixfct should return TRUE if it modified the buffer. If
2551 FALSE, return a reference to the original buffer instead
2552 (to save space, not time) */
2553 Py_INCREF(self);
2554 Py_DECREF(u);
2555 return (PyObject*) self;
2556 }
2557 return (PyObject*) u;
2558}
2559
2560static
2561int fixupper(PyUnicodeObject *self)
2562{
2563 int len = self->length;
2564 Py_UNICODE *s = self->str;
2565 int status = 0;
2566
2567 while (len-- > 0) {
2568 register Py_UNICODE ch;
2569
2570 ch = Py_UNICODE_TOUPPER(*s);
2571 if (ch != *s) {
2572 status = 1;
2573 *s = ch;
2574 }
2575 s++;
2576 }
2577
2578 return status;
2579}
2580
2581static
2582int fixlower(PyUnicodeObject *self)
2583{
2584 int len = self->length;
2585 Py_UNICODE *s = self->str;
2586 int status = 0;
2587
2588 while (len-- > 0) {
2589 register Py_UNICODE ch;
2590
2591 ch = Py_UNICODE_TOLOWER(*s);
2592 if (ch != *s) {
2593 status = 1;
2594 *s = ch;
2595 }
2596 s++;
2597 }
2598
2599 return status;
2600}
2601
2602static
2603int fixswapcase(PyUnicodeObject *self)
2604{
2605 int len = self->length;
2606 Py_UNICODE *s = self->str;
2607 int status = 0;
2608
2609 while (len-- > 0) {
2610 if (Py_UNICODE_ISUPPER(*s)) {
2611 *s = Py_UNICODE_TOLOWER(*s);
2612 status = 1;
2613 } else if (Py_UNICODE_ISLOWER(*s)) {
2614 *s = Py_UNICODE_TOUPPER(*s);
2615 status = 1;
2616 }
2617 s++;
2618 }
2619
2620 return status;
2621}
2622
2623static
2624int fixcapitalize(PyUnicodeObject *self)
2625{
2626 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2627 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2628 return 1;
2629 }
2630 return 0;
2631}
2632
2633static
2634int fixtitle(PyUnicodeObject *self)
2635{
2636 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2637 register Py_UNICODE *e;
2638 int previous_is_cased;
2639
2640 /* Shortcut for single character strings */
2641 if (PyUnicode_GET_SIZE(self) == 1) {
2642 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2643 if (*p != ch) {
2644 *p = ch;
2645 return 1;
2646 }
2647 else
2648 return 0;
2649 }
2650
2651 e = p + PyUnicode_GET_SIZE(self);
2652 previous_is_cased = 0;
2653 for (; p < e; p++) {
2654 register const Py_UNICODE ch = *p;
2655
2656 if (previous_is_cased)
2657 *p = Py_UNICODE_TOLOWER(ch);
2658 else
2659 *p = Py_UNICODE_TOTITLE(ch);
2660
2661 if (Py_UNICODE_ISLOWER(ch) ||
2662 Py_UNICODE_ISUPPER(ch) ||
2663 Py_UNICODE_ISTITLE(ch))
2664 previous_is_cased = 1;
2665 else
2666 previous_is_cased = 0;
2667 }
2668 return 1;
2669}
2670
2671PyObject *PyUnicode_Join(PyObject *separator,
2672 PyObject *seq)
2673{
2674 Py_UNICODE *sep;
2675 int seplen;
2676 PyUnicodeObject *res = NULL;
2677 int reslen = 0;
2678 Py_UNICODE *p;
2679 int seqlen = 0;
2680 int sz = 100;
2681 int i;
2682
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002683 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (seqlen < 0 && PyErr_Occurred())
2685 return NULL;
2686
2687 if (separator == NULL) {
2688 Py_UNICODE blank = ' ';
2689 sep = &blank;
2690 seplen = 1;
2691 }
2692 else {
2693 separator = PyUnicode_FromObject(separator);
2694 if (separator == NULL)
2695 return NULL;
2696 sep = PyUnicode_AS_UNICODE(separator);
2697 seplen = PyUnicode_GET_SIZE(separator);
2698 }
2699
2700 res = _PyUnicode_New(sz);
2701 if (res == NULL)
2702 goto onError;
2703 p = PyUnicode_AS_UNICODE(res);
2704 reslen = 0;
2705
2706 for (i = 0; i < seqlen; i++) {
2707 int itemlen;
2708 PyObject *item;
2709
2710 item = PySequence_GetItem(seq, i);
2711 if (item == NULL)
2712 goto onError;
2713 if (!PyUnicode_Check(item)) {
2714 PyObject *v;
2715 v = PyUnicode_FromObject(item);
2716 Py_DECREF(item);
2717 item = v;
2718 if (item == NULL)
2719 goto onError;
2720 }
2721 itemlen = PyUnicode_GET_SIZE(item);
2722 while (reslen + itemlen + seplen >= sz) {
2723 if (_PyUnicode_Resize(res, sz*2))
2724 goto onError;
2725 sz *= 2;
2726 p = PyUnicode_AS_UNICODE(res) + reslen;
2727 }
2728 if (i > 0) {
2729 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2730 p += seplen;
2731 reslen += seplen;
2732 }
2733 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2734 p += itemlen;
2735 reslen += itemlen;
2736 Py_DECREF(item);
2737 }
2738 if (_PyUnicode_Resize(res, reslen))
2739 goto onError;
2740
2741 Py_XDECREF(separator);
2742 return (PyObject *)res;
2743
2744 onError:
2745 Py_XDECREF(separator);
2746 Py_DECREF(res);
2747 return NULL;
2748}
2749
2750static
2751PyUnicodeObject *pad(PyUnicodeObject *self,
2752 int left,
2753 int right,
2754 Py_UNICODE fill)
2755{
2756 PyUnicodeObject *u;
2757
2758 if (left < 0)
2759 left = 0;
2760 if (right < 0)
2761 right = 0;
2762
2763 if (left == 0 && right == 0) {
2764 Py_INCREF(self);
2765 return self;
2766 }
2767
2768 u = _PyUnicode_New(left + self->length + right);
2769 if (u) {
2770 if (left)
2771 Py_UNICODE_FILL(u->str, fill, left);
2772 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2773 if (right)
2774 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2775 }
2776
2777 return u;
2778}
2779
/* Append data[left:right] as a new Unicode object to list.

   Expects the surrounding function to provide the variables str and
   list as well as an onError label, which is jumped to if creating or
   appending the slice fails.  Use it as a statement, followed by a
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2790
2791static
2792PyObject *split_whitespace(PyUnicodeObject *self,
2793 PyObject *list,
2794 int maxcount)
2795{
2796 register int i;
2797 register int j;
2798 int len = self->length;
2799 PyObject *str;
2800
2801 for (i = j = 0; i < len; ) {
2802 /* find a token */
2803 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2804 i++;
2805 j = i;
2806 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2807 i++;
2808 if (j < i) {
2809 if (maxcount-- <= 0)
2810 break;
2811 SPLIT_APPEND(self->str, j, i);
2812 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2813 i++;
2814 j = i;
2815 }
2816 }
2817 if (j < len) {
2818 SPLIT_APPEND(self->str, j, len);
2819 }
2820 return list;
2821
2822 onError:
2823 Py_DECREF(list);
2824 return NULL;
2825}
2826
2827PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002828 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829{
2830 register int i;
2831 register int j;
2832 int len;
2833 PyObject *list;
2834 PyObject *str;
2835 Py_UNICODE *data;
2836
2837 string = PyUnicode_FromObject(string);
2838 if (string == NULL)
2839 return NULL;
2840 data = PyUnicode_AS_UNICODE(string);
2841 len = PyUnicode_GET_SIZE(string);
2842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 list = PyList_New(0);
2844 if (!list)
2845 goto onError;
2846
2847 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002848 int eol;
2849
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 /* Find a line and append it */
2851 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2852 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853
2854 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002855 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 if (i < len) {
2857 if (data[i] == '\r' && i + 1 < len &&
2858 data[i+1] == '\n')
2859 i += 2;
2860 else
2861 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002862 if (keepends)
2863 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 }
Guido van Rossum86662912000-04-11 15:38:46 +00002865 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 j = i;
2867 }
2868 if (j < len) {
2869 SPLIT_APPEND(data, j, len);
2870 }
2871
2872 Py_DECREF(string);
2873 return list;
2874
2875 onError:
2876 Py_DECREF(list);
2877 Py_DECREF(string);
2878 return NULL;
2879}
2880
2881static
2882PyObject *split_char(PyUnicodeObject *self,
2883 PyObject *list,
2884 Py_UNICODE ch,
2885 int maxcount)
2886{
2887 register int i;
2888 register int j;
2889 int len = self->length;
2890 PyObject *str;
2891
2892 for (i = j = 0; i < len; ) {
2893 if (self->str[i] == ch) {
2894 if (maxcount-- <= 0)
2895 break;
2896 SPLIT_APPEND(self->str, j, i);
2897 i = j = i + 1;
2898 } else
2899 i++;
2900 }
2901 if (j <= len) {
2902 SPLIT_APPEND(self->str, j, len);
2903 }
2904 return list;
2905
2906 onError:
2907 Py_DECREF(list);
2908 return NULL;
2909}
2910
2911static
2912PyObject *split_substring(PyUnicodeObject *self,
2913 PyObject *list,
2914 PyUnicodeObject *substring,
2915 int maxcount)
2916{
2917 register int i;
2918 register int j;
2919 int len = self->length;
2920 int sublen = substring->length;
2921 PyObject *str;
2922
2923 for (i = j = 0; i < len - sublen; ) {
2924 if (Py_UNICODE_MATCH(self, i, substring)) {
2925 if (maxcount-- <= 0)
2926 break;
2927 SPLIT_APPEND(self->str, j, i);
2928 i = j = i + sublen;
2929 } else
2930 i++;
2931 }
2932 if (j <= len) {
2933 SPLIT_APPEND(self->str, j, len);
2934 }
2935 return list;
2936
2937 onError:
2938 Py_DECREF(list);
2939 return NULL;
2940}
2941
2942#undef SPLIT_APPEND
2943
2944static
2945PyObject *split(PyUnicodeObject *self,
2946 PyUnicodeObject *substring,
2947 int maxcount)
2948{
2949 PyObject *list;
2950
2951 if (maxcount < 0)
2952 maxcount = INT_MAX;
2953
2954 list = PyList_New(0);
2955 if (!list)
2956 return NULL;
2957
2958 if (substring == NULL)
2959 return split_whitespace(self,list,maxcount);
2960
2961 else if (substring->length == 1)
2962 return split_char(self,list,substring->str[0],maxcount);
2963
2964 else if (substring->length == 0) {
2965 Py_DECREF(list);
2966 PyErr_SetString(PyExc_ValueError, "empty separator");
2967 return NULL;
2968 }
2969 else
2970 return split_substring(self,list,substring,maxcount);
2971}
2972
2973static
2974PyObject *strip(PyUnicodeObject *self,
2975 int left,
2976 int right)
2977{
2978 Py_UNICODE *p = self->str;
2979 int start = 0;
2980 int end = self->length;
2981
2982 if (left)
2983 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2984 start++;
2985
2986 if (right)
2987 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2988 end--;
2989
2990 if (start == 0 && end == self->length) {
2991 /* couldn't strip anything off, return original string */
2992 Py_INCREF(self);
2993 return (PyObject*) self;
2994 }
2995
2996 return (PyObject*) PyUnicode_FromUnicode(
2997 self->str + start,
2998 end - start
2999 );
3000}
3001
3002static
3003PyObject *replace(PyUnicodeObject *self,
3004 PyUnicodeObject *str1,
3005 PyUnicodeObject *str2,
3006 int maxcount)
3007{
3008 PyUnicodeObject *u;
3009
3010 if (maxcount < 0)
3011 maxcount = INT_MAX;
3012
3013 if (str1->length == 1 && str2->length == 1) {
3014 int i;
3015
3016 /* replace characters */
3017 if (!findchar(self->str, self->length, str1->str[0])) {
3018 /* nothing to replace, return original string */
3019 Py_INCREF(self);
3020 u = self;
3021 } else {
3022 Py_UNICODE u1 = str1->str[0];
3023 Py_UNICODE u2 = str2->str[0];
3024
3025 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3026 self->str,
3027 self->length
3028 );
3029 if (u)
3030 for (i = 0; i < u->length; i++)
3031 if (u->str[i] == u1) {
3032 if (--maxcount < 0)
3033 break;
3034 u->str[i] = u2;
3035 }
3036 }
3037
3038 } else {
3039 int n, i;
3040 Py_UNICODE *p;
3041
3042 /* replace strings */
3043 n = count(self, 0, self->length, str1);
3044 if (n > maxcount)
3045 n = maxcount;
3046 if (n == 0) {
3047 /* nothing to replace, return original string */
3048 Py_INCREF(self);
3049 u = self;
3050 } else {
3051 u = _PyUnicode_New(
3052 self->length + n * (str2->length - str1->length));
3053 if (u) {
3054 i = 0;
3055 p = u->str;
3056 while (i <= self->length - str1->length)
3057 if (Py_UNICODE_MATCH(self, i, str1)) {
3058 /* replace string segment */
3059 Py_UNICODE_COPY(p, str2->str, str2->length);
3060 p += str2->length;
3061 i += str1->length;
3062 if (--n <= 0) {
3063 /* copy remaining part */
3064 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3065 break;
3066 }
3067 } else
3068 *p++ = self->str[i++];
3069 }
3070 }
3071 }
3072
3073 return (PyObject *) u;
3074}
3075
3076/* --- Unicode Object Methods --------------------------------------------- */
3077
3078static char title__doc__[] =
3079"S.title() -> unicode\n\
3080\n\
3081Return a titlecased version of S, i.e. words start with title case\n\
3082characters, all remaining cased characters have lower case.";
3083
3084static PyObject*
3085unicode_title(PyUnicodeObject *self, PyObject *args)
3086{
3087 if (!PyArg_NoArgs(args))
3088 return NULL;
3089 return fixup(self, fixtitle);
3090}
3091
3092static char capitalize__doc__[] =
3093"S.capitalize() -> unicode\n\
3094\n\
3095Return a capitalized version of S, i.e. make the first character\n\
3096have upper case.";
3097
3098static PyObject*
3099unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3100{
3101 if (!PyArg_NoArgs(args))
3102 return NULL;
3103 return fixup(self, fixcapitalize);
3104}
3105
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split S at whitespace,
   capitalize every word and join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3146
3147static char center__doc__[] =
3148"S.center(width) -> unicode\n\
3149\n\
3150Return S centered in a Unicode string of length width. Padding is done\n\
3151using spaces.";
3152
3153static PyObject *
3154unicode_center(PyUnicodeObject *self, PyObject *args)
3155{
3156 int marg, left;
3157 int width;
3158
3159 if (!PyArg_ParseTuple(args, "i:center", &width))
3160 return NULL;
3161
3162 if (self->length >= width) {
3163 Py_INCREF(self);
3164 return (PyObject*) self;
3165 }
3166
3167 marg = width - self->length;
3168 left = marg / 2 + (marg & width & 1);
3169
3170 return (PyObject*) pad(self, left, marg - left, ' ');
3171}
3172
Marc-André Lemburge5034372000-08-08 08:04:29 +00003173#if 0
3174
3175/* This code should go into some future Unicode collation support
3176 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003177 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003178
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003179/* speedy UTF-16 code point order comparison */
3180/* gleaned from: */
3181/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3182
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003183static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003184{
3185 0, 0, 0, 0, 0, 0, 0, 0,
3186 0, 0, 0, 0, 0, 0, 0, 0,
3187 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003188 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003189};
3190
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191static int
3192unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3193{
3194 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003195
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 Py_UNICODE *s1 = str1->str;
3197 Py_UNICODE *s2 = str2->str;
3198
3199 len1 = str1->length;
3200 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003201
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003203 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003204 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003205
3206 c1 = *s1++;
3207 c2 = *s2++;
3208 if (c1 > (1<<11) * 26)
3209 c1 += utf16Fixup[c1>>11];
3210 if (c2 > (1<<11) * 26)
3211 c2 += utf16Fixup[c2>>11];
3212
3213 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003214 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003215 if (diff)
3216 return (diff < 0) ? -1 : (diff != 0);
3217 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 }
3219
3220 return (len1 < len2) ? -1 : (len1 != len2);
3221}
3222
Marc-André Lemburge5034372000-08-08 08:04:29 +00003223#else
3224
3225static int
3226unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3227{
3228 register int len1, len2;
3229
3230 Py_UNICODE *s1 = str1->str;
3231 Py_UNICODE *s2 = str2->str;
3232
3233 len1 = str1->length;
3234 len2 = str2->length;
3235
3236 while (len1 > 0 && len2 > 0) {
3237 register long diff;
3238
3239 diff = (long)*s1++ - (long)*s2++;
3240 if (diff)
3241 return (diff < 0) ? -1 : (diff != 0);
3242 len1--; len2--;
3243 }
3244
3245 return (len1 < len2) ? -1 : (len1 != len2);
3246}
3247
3248#endif
3249
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250int PyUnicode_Compare(PyObject *left,
3251 PyObject *right)
3252{
3253 PyUnicodeObject *u = NULL, *v = NULL;
3254 int result;
3255
3256 /* Coerce the two arguments */
3257 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3258 if (u == NULL)
3259 goto onError;
3260 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3261 if (v == NULL)
3262 goto onError;
3263
Thomas Wouters7e474022000-07-16 12:04:32 +00003264 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 if (v == u) {
3266 Py_DECREF(u);
3267 Py_DECREF(v);
3268 return 0;
3269 }
3270
3271 result = unicode_compare(u, v);
3272
3273 Py_DECREF(u);
3274 Py_DECREF(v);
3275 return result;
3276
3277onError:
3278 Py_XDECREF(u);
3279 Py_XDECREF(v);
3280 return -1;
3281}
3282
Guido van Rossum403d68b2000-03-13 15:55:09 +00003283int PyUnicode_Contains(PyObject *container,
3284 PyObject *element)
3285{
3286 PyUnicodeObject *u = NULL, *v = NULL;
3287 int result;
3288 register const Py_UNICODE *p, *e;
3289 register Py_UNICODE ch;
3290
3291 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003292 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003293 if (v == NULL) {
3294 PyErr_SetString(PyExc_TypeError,
3295 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003296 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003297 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003298 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3299 if (u == NULL) {
3300 Py_DECREF(v);
3301 goto onError;
3302 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003303
3304 /* Check v in u */
3305 if (PyUnicode_GET_SIZE(v) != 1) {
3306 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003307 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003308 goto onError;
3309 }
3310 ch = *PyUnicode_AS_UNICODE(v);
3311 p = PyUnicode_AS_UNICODE(u);
3312 e = p + PyUnicode_GET_SIZE(u);
3313 result = 0;
3314 while (p < e) {
3315 if (*p++ == ch) {
3316 result = 1;
3317 break;
3318 }
3319 }
3320
3321 Py_DECREF(u);
3322 Py_DECREF(v);
3323 return result;
3324
3325onError:
3326 Py_XDECREF(u);
3327 Py_XDECREF(v);
3328 return -1;
3329}
3330
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331/* Concat to string or Unicode object giving a new Unicode object. */
3332
3333PyObject *PyUnicode_Concat(PyObject *left,
3334 PyObject *right)
3335{
3336 PyUnicodeObject *u = NULL, *v = NULL, *w;
3337
3338 /* Coerce the two arguments */
3339 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3340 if (u == NULL)
3341 goto onError;
3342 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3343 if (v == NULL)
3344 goto onError;
3345
3346 /* Shortcuts */
3347 if (v == unicode_empty) {
3348 Py_DECREF(v);
3349 return (PyObject *)u;
3350 }
3351 if (u == unicode_empty) {
3352 Py_DECREF(u);
3353 return (PyObject *)v;
3354 }
3355
3356 /* Concat the two Unicode strings */
3357 w = _PyUnicode_New(u->length + v->length);
3358 if (w == NULL)
3359 goto onError;
3360 Py_UNICODE_COPY(w->str, u->str, u->length);
3361 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3362
3363 Py_DECREF(u);
3364 Py_DECREF(v);
3365 return (PyObject *)w;
3366
3367onError:
3368 Py_XDECREF(u);
3369 Py_XDECREF(v);
3370 return NULL;
3371}
3372
3373static char count__doc__[] =
3374"S.count(sub[, start[, end]]) -> int\n\
3375\n\
3376Return the number of occurrences of substring sub in Unicode string\n\
3377S[start:end]. Optional arguments start and end are\n\
3378interpreted as in slice notation.";
3379
3380static PyObject *
3381unicode_count(PyUnicodeObject *self, PyObject *args)
3382{
3383 PyUnicodeObject *substring;
3384 int start = 0;
3385 int end = INT_MAX;
3386 PyObject *result;
3387
Guido van Rossumb8872e62000-05-09 14:14:27 +00003388 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3389 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 return NULL;
3391
3392 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3393 (PyObject *)substring);
3394 if (substring == NULL)
3395 return NULL;
3396
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 if (start < 0)
3398 start += self->length;
3399 if (start < 0)
3400 start = 0;
3401 if (end > self->length)
3402 end = self->length;
3403 if (end < 0)
3404 end += self->length;
3405 if (end < 0)
3406 end = 0;
3407
3408 result = PyInt_FromLong((long) count(self, start, end, substring));
3409
3410 Py_DECREF(substring);
3411 return result;
3412}
3413
3414static char encode__doc__[] =
3415"S.encode([encoding[,errors]]) -> string\n\
3416\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003417Return an encoded string version of S. Default encoding is the current\n\
3418default string encoding. errors may be given to set a different error\n\
3419handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3420a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421
3422static PyObject *
3423unicode_encode(PyUnicodeObject *self, PyObject *args)
3424{
3425 char *encoding = NULL;
3426 char *errors = NULL;
3427 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3428 return NULL;
3429 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3430}
3431
3432static char expandtabs__doc__[] =
3433"S.expandtabs([tabsize]) -> unicode\n\
3434\n\
3435Return a copy of S where all tab characters are expanded using spaces.\n\
3436If tabsize is not given, a tab size of 8 characters is assumed.";
3437
3438static PyObject*
3439unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3440{
3441 Py_UNICODE *e;
3442 Py_UNICODE *p;
3443 Py_UNICODE *q;
3444 int i, j;
3445 PyUnicodeObject *u;
3446 int tabsize = 8;
3447
3448 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3449 return NULL;
3450
Thomas Wouters7e474022000-07-16 12:04:32 +00003451 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452 i = j = 0;
3453 e = self->str + self->length;
3454 for (p = self->str; p < e; p++)
3455 if (*p == '\t') {
3456 if (tabsize > 0)
3457 j += tabsize - (j % tabsize);
3458 }
3459 else {
3460 j++;
3461 if (*p == '\n' || *p == '\r') {
3462 i += j;
3463 j = 0;
3464 }
3465 }
3466
3467 /* Second pass: create output string and fill it */
3468 u = _PyUnicode_New(i + j);
3469 if (!u)
3470 return NULL;
3471
3472 j = 0;
3473 q = u->str;
3474
3475 for (p = self->str; p < e; p++)
3476 if (*p == '\t') {
3477 if (tabsize > 0) {
3478 i = tabsize - (j % tabsize);
3479 j += i;
3480 while (i--)
3481 *q++ = ' ';
3482 }
3483 }
3484 else {
3485 j++;
3486 *q++ = *p;
3487 if (*p == '\n' || *p == '\r')
3488 j = 0;
3489 }
3490
3491 return (PyObject*) u;
3492}
3493
3494static char find__doc__[] =
3495"S.find(sub [,start [,end]]) -> int\n\
3496\n\
3497Return the lowest index in S where substring sub is found,\n\
3498such that sub is contained within s[start,end]. Optional\n\
3499arguments start and end are interpreted as in slice notation.\n\
3500\n\
3501Return -1 on failure.";
3502
3503static PyObject *
3504unicode_find(PyUnicodeObject *self, PyObject *args)
3505{
3506 PyUnicodeObject *substring;
3507 int start = 0;
3508 int end = INT_MAX;
3509 PyObject *result;
3510
Guido van Rossumb8872e62000-05-09 14:14:27 +00003511 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3512 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 return NULL;
3514 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3515 (PyObject *)substring);
3516 if (substring == NULL)
3517 return NULL;
3518
3519 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3520
3521 Py_DECREF(substring);
3522 return result;
3523}
3524
3525static PyObject *
3526unicode_getitem(PyUnicodeObject *self, int index)
3527{
3528 if (index < 0 || index >= self->length) {
3529 PyErr_SetString(PyExc_IndexError, "string index out of range");
3530 return NULL;
3531 }
3532
3533 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3534}
3535
3536static long
3537unicode_hash(PyUnicodeObject *self)
3538{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003539 /* Since Unicode objects compare equal to their ASCII string
3540 counterparts, they should use the individual character values
3541 as basis for their hash value. This is needed to assure that
3542 strings and Unicode objects behave in the same way as
3543 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544
Fredrik Lundhdde61642000-07-10 18:27:47 +00003545 register int len;
3546 register Py_UNICODE *p;
3547 register long x;
3548
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 if (self->hash != -1)
3550 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003551 len = PyUnicode_GET_SIZE(self);
3552 p = PyUnicode_AS_UNICODE(self);
3553 x = *p << 7;
3554 while (--len >= 0)
3555 x = (1000003*x) ^ *p++;
3556 x ^= PyUnicode_GET_SIZE(self);
3557 if (x == -1)
3558 x = -2;
3559 self->hash = x;
3560 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561}
3562
3563static char index__doc__[] =
3564"S.index(sub [,start [,end]]) -> int\n\
3565\n\
3566Like S.find() but raise ValueError when the substring is not found.";
3567
3568static PyObject *
3569unicode_index(PyUnicodeObject *self, PyObject *args)
3570{
3571 int result;
3572 PyUnicodeObject *substring;
3573 int start = 0;
3574 int end = INT_MAX;
3575
Guido van Rossumb8872e62000-05-09 14:14:27 +00003576 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3577 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 return NULL;
3579
3580 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3581 (PyObject *)substring);
3582 if (substring == NULL)
3583 return NULL;
3584
3585 result = findstring(self, substring, start, end, 1);
3586
3587 Py_DECREF(substring);
3588 if (result < 0) {
3589 PyErr_SetString(PyExc_ValueError, "substring not found");
3590 return NULL;
3591 }
3592 return PyInt_FromLong(result);
3593}
3594
3595static char islower__doc__[] =
3596"S.islower() -> int\n\
3597\n\
3598Return 1 if all cased characters in S are lowercase and there is\n\
3599at least one cased character in S, 0 otherwise.";
3600
3601static PyObject*
3602unicode_islower(PyUnicodeObject *self, PyObject *args)
3603{
3604 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3605 register const Py_UNICODE *e;
3606 int cased;
3607
3608 if (!PyArg_NoArgs(args))
3609 return NULL;
3610
3611 /* Shortcut for single character strings */
3612 if (PyUnicode_GET_SIZE(self) == 1)
3613 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3614
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003615 /* Special case for empty strings */
3616 if (PyString_GET_SIZE(self) == 0)
3617 return PyInt_FromLong(0);
3618
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 e = p + PyUnicode_GET_SIZE(self);
3620 cased = 0;
3621 for (; p < e; p++) {
3622 register const Py_UNICODE ch = *p;
3623
3624 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3625 return PyInt_FromLong(0);
3626 else if (!cased && Py_UNICODE_ISLOWER(ch))
3627 cased = 1;
3628 }
3629 return PyInt_FromLong(cased);
3630}
3631
3632static char isupper__doc__[] =
3633"S.isupper() -> int\n\
3634\n\
3635Return 1 if all cased characters in S are uppercase and there is\n\
3636at least one cased character in S, 0 otherwise.";
3637
3638static PyObject*
3639unicode_isupper(PyUnicodeObject *self, PyObject *args)
3640{
3641 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3642 register const Py_UNICODE *e;
3643 int cased;
3644
3645 if (!PyArg_NoArgs(args))
3646 return NULL;
3647
3648 /* Shortcut for single character strings */
3649 if (PyUnicode_GET_SIZE(self) == 1)
3650 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3651
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003652 /* Special case for empty strings */
3653 if (PyString_GET_SIZE(self) == 0)
3654 return PyInt_FromLong(0);
3655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 e = p + PyUnicode_GET_SIZE(self);
3657 cased = 0;
3658 for (; p < e; p++) {
3659 register const Py_UNICODE ch = *p;
3660
3661 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3662 return PyInt_FromLong(0);
3663 else if (!cased && Py_UNICODE_ISUPPER(ch))
3664 cased = 1;
3665 }
3666 return PyInt_FromLong(cased);
3667}
3668
3669static char istitle__doc__[] =
3670"S.istitle() -> int\n\
3671\n\
3672Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3673may only follow uncased characters and lowercase characters only cased\n\
3674ones. Return 0 otherwise.";
3675
3676static PyObject*
3677unicode_istitle(PyUnicodeObject *self, PyObject *args)
3678{
3679 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3680 register const Py_UNICODE *e;
3681 int cased, previous_is_cased;
3682
3683 if (!PyArg_NoArgs(args))
3684 return NULL;
3685
3686 /* Shortcut for single character strings */
3687 if (PyUnicode_GET_SIZE(self) == 1)
3688 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3689 (Py_UNICODE_ISUPPER(*p) != 0));
3690
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003691 /* Special case for empty strings */
3692 if (PyString_GET_SIZE(self) == 0)
3693 return PyInt_FromLong(0);
3694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 e = p + PyUnicode_GET_SIZE(self);
3696 cased = 0;
3697 previous_is_cased = 0;
3698 for (; p < e; p++) {
3699 register const Py_UNICODE ch = *p;
3700
3701 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3702 if (previous_is_cased)
3703 return PyInt_FromLong(0);
3704 previous_is_cased = 1;
3705 cased = 1;
3706 }
3707 else if (Py_UNICODE_ISLOWER(ch)) {
3708 if (!previous_is_cased)
3709 return PyInt_FromLong(0);
3710 previous_is_cased = 1;
3711 cased = 1;
3712 }
3713 else
3714 previous_is_cased = 0;
3715 }
3716 return PyInt_FromLong(cased);
3717}
3718
3719static char isspace__doc__[] =
3720"S.isspace() -> int\n\
3721\n\
3722Return 1 if there are only whitespace characters in S,\n\
37230 otherwise.";
3724
3725static PyObject*
3726unicode_isspace(PyUnicodeObject *self, PyObject *args)
3727{
3728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3729 register const Py_UNICODE *e;
3730
3731 if (!PyArg_NoArgs(args))
3732 return NULL;
3733
3734 /* Shortcut for single character strings */
3735 if (PyUnicode_GET_SIZE(self) == 1 &&
3736 Py_UNICODE_ISSPACE(*p))
3737 return PyInt_FromLong(1);
3738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003739 /* Special case for empty strings */
3740 if (PyString_GET_SIZE(self) == 0)
3741 return PyInt_FromLong(0);
3742
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 e = p + PyUnicode_GET_SIZE(self);
3744 for (; p < e; p++) {
3745 if (!Py_UNICODE_ISSPACE(*p))
3746 return PyInt_FromLong(0);
3747 }
3748 return PyInt_FromLong(1);
3749}
3750
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003751static char isalpha__doc__[] =
3752"S.isalpha() -> int\n\
3753\n\
3754Return 1 if all characters in S are alphabetic\n\
3755and there is at least one character in S, 0 otherwise.";
3756
3757static PyObject*
3758unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3759{
3760 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3761 register const Py_UNICODE *e;
3762
3763 if (!PyArg_NoArgs(args))
3764 return NULL;
3765
3766 /* Shortcut for single character strings */
3767 if (PyUnicode_GET_SIZE(self) == 1 &&
3768 Py_UNICODE_ISALPHA(*p))
3769 return PyInt_FromLong(1);
3770
3771 /* Special case for empty strings */
3772 if (PyString_GET_SIZE(self) == 0)
3773 return PyInt_FromLong(0);
3774
3775 e = p + PyUnicode_GET_SIZE(self);
3776 for (; p < e; p++) {
3777 if (!Py_UNICODE_ISALPHA(*p))
3778 return PyInt_FromLong(0);
3779 }
3780 return PyInt_FromLong(1);
3781}
3782
3783static char isalnum__doc__[] =
3784"S.isalnum() -> int\n\
3785\n\
3786Return 1 if all characters in S are alphanumeric\n\
3787and there is at least one character in S, 0 otherwise.";
3788
3789static PyObject*
3790unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3791{
3792 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3793 register const Py_UNICODE *e;
3794
3795 if (!PyArg_NoArgs(args))
3796 return NULL;
3797
3798 /* Shortcut for single character strings */
3799 if (PyUnicode_GET_SIZE(self) == 1 &&
3800 Py_UNICODE_ISALNUM(*p))
3801 return PyInt_FromLong(1);
3802
3803 /* Special case for empty strings */
3804 if (PyString_GET_SIZE(self) == 0)
3805 return PyInt_FromLong(0);
3806
3807 e = p + PyUnicode_GET_SIZE(self);
3808 for (; p < e; p++) {
3809 if (!Py_UNICODE_ISALNUM(*p))
3810 return PyInt_FromLong(0);
3811 }
3812 return PyInt_FromLong(1);
3813}
3814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815static char isdecimal__doc__[] =
3816"S.isdecimal() -> int\n\
3817\n\
3818Return 1 if there are only decimal characters in S,\n\
38190 otherwise.";
3820
3821static PyObject*
3822unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3823{
3824 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3825 register const Py_UNICODE *e;
3826
3827 if (!PyArg_NoArgs(args))
3828 return NULL;
3829
3830 /* Shortcut for single character strings */
3831 if (PyUnicode_GET_SIZE(self) == 1 &&
3832 Py_UNICODE_ISDECIMAL(*p))
3833 return PyInt_FromLong(1);
3834
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003835 /* Special case for empty strings */
3836 if (PyString_GET_SIZE(self) == 0)
3837 return PyInt_FromLong(0);
3838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 e = p + PyUnicode_GET_SIZE(self);
3840 for (; p < e; p++) {
3841 if (!Py_UNICODE_ISDECIMAL(*p))
3842 return PyInt_FromLong(0);
3843 }
3844 return PyInt_FromLong(1);
3845}
3846
3847static char isdigit__doc__[] =
3848"S.isdigit() -> int\n\
3849\n\
3850Return 1 if there are only digit characters in S,\n\
38510 otherwise.";
3852
3853static PyObject*
3854unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3855{
3856 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3857 register const Py_UNICODE *e;
3858
3859 if (!PyArg_NoArgs(args))
3860 return NULL;
3861
3862 /* Shortcut for single character strings */
3863 if (PyUnicode_GET_SIZE(self) == 1 &&
3864 Py_UNICODE_ISDIGIT(*p))
3865 return PyInt_FromLong(1);
3866
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003867 /* Special case for empty strings */
3868 if (PyString_GET_SIZE(self) == 0)
3869 return PyInt_FromLong(0);
3870
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 e = p + PyUnicode_GET_SIZE(self);
3872 for (; p < e; p++) {
3873 if (!Py_UNICODE_ISDIGIT(*p))
3874 return PyInt_FromLong(0);
3875 }
3876 return PyInt_FromLong(1);
3877}
3878
3879static char isnumeric__doc__[] =
3880"S.isnumeric() -> int\n\
3881\n\
3882Return 1 if there are only numeric characters in S,\n\
38830 otherwise.";
3884
3885static PyObject*
3886unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3887{
3888 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3889 register const Py_UNICODE *e;
3890
3891 if (!PyArg_NoArgs(args))
3892 return NULL;
3893
3894 /* Shortcut for single character strings */
3895 if (PyUnicode_GET_SIZE(self) == 1 &&
3896 Py_UNICODE_ISNUMERIC(*p))
3897 return PyInt_FromLong(1);
3898
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003899 /* Special case for empty strings */
3900 if (PyString_GET_SIZE(self) == 0)
3901 return PyInt_FromLong(0);
3902
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 e = p + PyUnicode_GET_SIZE(self);
3904 for (; p < e; p++) {
3905 if (!Py_UNICODE_ISNUMERIC(*p))
3906 return PyInt_FromLong(0);
3907 }
3908 return PyInt_FromLong(1);
3909}
3910
3911static char join__doc__[] =
3912"S.join(sequence) -> unicode\n\
3913\n\
3914Return a string which is the concatenation of the strings in the\n\
3915sequence. The separator between elements is S.";
3916
3917static PyObject*
3918unicode_join(PyUnicodeObject *self, PyObject *args)
3919{
3920 PyObject *data;
3921 if (!PyArg_ParseTuple(args, "O:join", &data))
3922 return NULL;
3923
3924 return PyUnicode_Join((PyObject *)self, data);
3925}
3926
3927static int
3928unicode_length(PyUnicodeObject *self)
3929{
3930 return self->length;
3931}
3932
3933static char ljust__doc__[] =
3934"S.ljust(width) -> unicode\n\
3935\n\
3936Return S left justified in a Unicode string of length width. Padding is\n\
3937done using spaces.";
3938
3939static PyObject *
3940unicode_ljust(PyUnicodeObject *self, PyObject *args)
3941{
3942 int width;
3943 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3944 return NULL;
3945
3946 if (self->length >= width) {
3947 Py_INCREF(self);
3948 return (PyObject*) self;
3949 }
3950
3951 return (PyObject*) pad(self, 0, width - self->length, ' ');
3952}
3953
3954static char lower__doc__[] =
3955"S.lower() -> unicode\n\
3956\n\
3957Return a copy of the string S converted to lowercase.";
3958
3959static PyObject*
3960unicode_lower(PyUnicodeObject *self, PyObject *args)
3961{
3962 if (!PyArg_NoArgs(args))
3963 return NULL;
3964 return fixup(self, fixlower);
3965}
3966
3967static char lstrip__doc__[] =
3968"S.lstrip() -> unicode\n\
3969\n\
3970Return a copy of the string S with leading whitespace removed.";
3971
3972static PyObject *
3973unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3974{
3975 if (!PyArg_NoArgs(args))
3976 return NULL;
3977 return strip(self, 1, 0);
3978}
3979
3980static PyObject*
3981unicode_repeat(PyUnicodeObject *str, int len)
3982{
3983 PyUnicodeObject *u;
3984 Py_UNICODE *p;
3985
3986 if (len < 0)
3987 len = 0;
3988
3989 if (len == 1) {
3990 /* no repeat, return original string */
3991 Py_INCREF(str);
3992 return (PyObject*) str;
3993 }
3994
3995 u = _PyUnicode_New(len * str->length);
3996 if (!u)
3997 return NULL;
3998
3999 p = u->str;
4000
4001 while (len-- > 0) {
4002 Py_UNICODE_COPY(p, str->str, str->length);
4003 p += str->length;
4004 }
4005
4006 return (PyObject*) u;
4007}
4008
4009PyObject *PyUnicode_Replace(PyObject *obj,
4010 PyObject *subobj,
4011 PyObject *replobj,
4012 int maxcount)
4013{
4014 PyObject *self;
4015 PyObject *str1;
4016 PyObject *str2;
4017 PyObject *result;
4018
4019 self = PyUnicode_FromObject(obj);
4020 if (self == NULL)
4021 return NULL;
4022 str1 = PyUnicode_FromObject(subobj);
4023 if (str1 == NULL) {
4024 Py_DECREF(self);
4025 return NULL;
4026 }
4027 str2 = PyUnicode_FromObject(replobj);
4028 if (str2 == NULL) {
4029 Py_DECREF(self);
4030 Py_DECREF(str1);
4031 return NULL;
4032 }
4033 result = replace((PyUnicodeObject *)self,
4034 (PyUnicodeObject *)str1,
4035 (PyUnicodeObject *)str2,
4036 maxcount);
4037 Py_DECREF(self);
4038 Py_DECREF(str1);
4039 Py_DECREF(str2);
4040 return result;
4041}
4042
4043static char replace__doc__[] =
4044"S.replace (old, new[, maxsplit]) -> unicode\n\
4045\n\
4046Return a copy of S with all occurrences of substring\n\
4047old replaced by new. If the optional argument maxsplit is\n\
4048given, only the first maxsplit occurrences are replaced.";
4049
4050static PyObject*
4051unicode_replace(PyUnicodeObject *self, PyObject *args)
4052{
4053 PyUnicodeObject *str1;
4054 PyUnicodeObject *str2;
4055 int maxcount = -1;
4056 PyObject *result;
4057
4058 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4059 return NULL;
4060 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4061 if (str1 == NULL)
4062 return NULL;
4063 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4064 if (str2 == NULL)
4065 return NULL;
4066
4067 result = replace(self, str1, str2, maxcount);
4068
4069 Py_DECREF(str1);
4070 Py_DECREF(str2);
4071 return result;
4072}
4073
4074static
4075PyObject *unicode_repr(PyObject *unicode)
4076{
4077 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4078 PyUnicode_GET_SIZE(unicode),
4079 1);
4080}
4081
4082static char rfind__doc__[] =
4083"S.rfind(sub [,start [,end]]) -> int\n\
4084\n\
4085Return the highest index in S where substring sub is found,\n\
4086such that sub is contained within s[start,end]. Optional\n\
4087arguments start and end are interpreted as in slice notation.\n\
4088\n\
4089Return -1 on failure.";
4090
4091static PyObject *
4092unicode_rfind(PyUnicodeObject *self, PyObject *args)
4093{
4094 PyUnicodeObject *substring;
4095 int start = 0;
4096 int end = INT_MAX;
4097 PyObject *result;
4098
Guido van Rossumb8872e62000-05-09 14:14:27 +00004099 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4100 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 return NULL;
4102 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4103 (PyObject *)substring);
4104 if (substring == NULL)
4105 return NULL;
4106
4107 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4108
4109 Py_DECREF(substring);
4110 return result;
4111}
4112
4113static char rindex__doc__[] =
4114"S.rindex(sub [,start [,end]]) -> int\n\
4115\n\
4116Like S.rfind() but raise ValueError when the substring is not found.";
4117
4118static PyObject *
4119unicode_rindex(PyUnicodeObject *self, PyObject *args)
4120{
4121 int result;
4122 PyUnicodeObject *substring;
4123 int start = 0;
4124 int end = INT_MAX;
4125
Guido van Rossumb8872e62000-05-09 14:14:27 +00004126 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4127 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 return NULL;
4129 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4130 (PyObject *)substring);
4131 if (substring == NULL)
4132 return NULL;
4133
4134 result = findstring(self, substring, start, end, -1);
4135
4136 Py_DECREF(substring);
4137 if (result < 0) {
4138 PyErr_SetString(PyExc_ValueError, "substring not found");
4139 return NULL;
4140 }
4141 return PyInt_FromLong(result);
4142}
4143
4144static char rjust__doc__[] =
4145"S.rjust(width) -> unicode\n\
4146\n\
4147Return S right justified in a Unicode string of length width. Padding is\n\
4148done using spaces.";
4149
4150static PyObject *
4151unicode_rjust(PyUnicodeObject *self, PyObject *args)
4152{
4153 int width;
4154 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4155 return NULL;
4156
4157 if (self->length >= width) {
4158 Py_INCREF(self);
4159 return (PyObject*) self;
4160 }
4161
4162 return (PyObject*) pad(self, width - self->length, 0, ' ');
4163}
4164
4165static char rstrip__doc__[] =
4166"S.rstrip() -> unicode\n\
4167\n\
4168Return a copy of the string S with trailing whitespace removed.";
4169
4170static PyObject *
4171unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4172{
4173 if (!PyArg_NoArgs(args))
4174 return NULL;
4175 return strip(self, 0, 1);
4176}
4177
4178static PyObject*
4179unicode_slice(PyUnicodeObject *self, int start, int end)
4180{
4181 /* standard clamping */
4182 if (start < 0)
4183 start = 0;
4184 if (end < 0)
4185 end = 0;
4186 if (end > self->length)
4187 end = self->length;
4188 if (start == 0 && end == self->length) {
4189 /* full slice, return original string */
4190 Py_INCREF(self);
4191 return (PyObject*) self;
4192 }
4193 if (start > end)
4194 start = end;
4195 /* copy slice */
4196 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4197 end - start);
4198}
4199
4200PyObject *PyUnicode_Split(PyObject *s,
4201 PyObject *sep,
4202 int maxsplit)
4203{
4204 PyObject *result;
4205
4206 s = PyUnicode_FromObject(s);
4207 if (s == NULL)
4208 return NULL;
4209 if (sep != NULL) {
4210 sep = PyUnicode_FromObject(sep);
4211 if (sep == NULL) {
4212 Py_DECREF(s);
4213 return NULL;
4214 }
4215 }
4216
4217 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4218
4219 Py_DECREF(s);
4220 Py_XDECREF(sep);
4221 return result;
4222}
4223
4224static char split__doc__[] =
4225"S.split([sep [,maxsplit]]) -> list of strings\n\
4226\n\
4227Return a list of the words in S, using sep as the\n\
4228delimiter string. If maxsplit is given, at most maxsplit\n\
4229splits are done. If sep is not specified, any whitespace string\n\
4230is a separator.";
4231
4232static PyObject*
4233unicode_split(PyUnicodeObject *self, PyObject *args)
4234{
4235 PyObject *substring = Py_None;
4236 int maxcount = -1;
4237
4238 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4239 return NULL;
4240
4241 if (substring == Py_None)
4242 return split(self, NULL, maxcount);
4243 else if (PyUnicode_Check(substring))
4244 return split(self, (PyUnicodeObject *)substring, maxcount);
4245 else
4246 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4247}
4248
4249static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004250"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251\n\
4252Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004253Line breaks are not included in the resulting list unless keepends\n\
4254is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255
4256static PyObject*
4257unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4258{
Guido van Rossum86662912000-04-11 15:38:46 +00004259 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260
Guido van Rossum86662912000-04-11 15:38:46 +00004261 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 return NULL;
4263
Guido van Rossum86662912000-04-11 15:38:46 +00004264 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265}
4266
4267static
4268PyObject *unicode_str(PyUnicodeObject *self)
4269{
Fred Drakee4315f52000-05-09 19:53:39 +00004270 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271}
4272
4273static char strip__doc__[] =
4274"S.strip() -> unicode\n\
4275\n\
4276Return a copy of S with leading and trailing whitespace removed.";
4277
4278static PyObject *
4279unicode_strip(PyUnicodeObject *self, PyObject *args)
4280{
4281 if (!PyArg_NoArgs(args))
4282 return NULL;
4283 return strip(self, 1, 1);
4284}
4285
4286static char swapcase__doc__[] =
4287"S.swapcase() -> unicode\n\
4288\n\
4289Return a copy of S with uppercase characters converted to lowercase\n\
4290and vice versa.";
4291
4292static PyObject*
4293unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4294{
4295 if (!PyArg_NoArgs(args))
4296 return NULL;
4297 return fixup(self, fixswapcase);
4298}
4299
4300static char translate__doc__[] =
4301"S.translate(table) -> unicode\n\
4302\n\
4303Return a copy of the string S, where all characters have been mapped\n\
4304through the given translation table, which must be a mapping of\n\
4305Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4306are left untouched. Characters mapped to None are deleted.";
4307
4308static PyObject*
4309unicode_translate(PyUnicodeObject *self, PyObject *args)
4310{
4311 PyObject *table;
4312
4313 if (!PyArg_ParseTuple(args, "O:translate", &table))
4314 return NULL;
4315 return PyUnicode_TranslateCharmap(self->str,
4316 self->length,
4317 table,
4318 "ignore");
4319}
4320
4321static char upper__doc__[] =
4322"S.upper() -> unicode\n\
4323\n\
4324Return a copy of S converted to uppercase.";
4325
4326static PyObject*
4327unicode_upper(PyUnicodeObject *self, PyObject *args)
4328{
4329 if (!PyArg_NoArgs(args))
4330 return NULL;
4331 return fixup(self, fixupper);
4332}
4333
4334#if 0
4335static char zfill__doc__[] =
4336"S.zfill(width) -> unicode\n\
4337\n\
4338Pad a numeric string x with zeros on the left, to fill a field\n\
4339of the specified width. The string x is never truncated.";
4340
4341static PyObject *
4342unicode_zfill(PyUnicodeObject *self, PyObject *args)
4343{
4344 int fill;
4345 PyUnicodeObject *u;
4346
4347 int width;
4348 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4349 return NULL;
4350
4351 if (self->length >= width) {
4352 Py_INCREF(self);
4353 return (PyObject*) self;
4354 }
4355
4356 fill = width - self->length;
4357
4358 u = pad(self, fill, 0, '0');
4359
4360 if (u->str[fill] == '+' || u->str[fill] == '-') {
4361 /* move sign to beginning of string */
4362 u->str[0] = u->str[fill];
4363 u->str[fill] = '0';
4364 }
4365
4366 return (PyObject*) u;
4367}
4368#endif
4369
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args)) {
        return PyInt_FromLong(unicode_freelist_size);
    }
    return NULL;
}
#endif
4379
4380static char startswith__doc__[] =
4381"S.startswith(prefix[, start[, end]]) -> int\n\
4382\n\
4383Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4384optional start, test S beginning at that position. With optional end, stop\n\
4385comparing S at that position.";
4386
4387static PyObject *
4388unicode_startswith(PyUnicodeObject *self,
4389 PyObject *args)
4390{
4391 PyUnicodeObject *substring;
4392 int start = 0;
4393 int end = INT_MAX;
4394 PyObject *result;
4395
Guido van Rossumb8872e62000-05-09 14:14:27 +00004396 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4397 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 return NULL;
4399 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4400 (PyObject *)substring);
4401 if (substring == NULL)
4402 return NULL;
4403
4404 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4405
4406 Py_DECREF(substring);
4407 return result;
4408}
4409
4410
4411static char endswith__doc__[] =
4412"S.endswith(suffix[, start[, end]]) -> int\n\
4413\n\
4414Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4415optional start, test S beginning at that position. With optional end, stop\n\
4416comparing S at that position.";
4417
4418static PyObject *
4419unicode_endswith(PyUnicodeObject *self,
4420 PyObject *args)
4421{
4422 PyUnicodeObject *substring;
4423 int start = 0;
4424 int end = INT_MAX;
4425 PyObject *result;
4426
Guido van Rossumb8872e62000-05-09 14:14:27 +00004427 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4428 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 return NULL;
4430 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4431 (PyObject *)substring);
4432 if (substring == NULL)
4433 return NULL;
4434
4435 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4436
4437 Py_DECREF(substring);
4438 return result;
4439}
4440
4441
/* Method table for unicode objects.  unicode_getattr() passes this
   table to Py_FindMethod() to resolve attribute names.

   Each entry is {name, C implementation, flags, docstring}.  For the
   implementations visible next to this table, flag 1 is used by
   functions that unpack `args` with PyArg_ParseTuple() and flag 0 by
   functions that call PyArg_NoArgs() -- presumably the tuple-based vs.
   old-style calling conventions; confirm against the PyMethodDef
   documentation before changing a flag. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
    /* Disabled entries: the matching unicode_zfill() definition is
       itself wrapped in #if 0, so these must stay disabled together. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    /* Sentinel entry terminating the table. */
    {NULL, NULL}
};
4494
4495static PyObject *
4496unicode_getattr(PyUnicodeObject *self, char *name)
4497{
4498 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4499}
4500
/* Sequence slot table for the unicode type; PyUnicode_Type refers to it
   through its tp_as_sequence entry.  The two assignment slots are left
   as 0, i.e. no item or slice assignment handler is installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4511
4512static int
4513unicode_buffer_getreadbuf(PyUnicodeObject *self,
4514 int index,
4515 const void **ptr)
4516{
4517 if (index != 0) {
4518 PyErr_SetString(PyExc_SystemError,
4519 "accessing non-existent unicode segment");
4520 return -1;
4521 }
4522 *ptr = (void *) self->str;
4523 return PyUnicode_GET_DATA_SIZE(self);
4524}
4525
4526static int
4527unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4528 const void **ptr)
4529{
4530 PyErr_SetString(PyExc_TypeError,
4531 "cannot use unicode as modifyable buffer");
4532 return -1;
4533}
4534
4535static int
4536unicode_buffer_getsegcount(PyUnicodeObject *self,
4537 int *lenp)
4538{
4539 if (lenp)
4540 *lenp = PyUnicode_GET_DATA_SIZE(self);
4541 return 1;
4542}
4543
4544static int
4545unicode_buffer_getcharbuf(PyUnicodeObject *self,
4546 int index,
4547 const void **ptr)
4548{
4549 PyObject *str;
4550
4551 if (index != 0) {
4552 PyErr_SetString(PyExc_SystemError,
4553 "accessing non-existent unicode segment");
4554 return -1;
4555 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004556 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557 if (str == NULL)
4558 return -1;
4559 *ptr = (void *) PyString_AS_STRING(str);
4560 return PyString_GET_SIZE(str);
4561}
4562
4563/* Helpers for PyUnicode_Format() */
4564
4565static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004566getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567{
4568 int argidx = *p_argidx;
4569 if (argidx < arglen) {
4570 (*p_argidx)++;
4571 if (arglen < 0)
4572 return args;
4573 else
4574 return PyTuple_GetItem(args, argidx);
4575 }
4576 PyErr_SetString(PyExc_TypeError,
4577 "not enough arguments for format string");
4578 return NULL;
4579}
4580
/* Conversion flags collected by PyUnicode_Format() while it scans a
   format specifier.  Each one is a distinct bit so that they can be
   OR-ed together into a single `flags` int. */
#define F_LJUST (1<<0)  /* '-' flag (also set for a negative '*' width) */
#define F_SIGN (1<<1)   /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT (1<<3)    /* '#' flag */
#define F_ZERO (1<<4)   /* '0' flag */
4586
4587static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589{
4590 register int i;
4591 int len;
4592 va_list va;
4593 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595
4596 /* First, format the string as char array, then expand to Py_UNICODE
4597 array. */
4598 charbuffer = (char *)buffer;
4599 len = vsprintf(charbuffer, format, va);
4600 for (i = len - 1; i >= 0; i--)
4601 buffer[i] = (Py_UNICODE) charbuffer[i];
4602
4603 va_end(va);
4604 return len;
4605}
4606
4607static int
4608formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004609 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610 int flags,
4611 int prec,
4612 int type,
4613 PyObject *v)
4614{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004615 /* fmt = '%#.' + `prec` + `type`
4616 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617 char fmt[20];
4618 double x;
4619
4620 x = PyFloat_AsDouble(v);
4621 if (x == -1.0 && PyErr_Occurred())
4622 return -1;
4623 if (prec < 0)
4624 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4626 type = 'g';
4627 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004628 /* worst case length calc to ensure no buffer overrun:
4629 fmt = %#.<prec>g
4630 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4631 for any double rep.)
4632 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4633 If prec=0 the effective precision is 1 (the leading digit is
4634 always given), therefore increase by one to 10+prec. */
4635 if (buflen <= (size_t)10 + (size_t)prec) {
4636 PyErr_SetString(PyExc_OverflowError,
4637 "formatted float is too long (precision too long?)");
4638 return -1;
4639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640 return usprintf(buf, fmt, x);
4641}
4642
4643static int
4644formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004645 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 int flags,
4647 int prec,
4648 int type,
4649 PyObject *v)
4650{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004651 /* fmt = '%#.' + `prec` + 'l' + `type`
4652 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 char fmt[20];
4654 long x;
4655
4656 x = PyInt_AsLong(v);
4657 if (x == -1 && PyErr_Occurred())
4658 return -1;
4659 if (prec < 0)
4660 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004661 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4662 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4663 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4664 PyErr_SetString(PyExc_OverflowError,
4665 "formatted integer is too long (precision too long?)");
4666 return -1;
4667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4669 return usprintf(buf, fmt, x);
4670}
4671
4672static int
4673formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004674 size_t buflen,
4675 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004677 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004678 if (PyUnicode_Check(v)) {
4679 if (PyUnicode_GET_SIZE(v) != 1)
4680 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004684 else if (PyString_Check(v)) {
4685 if (PyString_GET_SIZE(v) != 1)
4686 goto onError;
4687 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689
4690 else {
4691 /* Integer input truncated to a character */
4692 long x;
4693 x = PyInt_AsLong(v);
4694 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 buf[0] = (char) x;
4697 }
4698 buf[1] = '\0';
4699 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004700
4701 onError:
4702 PyErr_SetString(PyExc_TypeError,
4703 "%c requires int or char");
4704 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705}
4706
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004707/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4708
4709 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4710 chars are formatted. XXX This is a magic number. Each formatting
4711 routine does bounds checking to ensure no overflow, but a better
4712 solution may be to malloc a buffer of appropriate size for each
4713 format. For now, the current solution is sufficient.
4714*/
4715#define FORMATBUFLEN (size_t)120
4716
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717PyObject *PyUnicode_Format(PyObject *format,
4718 PyObject *args)
4719{
4720 Py_UNICODE *fmt, *res;
4721 int fmtcnt, rescnt, reslen, arglen, argidx;
4722 int args_owned = 0;
4723 PyUnicodeObject *result = NULL;
4724 PyObject *dict = NULL;
4725 PyObject *uformat;
4726
4727 if (format == NULL || args == NULL) {
4728 PyErr_BadInternalCall();
4729 return NULL;
4730 }
4731 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004732 if (uformat == NULL)
4733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 fmt = PyUnicode_AS_UNICODE(uformat);
4735 fmtcnt = PyUnicode_GET_SIZE(uformat);
4736
4737 reslen = rescnt = fmtcnt + 100;
4738 result = _PyUnicode_New(reslen);
4739 if (result == NULL)
4740 goto onError;
4741 res = PyUnicode_AS_UNICODE(result);
4742
4743 if (PyTuple_Check(args)) {
4744 arglen = PyTuple_Size(args);
4745 argidx = 0;
4746 }
4747 else {
4748 arglen = -1;
4749 argidx = -2;
4750 }
4751 if (args->ob_type->tp_as_mapping)
4752 dict = args;
4753
4754 while (--fmtcnt >= 0) {
4755 if (*fmt != '%') {
4756 if (--rescnt < 0) {
4757 rescnt = fmtcnt + 100;
4758 reslen += rescnt;
4759 if (_PyUnicode_Resize(result, reslen) < 0)
4760 return NULL;
4761 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4762 --rescnt;
4763 }
4764 *res++ = *fmt++;
4765 }
4766 else {
4767 /* Got a format specifier */
4768 int flags = 0;
4769 int width = -1;
4770 int prec = -1;
4771 int size = 0;
4772 Py_UNICODE c = '\0';
4773 Py_UNICODE fill;
4774 PyObject *v = NULL;
4775 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004776 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 Py_UNICODE sign;
4778 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004779 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780
4781 fmt++;
4782 if (*fmt == '(') {
4783 Py_UNICODE *keystart;
4784 int keylen;
4785 PyObject *key;
4786 int pcount = 1;
4787
4788 if (dict == NULL) {
4789 PyErr_SetString(PyExc_TypeError,
4790 "format requires a mapping");
4791 goto onError;
4792 }
4793 ++fmt;
4794 --fmtcnt;
4795 keystart = fmt;
4796 /* Skip over balanced parentheses */
4797 while (pcount > 0 && --fmtcnt >= 0) {
4798 if (*fmt == ')')
4799 --pcount;
4800 else if (*fmt == '(')
4801 ++pcount;
4802 fmt++;
4803 }
4804 keylen = fmt - keystart - 1;
4805 if (fmtcnt < 0 || pcount > 0) {
4806 PyErr_SetString(PyExc_ValueError,
4807 "incomplete format key");
4808 goto onError;
4809 }
Fred Drakee4315f52000-05-09 19:53:39 +00004810 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 then looked up since Python uses strings to hold
4812 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004813 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 key = PyUnicode_EncodeUTF8(keystart,
4815 keylen,
4816 NULL);
4817 if (key == NULL)
4818 goto onError;
4819 if (args_owned) {
4820 Py_DECREF(args);
4821 args_owned = 0;
4822 }
4823 args = PyObject_GetItem(dict, key);
4824 Py_DECREF(key);
4825 if (args == NULL) {
4826 goto onError;
4827 }
4828 args_owned = 1;
4829 arglen = -1;
4830 argidx = -2;
4831 }
4832 while (--fmtcnt >= 0) {
4833 switch (c = *fmt++) {
4834 case '-': flags |= F_LJUST; continue;
4835 case '+': flags |= F_SIGN; continue;
4836 case ' ': flags |= F_BLANK; continue;
4837 case '#': flags |= F_ALT; continue;
4838 case '0': flags |= F_ZERO; continue;
4839 }
4840 break;
4841 }
4842 if (c == '*') {
4843 v = getnextarg(args, arglen, &argidx);
4844 if (v == NULL)
4845 goto onError;
4846 if (!PyInt_Check(v)) {
4847 PyErr_SetString(PyExc_TypeError,
4848 "* wants int");
4849 goto onError;
4850 }
4851 width = PyInt_AsLong(v);
4852 if (width < 0) {
4853 flags |= F_LJUST;
4854 width = -width;
4855 }
4856 if (--fmtcnt >= 0)
4857 c = *fmt++;
4858 }
4859 else if (c >= '0' && c <= '9') {
4860 width = c - '0';
4861 while (--fmtcnt >= 0) {
4862 c = *fmt++;
4863 if (c < '0' || c > '9')
4864 break;
4865 if ((width*10) / 10 != width) {
4866 PyErr_SetString(PyExc_ValueError,
4867 "width too big");
4868 goto onError;
4869 }
4870 width = width*10 + (c - '0');
4871 }
4872 }
4873 if (c == '.') {
4874 prec = 0;
4875 if (--fmtcnt >= 0)
4876 c = *fmt++;
4877 if (c == '*') {
4878 v = getnextarg(args, arglen, &argidx);
4879 if (v == NULL)
4880 goto onError;
4881 if (!PyInt_Check(v)) {
4882 PyErr_SetString(PyExc_TypeError,
4883 "* wants int");
4884 goto onError;
4885 }
4886 prec = PyInt_AsLong(v);
4887 if (prec < 0)
4888 prec = 0;
4889 if (--fmtcnt >= 0)
4890 c = *fmt++;
4891 }
4892 else if (c >= '0' && c <= '9') {
4893 prec = c - '0';
4894 while (--fmtcnt >= 0) {
4895 c = Py_CHARMASK(*fmt++);
4896 if (c < '0' || c > '9')
4897 break;
4898 if ((prec*10) / 10 != prec) {
4899 PyErr_SetString(PyExc_ValueError,
4900 "prec too big");
4901 goto onError;
4902 }
4903 prec = prec*10 + (c - '0');
4904 }
4905 }
4906 } /* prec */
4907 if (fmtcnt >= 0) {
4908 if (c == 'h' || c == 'l' || c == 'L') {
4909 size = c;
4910 if (--fmtcnt >= 0)
4911 c = *fmt++;
4912 }
4913 }
4914 if (fmtcnt < 0) {
4915 PyErr_SetString(PyExc_ValueError,
4916 "incomplete format");
4917 goto onError;
4918 }
4919 if (c != '%') {
4920 v = getnextarg(args, arglen, &argidx);
4921 if (v == NULL)
4922 goto onError;
4923 }
4924 sign = 0;
4925 fill = ' ';
4926 switch (c) {
4927
4928 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004929 pbuf = formatbuf;
4930 /* presume that buffer length is at least 1 */
4931 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 len = 1;
4933 break;
4934
4935 case 's':
4936 case 'r':
4937 if (PyUnicode_Check(v) && c == 's') {
4938 temp = v;
4939 Py_INCREF(temp);
4940 }
4941 else {
4942 PyObject *unicode;
4943 if (c == 's')
4944 temp = PyObject_Str(v);
4945 else
4946 temp = PyObject_Repr(v);
4947 if (temp == NULL)
4948 goto onError;
4949 if (!PyString_Check(temp)) {
4950 /* XXX Note: this should never happen, since
4951 PyObject_Repr() and PyObject_Str() assure
4952 this */
4953 Py_DECREF(temp);
4954 PyErr_SetString(PyExc_TypeError,
4955 "%s argument has non-string str()");
4956 goto onError;
4957 }
Fred Drakee4315f52000-05-09 19:53:39 +00004958 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004960 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 "strict");
4962 Py_DECREF(temp);
4963 temp = unicode;
4964 if (temp == NULL)
4965 goto onError;
4966 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004967 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 len = PyUnicode_GET_SIZE(temp);
4969 if (prec >= 0 && len > prec)
4970 len = prec;
4971 break;
4972
4973 case 'i':
4974 case 'd':
4975 case 'u':
4976 case 'o':
4977 case 'x':
4978 case 'X':
4979 if (c == 'i')
4980 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004981 pbuf = formatbuf;
4982 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4983 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 if (len < 0)
4985 goto onError;
4986 sign = (c == 'd');
4987 if (flags & F_ZERO) {
4988 fill = '0';
4989 if ((flags&F_ALT) &&
4990 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004991 pbuf[0] == '0' && pbuf[1] == c) {
4992 *res++ = *pbuf++;
4993 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994 rescnt -= 2;
4995 len -= 2;
4996 width -= 2;
4997 if (width < 0)
4998 width = 0;
4999 }
5000 }
5001 break;
5002
5003 case 'e':
5004 case 'E':
5005 case 'f':
5006 case 'g':
5007 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005008 pbuf = formatbuf;
5009 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5010 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011 if (len < 0)
5012 goto onError;
5013 sign = 1;
5014 if (flags&F_ZERO)
5015 fill = '0';
5016 break;
5017
5018 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005019 pbuf = formatbuf;
5020 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 if (len < 0)
5022 goto onError;
5023 break;
5024
5025 default:
5026 PyErr_Format(PyExc_ValueError,
5027 "unsupported format character '%c' (0x%x)",
5028 c, c);
5029 goto onError;
5030 }
5031 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005032 if (*pbuf == '-' || *pbuf == '+') {
5033 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 len--;
5035 }
5036 else if (flags & F_SIGN)
5037 sign = '+';
5038 else if (flags & F_BLANK)
5039 sign = ' ';
5040 else
5041 sign = 0;
5042 }
5043 if (width < len)
5044 width = len;
5045 if (rescnt < width + (sign != 0)) {
5046 reslen -= rescnt;
5047 rescnt = width + fmtcnt + 100;
5048 reslen += rescnt;
5049 if (_PyUnicode_Resize(result, reslen) < 0)
5050 return NULL;
5051 res = PyUnicode_AS_UNICODE(result)
5052 + reslen - rescnt;
5053 }
5054 if (sign) {
5055 if (fill != ' ')
5056 *res++ = sign;
5057 rescnt--;
5058 if (width > len)
5059 width--;
5060 }
5061 if (width > len && !(flags & F_LJUST)) {
5062 do {
5063 --rescnt;
5064 *res++ = fill;
5065 } while (--width > len);
5066 }
5067 if (sign && fill == ' ')
5068 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005069 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 res += len;
5071 rescnt -= len;
5072 while (--width >= len) {
5073 --rescnt;
5074 *res++ = ' ';
5075 }
5076 if (dict && (argidx < arglen) && c != '%') {
5077 PyErr_SetString(PyExc_TypeError,
5078 "not all arguments converted");
5079 goto onError;
5080 }
5081 Py_XDECREF(temp);
5082 } /* '%' */
5083 } /* until end */
5084 if (argidx < arglen && !dict) {
5085 PyErr_SetString(PyExc_TypeError,
5086 "not all arguments converted");
5087 goto onError;
5088 }
5089
5090 if (args_owned) {
5091 Py_DECREF(args);
5092 }
5093 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005094 if (_PyUnicode_Resize(result, reslen - rescnt))
5095 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 return (PyObject *)result;
5097
5098 onError:
5099 Py_XDECREF(result);
5100 Py_DECREF(uformat);
5101 if (args_owned) {
5102 Py_DECREF(args);
5103 }
5104 return NULL;
5105}
5106
/* Buffer slot table for the unicode type; PyUnicode_Type refers to it
   through its tp_as_buffer entry. */
static PyBufferProcs unicode_as_buffer = {
    /* exposes self->str for segment 0 */
    (getreadbufferproc) unicode_buffer_getreadbuf,
    /* always fails with TypeError */
    (getwritebufferproc) unicode_buffer_getwritebuf,
    /* always reports one segment */
    (getsegcountproc) unicode_buffer_getsegcount,
    /* exposes the default-encoded string for segment 0 */
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5113
/* Type object for unicode strings.  Attribute access goes through
   unicode_getattr (tp_getattr); the sequence and buffer behaviour comes
   from the unicode_as_sequence and unicode_as_buffer slot tables.
   Slots given as 0 / NULL have no handler installed here. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, /* ob_size */
    "unicode", /* tp_name */
    sizeof(PyUnicodeObject), /* tp_size */
    0, /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)unicode_getattr, /* tp_getattr */
    0, /* tp_setattr */
    (cmpfunc) unicode_compare, /* tp_compare */
    (reprfunc) unicode_repr, /* tp_repr */
    0, /* tp_as_number */
    &unicode_as_sequence, /* tp_as_sequence */
    0, /* tp_as_mapping */
    (hashfunc) unicode_hash, /* tp_hash*/
    0, /* tp_call*/
    (reprfunc) unicode_str, /* tp_str */
    (getattrofunc) NULL, /* tp_getattro */
    (setattrofunc) NULL, /* tp_setattro */
    &unicode_as_buffer, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
};
5138
5139/* Initialize the Unicode implementation */
5140
Thomas Wouters78890102000-07-22 19:25:51 +00005141void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142{
5143 /* Doublecheck the configuration... */
5144 if (sizeof(Py_UNICODE) != 2)
5145 Py_FatalError("Unicode configuration error: "
5146 "sizeof(Py_UNICODE) != 2 bytes");
5147
Fred Drakee4315f52000-05-09 19:53:39 +00005148 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005149 unicode_freelist = NULL;
5150 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005152 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153}
5154
5155/* Finalize the Unicode implementation */
5156
5157void
Thomas Wouters78890102000-07-22 19:25:51 +00005158_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159{
5160 PyUnicodeObject *u = unicode_freelist;
5161
5162 while (u != NULL) {
5163 PyUnicodeObject *v = u;
5164 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005165 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005166 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005167 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005168 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005170 unicode_freelist = NULL;
5171 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005173 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174}