blob: 1d35c3d3805ff02d5a1366b10c5303f6cd891cfe [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000111/* --- Globals ------------------------------------------------------------
112
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
115
116*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
118/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000119static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000120
121/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124
Fred Drakee4315f52000-05-09 19:53:39 +0000125/* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
127
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
140{
141 void *oldstr;
142
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000143 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
152 }
153
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
162 }
163 unicode->str[length] = 0;
164 unicode->length = length;
165
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000166 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000168 if (unicode->defenc) {
169 Py_DECREF(unicode->defenc);
170 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000171 }
172 unicode->hash = -1;
173
174 return 0;
175}
176
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000177int PyUnicode_Resize(PyObject **unicode,
178 int length)
179{
180 PyUnicodeObject *v;
181
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
190 }
191 return _PyUnicode_Resize(v, length);
192}
193
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205 register PyUnicodeObject *unicode;
206
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
211 }
212
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000216 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000219 /* Keep-Alive optimization: we only upsize the buffer,
220 never downsize it. */
221 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000223 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000224 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000227 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000229 }
230 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 }
232 else {
233 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
234 if (unicode == NULL)
235 return NULL;
236 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
237 }
238
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000239 if (!unicode->str) {
240 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000241 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 unicode->str[length] = 0;
244 unicode->length = length;
245 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000246 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000248
249 onError:
250 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000251 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253}
254
255static
256void _PyUnicode_Free(register PyUnicodeObject *unicode)
257{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000259 /* Keep-Alive optimization */
260 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = NULL;
263 unicode->length = 0;
264 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000265 if (unicode->defenc) {
266 Py_DECREF(unicode->defenc);
267 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000268 }
269 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 *(PyUnicodeObject **)unicode = unicode_freelist;
271 unicode_freelist = unicode;
272 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 }
274 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000275 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000277 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279}
280
281PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
282 int size)
283{
284 PyUnicodeObject *unicode;
285
286 unicode = _PyUnicode_New(size);
287 if (!unicode)
288 return NULL;
289
290 /* Copy the Unicode data into the new object */
291 if (u != NULL)
292 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
293
294 return (PyObject *)unicode;
295}
296
297#ifdef HAVE_WCHAR_H
298
299PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
300 int size)
301{
302 PyUnicodeObject *unicode;
303
304 if (w == NULL) {
305 PyErr_BadInternalCall();
306 return NULL;
307 }
308
309 unicode = _PyUnicode_New(size);
310 if (!unicode)
311 return NULL;
312
313 /* Copy the wchar_t data into the new object */
314#ifdef HAVE_USABLE_WCHAR_T
315 memcpy(unicode->str, w, size * sizeof(wchar_t));
316#else
317 {
318 register Py_UNICODE *u;
319 register int i;
320 u = PyUnicode_AS_UNICODE(unicode);
321 for (i = size; i >= 0; i--)
322 *u++ = *w++;
323 }
324#endif
325
326 return (PyObject *)unicode;
327}
328
329int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
330 register wchar_t *w,
331 int size)
332{
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
336 }
337 if (size > PyUnicode_GET_SIZE(unicode))
338 size = PyUnicode_GET_SIZE(unicode);
339#ifdef HAVE_USABLE_WCHAR_T
340 memcpy(w, unicode->str, size * sizeof(wchar_t));
341#else
342 {
343 register Py_UNICODE *u;
344 register int i;
345 u = PyUnicode_AS_UNICODE(unicode);
346 for (i = size; i >= 0; i--)
347 *w++ = *u++;
348 }
349#endif
350
351 return size;
352}
353
354#endif
355
356PyObject *PyUnicode_FromObject(register PyObject *obj)
357{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000358 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
359}
360
361PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
362 const char *encoding,
363 const char *errors)
364{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 const char *s;
366 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000367 int owned = 0;
368 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370 if (obj == NULL) {
371 PyErr_BadInternalCall();
372 return NULL;
373 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000374
375 /* Coerce object */
376 if (PyInstance_Check(obj)) {
377 PyObject *func;
378 func = PyObject_GetAttrString(obj, "__str__");
379 if (func == NULL) {
380 PyErr_SetString(PyExc_TypeError,
381 "coercing to Unicode: instance doesn't define __str__");
382 return NULL;
383 }
384 obj = PyEval_CallObject(func, NULL);
385 Py_DECREF(func);
386 if (obj == NULL)
387 return NULL;
388 owned = 1;
389 }
390 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000392 v = obj;
393 if (encoding) {
394 PyErr_SetString(PyExc_TypeError,
395 "decoding Unicode is not supported");
396 return NULL;
397 }
398 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400 else if (PyString_Check(obj)) {
401 s = PyString_AS_STRING(obj);
402 len = PyString_GET_SIZE(obj);
403 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000404 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
405 /* Overwrite the error message with something more useful in
406 case of a TypeError. */
407 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000408 PyErr_Format(PyExc_TypeError,
409 "coercing to Unicode: need string or buffer, "
410 "%.80s found",
411 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414
415 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 if (len == 0) {
417 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 else
421 v = PyUnicode_Decode(s, len, encoding, errors);
422 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return v;
427
428 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000429 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000430 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000431 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433}
434
435PyObject *PyUnicode_Decode(const char *s,
436 int size,
437 const char *encoding,
438 const char *errors)
439{
440 PyObject *buffer = NULL, *unicode;
441
Fred Drakee4315f52000-05-09 19:53:39 +0000442 if (encoding == NULL)
443 encoding = PyUnicode_GetDefaultEncoding();
444
445 /* Shortcuts for common default encodings */
446 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000448 else if (strcmp(encoding, "latin-1") == 0)
449 return PyUnicode_DecodeLatin1(s, size, errors);
450 else if (strcmp(encoding, "ascii") == 0)
451 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452
453 /* Decode via the codec registry */
454 buffer = PyBuffer_FromMemory((void *)s, size);
455 if (buffer == NULL)
456 goto onError;
457 unicode = PyCodec_Decode(buffer, encoding, errors);
458 if (unicode == NULL)
459 goto onError;
460 if (!PyUnicode_Check(unicode)) {
461 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000462 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 unicode->ob_type->tp_name);
464 Py_DECREF(unicode);
465 goto onError;
466 }
467 Py_DECREF(buffer);
468 return unicode;
469
470 onError:
471 Py_XDECREF(buffer);
472 return NULL;
473}
474
475PyObject *PyUnicode_Encode(const Py_UNICODE *s,
476 int size,
477 const char *encoding,
478 const char *errors)
479{
480 PyObject *v, *unicode;
481
482 unicode = PyUnicode_FromUnicode(s, size);
483 if (unicode == NULL)
484 return NULL;
485 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
486 Py_DECREF(unicode);
487 return v;
488}
489
490PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
491 const char *encoding,
492 const char *errors)
493{
494 PyObject *v;
495
496 if (!PyUnicode_Check(unicode)) {
497 PyErr_BadArgument();
498 goto onError;
499 }
Fred Drakee4315f52000-05-09 19:53:39 +0000500
501 if (encoding == NULL)
502 encoding = PyUnicode_GetDefaultEncoding();
503
504 /* Shortcuts for common default encodings */
505 if (errors == NULL) {
506 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000508 else if (strcmp(encoding, "latin-1") == 0)
509 return PyUnicode_AsLatin1String(unicode);
510 else if (strcmp(encoding, "ascii") == 0)
511 return PyUnicode_AsASCIIString(unicode);
512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 /* Encode via the codec registry */
515 v = PyCodec_Encode(unicode, encoding, errors);
516 if (v == NULL)
517 goto onError;
518 /* XXX Should we really enforce this ? */
519 if (!PyString_Check(v)) {
520 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000521 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522 v->ob_type->tp_name);
523 Py_DECREF(v);
524 goto onError;
525 }
526 return v;
527
528 onError:
529 return NULL;
530}
531
/* Return a Python string holding the default encoded value of the
   Unicode object.

   The resulting string is cached in the Unicode object for subsequent
   usage by this function. The cached version is needed to implement
   the character buffer interface and will live (at least) as long as
   the Unicode object itself.

   The refcount of the string is *not* incremented.

   *** Exported for internal use by the interpreter only !!! ***

*/
545
546PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
547 const char *errors)
548{
549 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
550
551 if (v)
552 return v;
553 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
554 if (v && errors == NULL)
555 ((PyUnicodeObject *)unicode)->defenc = v;
556 return v;
557}
558
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
560{
561 if (!PyUnicode_Check(unicode)) {
562 PyErr_BadArgument();
563 goto onError;
564 }
565 return PyUnicode_AS_UNICODE(unicode);
566
567 onError:
568 return NULL;
569}
570
571int PyUnicode_GetSize(PyObject *unicode)
572{
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
576 }
577 return PyUnicode_GET_SIZE(unicode);
578
579 onError:
580 return -1;
581}
582
Thomas Wouters78890102000-07-22 19:25:51 +0000583const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000584{
585 return unicode_default_encoding;
586}
587
588int PyUnicode_SetDefaultEncoding(const char *encoding)
589{
590 PyObject *v;
591
592 /* Make sure the encoding is valid. As side effect, this also
593 loads the encoding into the codec registry cache. */
594 v = _PyCodec_Lookup(encoding);
595 if (v == NULL)
596 goto onError;
597 Py_DECREF(v);
598 strncpy(unicode_default_encoding,
599 encoding,
600 sizeof(unicode_default_encoding));
601 return 0;
602
603 onError:
604 return -1;
605}
606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- UTF-8 Codec -------------------------------------------------------- */
608
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero marks a byte that is not a legal lead byte.  See RFC 2279 for
   details. */

static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
630
631static
632int utf8_decoding_error(const char **source,
633 Py_UNICODE **dest,
634 const char *errors,
635 const char *details)
636{
637 if ((errors == NULL) ||
638 (strcmp(errors,"strict") == 0)) {
639 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000640 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641 details);
642 return -1;
643 }
644 else if (strcmp(errors,"ignore") == 0) {
645 (*source)++;
646 return 0;
647 }
648 else if (strcmp(errors,"replace") == 0) {
649 (*source)++;
650 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
651 (*dest)++;
652 return 0;
653 }
654 else {
655 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000656 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 errors);
658 return -1;
659 }
660}
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662PyObject *PyUnicode_DecodeUTF8(const char *s,
663 int size,
664 const char *errors)
665{
666 int n;
667 const char *e;
668 PyUnicodeObject *unicode;
669 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000670 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671
672 /* Note: size will always be longer than the resulting Unicode
673 character count */
674 unicode = _PyUnicode_New(size);
675 if (!unicode)
676 return NULL;
677 if (size == 0)
678 return (PyObject *)unicode;
679
680 /* Unpack UTF-8 encoded data */
681 p = unicode->str;
682 e = s + size;
683
684 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000685 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686
687 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000688 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 s++;
690 continue;
691 }
692
693 n = utf8_code_length[ch];
694
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 if (s + n > e) {
696 errmsg = "unexpected end of data";
697 goto utf8Error;
698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699
700 switch (n) {
701
702 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000703 errmsg = "unexpected code byte";
704 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705 break;
706
707 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000708 errmsg = "internal error";
709 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000710 break;
711
712 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 if ((s[1] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 if (ch < 0x80) {
719 errmsg = "illegal encoding";
720 goto utf8Error;
721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000723 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724 break;
725
726 case 3:
727 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 (s[2] & 0xc0) != 0x80) {
729 errmsg = "invalid data";
730 goto utf8Error;
731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000733 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
734 errmsg = "illegal encoding";
735 goto utf8Error;
736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000738 *p++ = (Py_UNICODE)ch;
739 break;
740
741 case 4:
742 if ((s[1] & 0xc0) != 0x80 ||
743 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 (s[3] & 0xc0) != 0x80) {
745 errmsg = "invalid data";
746 goto utf8Error;
747 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
750 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 if ((ch < 0x10000) || /* minimum value allowed for 4
752 byte encoding */
753 (ch > 0x10ffff)) { /* maximum value allowed for
754 UTF-16 */
755 errmsg = "illegal encoding";
756 goto utf8Error;
757 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000758 /* compute and append the two surrogates: */
759
760 /* translate from 10000..10FFFF to 0..FFFF */
761 ch -= 0x10000;
762
763 /* high surrogate = top 10 bits added to D800 */
764 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
765
766 /* low surrogate = bottom 10 bits added to DC00 */
767 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 break;
769
770 default:
771 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000772 errmsg = "unsupported Unicode code range";
773 goto utf8Error;
774 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775 }
776 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000777 continue;
778
779 utf8Error:
780 if (utf8_decoding_error(&s, &p, errors, errmsg))
781 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 }
783
784 /* Adjust length */
785 if (_PyUnicode_Resize(unicode, p - unicode->str))
786 goto onError;
787
788 return (PyObject *)unicode;
789
790onError:
791 Py_DECREF(unicode);
792 return NULL;
793}
794
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000797#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798static
799int utf8_encoding_error(const Py_UNICODE **source,
800 char **dest,
801 const char *errors,
802 const char *details)
803{
804 if ((errors == NULL) ||
805 (strcmp(errors,"strict") == 0)) {
806 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000807 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 details);
809 return -1;
810 }
811 else if (strcmp(errors,"ignore") == 0) {
812 return 0;
813 }
814 else if (strcmp(errors,"replace") == 0) {
815 **dest = '?';
816 (*dest)++;
817 return 0;
818 }
819 else {
820 PyErr_Format(PyExc_ValueError,
821 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000822 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 errors);
824 return -1;
825 }
826}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000827#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828
829PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
830 int size,
831 const char *errors)
832{
833 PyObject *v;
834 char *p;
835 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000836 Py_UCS4 ch2;
837 unsigned int cbAllocated = 3 * size;
838 unsigned int cbWritten = 0;
839 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000842 if (v == NULL)
843 return NULL;
844 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000845 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000846
847 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 while (i < size) {
849 Py_UCS4 ch = s[i++];
850 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000852 cbWritten++;
853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 else if (ch < 0x0800) {
855 *p++ = 0xc0 | (ch >> 6);
856 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000857 cbWritten += 2;
858 }
859 else {
860 /* Check for high surrogate */
861 if (0xD800 <= ch && ch <= 0xDBFF) {
862 if (i != size) {
863 ch2 = s[i];
864 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
865
866 if (cbWritten >= (cbAllocated - 4)) {
867 /* Provide enough room for some more
868 surrogates */
869 cbAllocated += 4*10;
870 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000871 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000872 }
873
874 /* combine the two values */
875 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
876
877 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000878 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 i++;
880 cbWritten += 4;
881 }
882 }
883 }
884 else {
885 *p++ = (char)(0xe0 | (ch >> 12));
886 cbWritten += 3;
887 }
888 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
889 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 }
891 }
892 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000893 if (_PyString_Resize(&v, p - q))
894 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895 return v;
896
897 onError:
898 Py_DECREF(v);
899 return NULL;
900}
901
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
903{
904 PyObject *str;
905
906 if (!PyUnicode_Check(unicode)) {
907 PyErr_BadArgument();
908 return NULL;
909 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000910 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
911 PyUnicode_GET_SIZE(unicode),
912 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000913}
914
915/* --- UTF-16 Codec ------------------------------------------------------- */
916
917static
918int utf16_decoding_error(const Py_UNICODE **source,
919 Py_UNICODE **dest,
920 const char *errors,
921 const char *details)
922{
923 if ((errors == NULL) ||
924 (strcmp(errors,"strict") == 0)) {
925 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000926 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000927 details);
928 return -1;
929 }
930 else if (strcmp(errors,"ignore") == 0) {
931 return 0;
932 }
933 else if (strcmp(errors,"replace") == 0) {
934 if (dest) {
935 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
936 (*dest)++;
937 }
938 return 0;
939 }
940 else {
941 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000942 "UTF-16 decoding error; "
943 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944 errors);
945 return -1;
946 }
947}
948
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949PyObject *PyUnicode_DecodeUTF16(const char *s,
950 int size,
951 const char *errors,
952 int *byteorder)
953{
954 PyUnicodeObject *unicode;
955 Py_UNICODE *p;
956 const Py_UNICODE *q, *e;
957 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000958 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000959
960 /* size should be an even number */
961 if (size % sizeof(Py_UNICODE) != 0) {
962 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
963 return NULL;
964 /* The remaining input chars are ignored if we fall through
965 here... */
966 }
967
968 /* Note: size will always be longer than the resulting Unicode
969 character count */
970 unicode = _PyUnicode_New(size);
971 if (!unicode)
972 return NULL;
973 if (size == 0)
974 return (PyObject *)unicode;
975
976 /* Unpack UTF-16 encoded data */
977 p = unicode->str;
978 q = (Py_UNICODE *)s;
979 e = q + (size / sizeof(Py_UNICODE));
980
981 if (byteorder)
982 bo = *byteorder;
983
984 while (q < e) {
985 register Py_UNICODE ch = *q++;
986
987 /* Check for BOM marks (U+FEFF) in the input and adjust
988 current byte order setting accordingly. Swap input
989 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
990 !) */
991#ifdef BYTEORDER_IS_LITTLE_ENDIAN
992 if (ch == 0xFEFF) {
993 bo = -1;
994 continue;
995 } else if (ch == 0xFFFE) {
996 bo = 1;
997 continue;
998 }
999 if (bo == 1)
1000 ch = (ch >> 8) | (ch << 8);
1001#else
1002 if (ch == 0xFEFF) {
1003 bo = 1;
1004 continue;
1005 } else if (ch == 0xFFFE) {
1006 bo = -1;
1007 continue;
1008 }
1009 if (bo == -1)
1010 ch = (ch >> 8) | (ch << 8);
1011#endif
1012 if (ch < 0xD800 || ch > 0xDFFF) {
1013 *p++ = ch;
1014 continue;
1015 }
1016
1017 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001018 if (q >= e) {
1019 errmsg = "unexpected end of data";
1020 goto utf16Error;
1021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 if (0xDC00 <= *q && *q <= 0xDFFF) {
1023 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001024 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025 /* This is valid data (a UTF-16 surrogate pair), but
1026 we are not able to store this information since our
1027 Py_UNICODE type only has 16 bits... this might
1028 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001029 errmsg = "code pairs are not supported";
1030 goto utf16Error;
1031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032 else
1033 continue;
1034 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001035 errmsg = "illegal encoding";
1036 /* Fall through to report the error */
1037
1038 utf16Error:
1039 if (utf16_decoding_error(&q, &p, errors, errmsg))
1040 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041 }
1042
1043 if (byteorder)
1044 *byteorder = bo;
1045
1046 /* Adjust length */
1047 if (_PyUnicode_Resize(unicode, p - unicode->str))
1048 goto onError;
1049
1050 return (PyObject *)unicode;
1051
1052onError:
1053 Py_DECREF(unicode);
1054 return NULL;
1055}
1056
1057#undef UTF16_ERROR
1058
1059PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1060 int size,
1061 const char *errors,
1062 int byteorder)
1063{
1064 PyObject *v;
1065 Py_UNICODE *p;
1066 char *q;
1067
1068 /* We don't create UTF-16 pairs... */
1069 v = PyString_FromStringAndSize(NULL,
1070 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1071 if (v == NULL)
1072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073
1074 q = PyString_AS_STRING(v);
1075 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 if (byteorder == 0)
1077 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001078 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001079 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 if (byteorder == 0 ||
1081#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1082 byteorder == -1
1083#else
1084 byteorder == 1
1085#endif
1086 )
1087 memcpy(p, s, size * sizeof(Py_UNICODE));
1088 else
1089 while (size-- > 0) {
1090 Py_UNICODE ch = *s++;
1091 *p++ = (ch >> 8) | (ch << 8);
1092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 return v;
1094}
1095
1096PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1097{
1098 if (!PyUnicode_Check(unicode)) {
1099 PyErr_BadArgument();
1100 return NULL;
1101 }
1102 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1103 PyUnicode_GET_SIZE(unicode),
1104 NULL,
1105 0);
1106}
1107
1108/* --- Unicode Escape Codec ----------------------------------------------- */
1109
1110static
1111int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001112 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 const char *errors,
1114 const char *details)
1115{
1116 if ((errors == NULL) ||
1117 (strcmp(errors,"strict") == 0)) {
1118 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001119 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 details);
1121 return -1;
1122 }
1123 else if (strcmp(errors,"ignore") == 0) {
1124 return 0;
1125 }
1126 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001127 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 return 0;
1129 }
1130 else {
1131 PyErr_Format(PyExc_ValueError,
1132 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001133 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 errors);
1135 return -1;
1136 }
1137}
1138
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001139static _Py_UCNHashAPI *pucnHash = NULL;
1140
/* Case-insensitive comparison of at most `count` characters of s1 and s2.

   Returns 0 if the compared characters are equal ignoring case (or if
   count is 0), otherwise the difference of the first pair of lowercased
   characters that differ.  The comparison stops at a NUL terminator, so
   neither string is read beyond its end. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    char c1, c2;

    if (count == 0)
        return 0;

    do {
        /* tolower() is only defined for unsigned char values and EOF,
           so plain (possibly negative) chars must be converted first. */
        c1 = (char)tolower((unsigned char)*s1++);
        c2 = (char)tolower((unsigned char)*s2++);
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
1160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1162 int size,
1163 const char *errors)
1164{
1165 PyUnicodeObject *v;
1166 Py_UNICODE *p = NULL, *buf = NULL;
1167 const char *end;
1168
1169 /* Escaped strings will always be longer than the resulting
1170 Unicode string, so we start with size here and then reduce the
1171 length after conversion to the true value. */
1172 v = _PyUnicode_New(size);
1173 if (v == NULL)
1174 goto onError;
1175 if (size == 0)
1176 return (PyObject *)v;
1177 p = buf = PyUnicode_AS_UNICODE(v);
1178 end = s + size;
1179 while (s < end) {
1180 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001181 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 int i;
1183
1184 /* Non-escape characters are interpreted as Unicode ordinals */
1185 if (*s != '\\') {
1186 *p++ = (unsigned char)*s++;
1187 continue;
1188 }
1189
1190 /* \ - Escapes */
1191 s++;
1192 switch (*s++) {
1193
1194 /* \x escapes */
1195 case '\n': break;
1196 case '\\': *p++ = '\\'; break;
1197 case '\'': *p++ = '\''; break;
1198 case '\"': *p++ = '\"'; break;
1199 case 'b': *p++ = '\b'; break;
1200 case 'f': *p++ = '\014'; break; /* FF */
1201 case 't': *p++ = '\t'; break;
1202 case 'n': *p++ = '\n'; break;
1203 case 'r': *p++ = '\r'; break;
1204 case 'v': *p++ = '\013'; break; /* VT */
1205 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1206
1207 /* \OOO (octal) escapes */
1208 case '0': case '1': case '2': case '3':
1209 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001210 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001212 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001214 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001216 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 break;
1218
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001219 /* \xXXXX escape with 1-n hex digits. for compatibility
1220 with 8-bit strings, this code ignores all but the last
1221 two digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222 case 'x':
1223 x = 0;
1224 c = (unsigned char)*s;
1225 if (isxdigit(c)) {
1226 do {
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001227 x = (x<<4) & 0xF0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 if ('0' <= c && c <= '9')
1229 x += c - '0';
1230 else if ('a' <= c && c <= 'f')
1231 x += 10 + c - 'a';
1232 else
1233 x += 10 + c - 'A';
1234 c = (unsigned char)*++s;
1235 } while (isxdigit(c));
Fredrik Lundh0e19e762000-07-16 18:47:43 +00001236 *p++ = (unsigned char) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237 } else {
1238 *p++ = '\\';
1239 *p++ = (unsigned char)s[-1];
1240 }
1241 break;
1242
1243 /* \uXXXX with 4 hex digits */
1244 case 'u':
1245 for (x = 0, i = 0; i < 4; i++) {
1246 c = (unsigned char)s[i];
1247 if (!isxdigit(c)) {
1248 if (unicodeescape_decoding_error(&s, &x, errors,
1249 "truncated \\uXXXX"))
1250 goto onError;
1251 i++;
1252 break;
1253 }
1254 x = (x<<4) & ~0xF;
1255 if (c >= '0' && c <= '9')
1256 x += c - '0';
1257 else if (c >= 'a' && c <= 'f')
1258 x += 10 + c - 'a';
1259 else
1260 x += 10 + c - 'A';
1261 }
1262 s += i;
1263 *p++ = x;
1264 break;
1265
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001266 case 'N':
1267 /* Ok, we need to deal with Unicode Character Names now,
1268 * make sure we've imported the hash table data...
1269 */
1270 if (pucnHash == NULL)
1271 {
1272 PyObject *mod = 0, *v = 0;
1273
1274 mod = PyImport_ImportModule("ucnhash");
1275 if (mod == NULL)
1276 goto onError;
1277 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1278 Py_DECREF(mod);
1279 if (v == NULL)
1280 {
1281 goto onError;
1282 }
1283 pucnHash = PyCObject_AsVoidPtr(v);
1284 Py_DECREF(v);
1285 if (pucnHash == NULL)
1286 {
1287 goto onError;
1288 }
1289 }
1290
1291 if (*s == '{')
1292 {
1293 const char *start = s + 1;
1294 const char *endBrace = start;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001295 Py_UCS4 value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001296 unsigned long j;
1297
1298 /* look for either the closing brace, or we
1299 * exceed the maximum length of the unicode character names
1300 */
1301 while (*endBrace != '}' &&
1302 (unsigned int)(endBrace - start) <=
1303 pucnHash->cchMax &&
1304 endBrace < end)
1305 {
1306 endBrace++;
1307 }
1308 if (endBrace != end && *endBrace == '}')
1309 {
1310 j = pucnHash->hash(start, endBrace - start);
1311 if (j > pucnHash->cKeys ||
1312 mystrnicmp(
1313 start,
1314 ((_Py_UnicodeCharacterName *)
1315 (pucnHash->getValue(j)))->pszUCN,
1316 (int)(endBrace - start)) != 0)
1317 {
1318 if (unicodeescape_decoding_error(
1319 &s, &x, errors,
1320 "Invalid Unicode Character Name"))
1321 {
1322 goto onError;
1323 }
1324 goto ucnFallthrough;
1325 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001326 value = ((_Py_UnicodeCharacterName *)
1327 (pucnHash->getValue(j)))->value;
1328 if (value < 1<<16)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001329 {
1330 /* In UCS-2 range, easy solution.. */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001331 *p++ = value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001332 }
1333 else
1334 {
1335 /* Oops, its in UCS-4 space, */
1336 /* compute and append the two surrogates: */
1337 /* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001338 value -= 0x10000;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001339
1340 /* high surrogate = top 10 bits added to D800 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001341 *p++ = 0xD800 + (value >> 10);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001342
1343 /* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001344 *p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001345 }
1346 s = endBrace + 1;
1347 }
1348 else
1349 {
1350 if (unicodeescape_decoding_error(
1351 &s, &x, errors,
1352 "Unicode name missing closing brace"))
1353 goto onError;
1354 goto ucnFallthrough;
1355 }
1356 break;
1357 }
1358 if (unicodeescape_decoding_error(
1359 &s, &x, errors,
1360 "Missing opening brace for Unicode Character Name escape"))
1361 goto onError;
1362ucnFallthrough:
1363 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001364 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 *p++ = '\\';
1366 *p++ = (unsigned char)s[-1];
1367 break;
1368 }
1369 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001370 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001371 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 return (PyObject *)v;
1373
1374 onError:
1375 Py_XDECREF(v);
1376 return NULL;
1377}
1378
1379/* Return a Unicode-Escape string version of the Unicode object.
1380
1381 If quotes is true, the string is enclosed in u"" or u'' quotes as
1382 appropriate.
1383
1384*/
1385
Barry Warsaw51ac5802000-03-20 16:36:48 +00001386static const Py_UNICODE *findchar(const Py_UNICODE *s,
1387 int size,
1388 Py_UNICODE ch);
1389
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390static
1391PyObject *unicodeescape_string(const Py_UNICODE *s,
1392 int size,
1393 int quotes)
1394{
1395 PyObject *repr;
1396 char *p;
1397 char *q;
1398
1399 static const char *hexdigit = "0123456789ABCDEF";
1400
1401 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1402 if (repr == NULL)
1403 return NULL;
1404
1405 p = q = PyString_AS_STRING(repr);
1406
1407 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408 *p++ = 'u';
1409 *p++ = (findchar(s, size, '\'') &&
1410 !findchar(s, size, '"')) ? '"' : '\'';
1411 }
1412 while (size-- > 0) {
1413 Py_UNICODE ch = *s++;
1414 /* Escape quotes */
1415 if (quotes && (ch == q[1] || ch == '\\')) {
1416 *p++ = '\\';
1417 *p++ = (char) ch;
1418 }
1419 /* Map 16-bit characters to '\uxxxx' */
1420 else if (ch >= 256) {
1421 *p++ = '\\';
1422 *p++ = 'u';
1423 *p++ = hexdigit[(ch >> 12) & 0xf];
1424 *p++ = hexdigit[(ch >> 8) & 0xf];
1425 *p++ = hexdigit[(ch >> 4) & 0xf];
1426 *p++ = hexdigit[ch & 15];
1427 }
1428 /* Map non-printable US ASCII to '\ooo' */
1429 else if (ch < ' ' || ch >= 128) {
1430 *p++ = '\\';
1431 *p++ = hexdigit[(ch >> 6) & 7];
1432 *p++ = hexdigit[(ch >> 3) & 7];
1433 *p++ = hexdigit[ch & 7];
1434 }
1435 /* Copy everything else as-is */
1436 else
1437 *p++ = (char) ch;
1438 }
1439 if (quotes)
1440 *p++ = q[1];
1441
1442 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001443 if (_PyString_Resize(&repr, p - q))
1444 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445
1446 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001447
1448 onError:
1449 Py_DECREF(repr);
1450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451}
1452
1453PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1454 int size)
1455{
1456 return unicodeescape_string(s, size, 0);
1457}
1458
1459PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1460{
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 return NULL;
1464 }
1465 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1466 PyUnicode_GET_SIZE(unicode));
1467}
1468
1469/* --- Raw Unicode Escape Codec ------------------------------------------- */
1470
1471PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1472 int size,
1473 const char *errors)
1474{
1475 PyUnicodeObject *v;
1476 Py_UNICODE *p, *buf;
1477 const char *end;
1478 const char *bs;
1479
1480 /* Escaped strings will always be longer than the resulting
1481 Unicode string, so we start with size here and then reduce the
1482 length after conversion to the true value. */
1483 v = _PyUnicode_New(size);
1484 if (v == NULL)
1485 goto onError;
1486 if (size == 0)
1487 return (PyObject *)v;
1488 p = buf = PyUnicode_AS_UNICODE(v);
1489 end = s + size;
1490 while (s < end) {
1491 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001492 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 int i;
1494
1495 /* Non-escape characters are interpreted as Unicode ordinals */
1496 if (*s != '\\') {
1497 *p++ = (unsigned char)*s++;
1498 continue;
1499 }
1500
1501 /* \u-escapes are only interpreted iff the number of leading
1502 backslashes if odd */
1503 bs = s;
1504 for (;s < end;) {
1505 if (*s != '\\')
1506 break;
1507 *p++ = (unsigned char)*s++;
1508 }
1509 if (((s - bs) & 1) == 0 ||
1510 s >= end ||
1511 *s != 'u') {
1512 continue;
1513 }
1514 p--;
1515 s++;
1516
1517 /* \uXXXX with 4 hex digits */
1518 for (x = 0, i = 0; i < 4; i++) {
1519 c = (unsigned char)s[i];
1520 if (!isxdigit(c)) {
1521 if (unicodeescape_decoding_error(&s, &x, errors,
1522 "truncated \\uXXXX"))
1523 goto onError;
1524 i++;
1525 break;
1526 }
1527 x = (x<<4) & ~0xF;
1528 if (c >= '0' && c <= '9')
1529 x += c - '0';
1530 else if (c >= 'a' && c <= 'f')
1531 x += 10 + c - 'a';
1532 else
1533 x += 10 + c - 'A';
1534 }
1535 s += i;
1536 *p++ = x;
1537 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001538 if (_PyUnicode_Resize(v, (int)(p - buf)))
1539 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540 return (PyObject *)v;
1541
1542 onError:
1543 Py_XDECREF(v);
1544 return NULL;
1545}
1546
1547PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1548 int size)
1549{
1550 PyObject *repr;
1551 char *p;
1552 char *q;
1553
1554 static const char *hexdigit = "0123456789ABCDEF";
1555
1556 repr = PyString_FromStringAndSize(NULL, 6 * size);
1557 if (repr == NULL)
1558 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001559 if (size == 0)
1560 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561
1562 p = q = PyString_AS_STRING(repr);
1563 while (size-- > 0) {
1564 Py_UNICODE ch = *s++;
1565 /* Map 16-bit characters to '\uxxxx' */
1566 if (ch >= 256) {
1567 *p++ = '\\';
1568 *p++ = 'u';
1569 *p++ = hexdigit[(ch >> 12) & 0xf];
1570 *p++ = hexdigit[(ch >> 8) & 0xf];
1571 *p++ = hexdigit[(ch >> 4) & 0xf];
1572 *p++ = hexdigit[ch & 15];
1573 }
1574 /* Copy everything else as-is */
1575 else
1576 *p++ = (char) ch;
1577 }
1578 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001579 if (_PyString_Resize(&repr, p - q))
1580 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581
1582 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001583
1584 onError:
1585 Py_DECREF(repr);
1586 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587}
1588
1589PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1590{
1591 if (!PyUnicode_Check(unicode)) {
1592 PyErr_BadArgument();
1593 return NULL;
1594 }
1595 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1596 PyUnicode_GET_SIZE(unicode));
1597}
1598
1599/* --- Latin-1 Codec ------------------------------------------------------ */
1600
1601PyObject *PyUnicode_DecodeLatin1(const char *s,
1602 int size,
1603 const char *errors)
1604{
1605 PyUnicodeObject *v;
1606 Py_UNICODE *p;
1607
1608 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1609 v = _PyUnicode_New(size);
1610 if (v == NULL)
1611 goto onError;
1612 if (size == 0)
1613 return (PyObject *)v;
1614 p = PyUnicode_AS_UNICODE(v);
1615 while (size-- > 0)
1616 *p++ = (unsigned char)*s++;
1617 return (PyObject *)v;
1618
1619 onError:
1620 Py_XDECREF(v);
1621 return NULL;
1622}
1623
1624static
1625int latin1_encoding_error(const Py_UNICODE **source,
1626 char **dest,
1627 const char *errors,
1628 const char *details)
1629{
1630 if ((errors == NULL) ||
1631 (strcmp(errors,"strict") == 0)) {
1632 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001633 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001634 details);
1635 return -1;
1636 }
1637 else if (strcmp(errors,"ignore") == 0) {
1638 return 0;
1639 }
1640 else if (strcmp(errors,"replace") == 0) {
1641 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001642 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 return 0;
1644 }
1645 else {
1646 PyErr_Format(PyExc_ValueError,
1647 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001648 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 errors);
1650 return -1;
1651 }
1652}
1653
1654PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1655 int size,
1656 const char *errors)
1657{
1658 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001659 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 repr = PyString_FromStringAndSize(NULL, size);
1662 if (repr == NULL)
1663 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001664 if (size == 0)
1665 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666
1667 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001668 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 while (size-- > 0) {
1670 Py_UNICODE ch = *p++;
1671 if (ch >= 256) {
1672 if (latin1_encoding_error(&p, &s, errors,
1673 "ordinal not in range(256)"))
1674 goto onError;
1675 }
1676 else
1677 *s++ = (char)ch;
1678 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001679 /* Resize if error handling skipped some characters */
1680 if (s - start < PyString_GET_SIZE(repr))
1681 if (_PyString_Resize(&repr, s - start))
1682 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 return repr;
1684
1685 onError:
1686 Py_DECREF(repr);
1687 return NULL;
1688}
1689
1690PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1691{
1692 if (!PyUnicode_Check(unicode)) {
1693 PyErr_BadArgument();
1694 return NULL;
1695 }
1696 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1697 PyUnicode_GET_SIZE(unicode),
1698 NULL);
1699}
1700
1701/* --- 7-bit ASCII Codec -------------------------------------------------- */
1702
1703static
1704int ascii_decoding_error(const char **source,
1705 Py_UNICODE **dest,
1706 const char *errors,
1707 const char *details)
1708{
1709 if ((errors == NULL) ||
1710 (strcmp(errors,"strict") == 0)) {
1711 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001712 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 details);
1714 return -1;
1715 }
1716 else if (strcmp(errors,"ignore") == 0) {
1717 return 0;
1718 }
1719 else if (strcmp(errors,"replace") == 0) {
1720 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1721 (*dest)++;
1722 return 0;
1723 }
1724 else {
1725 PyErr_Format(PyExc_ValueError,
1726 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001727 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 errors);
1729 return -1;
1730 }
1731}
1732
1733PyObject *PyUnicode_DecodeASCII(const char *s,
1734 int size,
1735 const char *errors)
1736{
1737 PyUnicodeObject *v;
1738 Py_UNICODE *p;
1739
1740 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1741 v = _PyUnicode_New(size);
1742 if (v == NULL)
1743 goto onError;
1744 if (size == 0)
1745 return (PyObject *)v;
1746 p = PyUnicode_AS_UNICODE(v);
1747 while (size-- > 0) {
1748 register unsigned char c;
1749
1750 c = (unsigned char)*s++;
1751 if (c < 128)
1752 *p++ = c;
1753 else if (ascii_decoding_error(&s, &p, errors,
1754 "ordinal not in range(128)"))
1755 goto onError;
1756 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001757 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1758 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1759 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760 return (PyObject *)v;
1761
1762 onError:
1763 Py_XDECREF(v);
1764 return NULL;
1765}
1766
1767static
1768int ascii_encoding_error(const Py_UNICODE **source,
1769 char **dest,
1770 const char *errors,
1771 const char *details)
1772{
1773 if ((errors == NULL) ||
1774 (strcmp(errors,"strict") == 0)) {
1775 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001776 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 details);
1778 return -1;
1779 }
1780 else if (strcmp(errors,"ignore") == 0) {
1781 return 0;
1782 }
1783 else if (strcmp(errors,"replace") == 0) {
1784 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001785 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 return 0;
1787 }
1788 else {
1789 PyErr_Format(PyExc_ValueError,
1790 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001791 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 errors);
1793 return -1;
1794 }
1795}
1796
1797PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1798 int size,
1799 const char *errors)
1800{
1801 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001802 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001803
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804 repr = PyString_FromStringAndSize(NULL, size);
1805 if (repr == NULL)
1806 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001807 if (size == 0)
1808 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809
1810 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001811 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 while (size-- > 0) {
1813 Py_UNICODE ch = *p++;
1814 if (ch >= 128) {
1815 if (ascii_encoding_error(&p, &s, errors,
1816 "ordinal not in range(128)"))
1817 goto onError;
1818 }
1819 else
1820 *s++ = (char)ch;
1821 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001822 /* Resize if error handling skipped some characters */
1823 if (s - start < PyString_GET_SIZE(repr))
1824 if (_PyString_Resize(&repr, s - start))
1825 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 return repr;
1827
1828 onError:
1829 Py_DECREF(repr);
1830 return NULL;
1831}
1832
1833PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1834{
1835 if (!PyUnicode_Check(unicode)) {
1836 PyErr_BadArgument();
1837 return NULL;
1838 }
1839 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1840 PyUnicode_GET_SIZE(unicode),
1841 NULL);
1842}
1843
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001844#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001845
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001846/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001847
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001848PyObject *PyUnicode_DecodeMBCS(const char *s,
1849 int size,
1850 const char *errors)
1851{
1852 PyUnicodeObject *v;
1853 Py_UNICODE *p;
1854
1855 /* First get the size of the result */
1856 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001857 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001858 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1859
1860 v = _PyUnicode_New(usize);
1861 if (v == NULL)
1862 return NULL;
1863 if (usize == 0)
1864 return (PyObject *)v;
1865 p = PyUnicode_AS_UNICODE(v);
1866 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1867 Py_DECREF(v);
1868 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1869 }
1870
1871 return (PyObject *)v;
1872}
1873
1874PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1875 int size,
1876 const char *errors)
1877{
1878 PyObject *repr;
1879 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001880 DWORD mbcssize;
1881
1882 /* If there are no characters, bail now! */
1883 if (size==0)
1884 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001885
1886 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001887 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888 if (mbcssize==0)
1889 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1890
1891 repr = PyString_FromStringAndSize(NULL, mbcssize);
1892 if (repr == NULL)
1893 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001894 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001895 return repr;
1896
1897 /* Do the conversion */
1898 s = PyString_AS_STRING(repr);
1899 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1900 Py_DECREF(repr);
1901 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1902 }
1903 return repr;
1904}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001905
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001906#endif /* MS_WIN32 */
1907
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908/* --- Character Mapping Codec -------------------------------------------- */
1909
1910static
1911int charmap_decoding_error(const char **source,
1912 Py_UNICODE **dest,
1913 const char *errors,
1914 const char *details)
1915{
1916 if ((errors == NULL) ||
1917 (strcmp(errors,"strict") == 0)) {
1918 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001919 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 details);
1921 return -1;
1922 }
1923 else if (strcmp(errors,"ignore") == 0) {
1924 return 0;
1925 }
1926 else if (strcmp(errors,"replace") == 0) {
1927 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1928 (*dest)++;
1929 return 0;
1930 }
1931 else {
1932 PyErr_Format(PyExc_ValueError,
1933 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001934 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 errors);
1936 return -1;
1937 }
1938}
1939
1940PyObject *PyUnicode_DecodeCharmap(const char *s,
1941 int size,
1942 PyObject *mapping,
1943 const char *errors)
1944{
1945 PyUnicodeObject *v;
1946 Py_UNICODE *p;
1947
1948 /* Default to Latin-1 */
1949 if (mapping == NULL)
1950 return PyUnicode_DecodeLatin1(s, size, errors);
1951
1952 v = _PyUnicode_New(size);
1953 if (v == NULL)
1954 goto onError;
1955 if (size == 0)
1956 return (PyObject *)v;
1957 p = PyUnicode_AS_UNICODE(v);
1958 while (size-- > 0) {
1959 unsigned char ch = *s++;
1960 PyObject *w, *x;
1961
1962 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1963 w = PyInt_FromLong((long)ch);
1964 if (w == NULL)
1965 goto onError;
1966 x = PyObject_GetItem(mapping, w);
1967 Py_DECREF(w);
1968 if (x == NULL) {
1969 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1970 /* No mapping found: default to Latin-1 mapping */
1971 PyErr_Clear();
1972 *p++ = (Py_UNICODE)ch;
1973 continue;
1974 }
1975 goto onError;
1976 }
1977
1978 /* Apply mapping */
1979 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001980 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 if (value < 0 || value > 65535) {
1982 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001983 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 Py_DECREF(x);
1985 goto onError;
1986 }
1987 *p++ = (Py_UNICODE)value;
1988 }
1989 else if (x == Py_None) {
1990 /* undefined mapping */
1991 if (charmap_decoding_error(&s, &p, errors,
1992 "character maps to <undefined>")) {
1993 Py_DECREF(x);
1994 goto onError;
1995 }
1996 }
1997 else if (PyUnicode_Check(x)) {
1998 if (PyUnicode_GET_SIZE(x) != 1) {
1999 /* 1-n mapping */
2000 PyErr_SetString(PyExc_NotImplementedError,
2001 "1-n mappings are currently not implemented");
2002 Py_DECREF(x);
2003 goto onError;
2004 }
2005 *p++ = *PyUnicode_AS_UNICODE(x);
2006 }
2007 else {
2008 /* wrong return value */
2009 PyErr_SetString(PyExc_TypeError,
2010 "character mapping must return integer, None or unicode");
2011 Py_DECREF(x);
2012 goto onError;
2013 }
2014 Py_DECREF(x);
2015 }
2016 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2017 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2018 goto onError;
2019 return (PyObject *)v;
2020
2021 onError:
2022 Py_XDECREF(v);
2023 return NULL;
2024}
2025
2026static
2027int charmap_encoding_error(const Py_UNICODE **source,
2028 char **dest,
2029 const char *errors,
2030 const char *details)
2031{
2032 if ((errors == NULL) ||
2033 (strcmp(errors,"strict") == 0)) {
2034 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002035 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 details);
2037 return -1;
2038 }
2039 else if (strcmp(errors,"ignore") == 0) {
2040 return 0;
2041 }
2042 else if (strcmp(errors,"replace") == 0) {
2043 **dest = '?';
2044 (*dest)++;
2045 return 0;
2046 }
2047 else {
2048 PyErr_Format(PyExc_ValueError,
2049 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002050 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 errors);
2052 return -1;
2053 }
2054}
2055
2056PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2057 int size,
2058 PyObject *mapping,
2059 const char *errors)
2060{
2061 PyObject *v;
2062 char *s;
2063
2064 /* Default to Latin-1 */
2065 if (mapping == NULL)
2066 return PyUnicode_EncodeLatin1(p, size, errors);
2067
2068 v = PyString_FromStringAndSize(NULL, size);
2069 if (v == NULL)
2070 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002071 if (size == 0)
2072 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 s = PyString_AS_STRING(v);
2074 while (size-- > 0) {
2075 Py_UNICODE ch = *p++;
2076 PyObject *w, *x;
2077
2078 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2079 w = PyInt_FromLong((long)ch);
2080 if (w == NULL)
2081 goto onError;
2082 x = PyObject_GetItem(mapping, w);
2083 Py_DECREF(w);
2084 if (x == NULL) {
2085 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2086 /* No mapping found: default to Latin-1 mapping if possible */
2087 PyErr_Clear();
2088 if (ch < 256) {
2089 *s++ = (char)ch;
2090 continue;
2091 }
2092 else if (!charmap_encoding_error(&p, &s, errors,
2093 "missing character mapping"))
2094 continue;
2095 }
2096 goto onError;
2097 }
2098
2099 /* Apply mapping */
2100 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002101 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 if (value < 0 || value > 255) {
2103 PyErr_SetString(PyExc_TypeError,
2104 "character mapping must be in range(256)");
2105 Py_DECREF(x);
2106 goto onError;
2107 }
2108 *s++ = (char)value;
2109 }
2110 else if (x == Py_None) {
2111 /* undefined mapping */
2112 if (charmap_encoding_error(&p, &s, errors,
2113 "character maps to <undefined>")) {
2114 Py_DECREF(x);
2115 goto onError;
2116 }
2117 }
2118 else if (PyString_Check(x)) {
2119 if (PyString_GET_SIZE(x) != 1) {
2120 /* 1-n mapping */
2121 PyErr_SetString(PyExc_NotImplementedError,
2122 "1-n mappings are currently not implemented");
2123 Py_DECREF(x);
2124 goto onError;
2125 }
2126 *s++ = *PyString_AS_STRING(x);
2127 }
2128 else {
2129 /* wrong return value */
2130 PyErr_SetString(PyExc_TypeError,
2131 "character mapping must return integer, None or unicode");
2132 Py_DECREF(x);
2133 goto onError;
2134 }
2135 Py_DECREF(x);
2136 }
2137 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2138 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2139 goto onError;
2140 return v;
2141
2142 onError:
2143 Py_DECREF(v);
2144 return NULL;
2145}
2146
2147PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2148 PyObject *mapping)
2149{
2150 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2151 PyErr_BadArgument();
2152 return NULL;
2153 }
2154 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2155 PyUnicode_GET_SIZE(unicode),
2156 mapping,
2157 NULL);
2158}
2159
2160static
2161int translate_error(const Py_UNICODE **source,
2162 Py_UNICODE **dest,
2163 const char *errors,
2164 const char *details)
2165{
2166 if ((errors == NULL) ||
2167 (strcmp(errors,"strict") == 0)) {
2168 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002169 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 details);
2171 return -1;
2172 }
2173 else if (strcmp(errors,"ignore") == 0) {
2174 return 0;
2175 }
2176 else if (strcmp(errors,"replace") == 0) {
2177 **dest = '?';
2178 (*dest)++;
2179 return 0;
2180 }
2181 else {
2182 PyErr_Format(PyExc_ValueError,
2183 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002184 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185 errors);
2186 return -1;
2187 }
2188}
2189
2190PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2191 int size,
2192 PyObject *mapping,
2193 const char *errors)
2194{
2195 PyUnicodeObject *v;
2196 Py_UNICODE *p;
2197
2198 if (mapping == NULL) {
2199 PyErr_BadArgument();
2200 return NULL;
2201 }
2202
2203 /* Output will never be longer than input */
2204 v = _PyUnicode_New(size);
2205 if (v == NULL)
2206 goto onError;
2207 if (size == 0)
2208 goto done;
2209 p = PyUnicode_AS_UNICODE(v);
2210 while (size-- > 0) {
2211 Py_UNICODE ch = *s++;
2212 PyObject *w, *x;
2213
2214 /* Get mapping */
2215 w = PyInt_FromLong(ch);
2216 if (w == NULL)
2217 goto onError;
2218 x = PyObject_GetItem(mapping, w);
2219 Py_DECREF(w);
2220 if (x == NULL) {
2221 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2222 /* No mapping found: default to 1-1 mapping */
2223 PyErr_Clear();
2224 *p++ = ch;
2225 continue;
2226 }
2227 goto onError;
2228 }
2229
2230 /* Apply mapping */
2231 if (PyInt_Check(x))
2232 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2233 else if (x == Py_None) {
2234 /* undefined mapping */
2235 if (translate_error(&s, &p, errors,
2236 "character maps to <undefined>")) {
2237 Py_DECREF(x);
2238 goto onError;
2239 }
2240 }
2241 else if (PyUnicode_Check(x)) {
2242 if (PyUnicode_GET_SIZE(x) != 1) {
2243 /* 1-n mapping */
2244 PyErr_SetString(PyExc_NotImplementedError,
2245 "1-n mappings are currently not implemented");
2246 Py_DECREF(x);
2247 goto onError;
2248 }
2249 *p++ = *PyUnicode_AS_UNICODE(x);
2250 }
2251 else {
2252 /* wrong return value */
2253 PyErr_SetString(PyExc_TypeError,
2254 "translate mapping must return integer, None or unicode");
2255 Py_DECREF(x);
2256 goto onError;
2257 }
2258 Py_DECREF(x);
2259 }
2260 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002261 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263
2264 done:
2265 return (PyObject *)v;
2266
2267 onError:
2268 Py_XDECREF(v);
2269 return NULL;
2270}
2271
2272PyObject *PyUnicode_Translate(PyObject *str,
2273 PyObject *mapping,
2274 const char *errors)
2275{
2276 PyObject *result;
2277
2278 str = PyUnicode_FromObject(str);
2279 if (str == NULL)
2280 goto onError;
2281 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2282 PyUnicode_GET_SIZE(str),
2283 mapping,
2284 errors);
2285 Py_DECREF(str);
2286 return result;
2287
2288 onError:
2289 Py_XDECREF(str);
2290 return NULL;
2291}
2292
/* --- Decimal Encoder ---------------------------------------------------- */
2294
2295int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2296 int length,
2297 char *output,
2298 const char *errors)
2299{
2300 Py_UNICODE *p, *end;
2301
2302 if (output == NULL) {
2303 PyErr_BadArgument();
2304 return -1;
2305 }
2306
2307 p = s;
2308 end = s + length;
2309 while (p < end) {
2310 register Py_UNICODE ch = *p++;
2311 int decimal;
2312
2313 if (Py_UNICODE_ISSPACE(ch)) {
2314 *output++ = ' ';
2315 continue;
2316 }
2317 decimal = Py_UNICODE_TODECIMAL(ch);
2318 if (decimal >= 0) {
2319 *output++ = '0' + decimal;
2320 continue;
2321 }
Guido van Rossumba477042000-04-06 18:18:10 +00002322 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002323 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002324 continue;
2325 }
2326 /* All other characters are considered invalid */
2327 if (errors == NULL || strcmp(errors, "strict") == 0) {
2328 PyErr_SetString(PyExc_ValueError,
2329 "invalid decimal Unicode string");
2330 goto onError;
2331 }
2332 else if (strcmp(errors, "ignore") == 0)
2333 continue;
2334 else if (strcmp(errors, "replace") == 0) {
2335 *output++ = '?';
2336 continue;
2337 }
2338 }
2339 /* 0-terminate the output string */
2340 *output++ = '\0';
2341 return 0;
2342
2343 onError:
2344 return -1;
2345}
2346
/* --- Helpers ------------------------------------------------------------ */
2348
2349static
2350int count(PyUnicodeObject *self,
2351 int start,
2352 int end,
2353 PyUnicodeObject *substring)
2354{
2355 int count = 0;
2356
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002357 if (substring->length == 0)
2358 return (end - start + 1);
2359
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 end -= substring->length;
2361
2362 while (start <= end)
2363 if (Py_UNICODE_MATCH(self, start, substring)) {
2364 count++;
2365 start += substring->length;
2366 } else
2367 start++;
2368
2369 return count;
2370}
2371
2372int PyUnicode_Count(PyObject *str,
2373 PyObject *substr,
2374 int start,
2375 int end)
2376{
2377 int result;
2378
2379 str = PyUnicode_FromObject(str);
2380 if (str == NULL)
2381 return -1;
2382 substr = PyUnicode_FromObject(substr);
2383 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002384 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385 return -1;
2386 }
2387
2388 result = count((PyUnicodeObject *)str,
2389 start, end,
2390 (PyUnicodeObject *)substr);
2391
2392 Py_DECREF(str);
2393 Py_DECREF(substr);
2394 return result;
2395}
2396
2397static
2398int findstring(PyUnicodeObject *self,
2399 PyUnicodeObject *substring,
2400 int start,
2401 int end,
2402 int direction)
2403{
2404 if (start < 0)
2405 start += self->length;
2406 if (start < 0)
2407 start = 0;
2408
2409 if (substring->length == 0)
2410 return start;
2411
2412 if (end > self->length)
2413 end = self->length;
2414 if (end < 0)
2415 end += self->length;
2416 if (end < 0)
2417 end = 0;
2418
2419 end -= substring->length;
2420
2421 if (direction < 0) {
2422 for (; end >= start; end--)
2423 if (Py_UNICODE_MATCH(self, end, substring))
2424 return end;
2425 } else {
2426 for (; start <= end; start++)
2427 if (Py_UNICODE_MATCH(self, start, substring))
2428 return start;
2429 }
2430
2431 return -1;
2432}
2433
2434int PyUnicode_Find(PyObject *str,
2435 PyObject *substr,
2436 int start,
2437 int end,
2438 int direction)
2439{
2440 int result;
2441
2442 str = PyUnicode_FromObject(str);
2443 if (str == NULL)
2444 return -1;
2445 substr = PyUnicode_FromObject(substr);
2446 if (substr == NULL) {
2447 Py_DECREF(substr);
2448 return -1;
2449 }
2450
2451 result = findstring((PyUnicodeObject *)str,
2452 (PyUnicodeObject *)substr,
2453 start, end, direction);
2454 Py_DECREF(str);
2455 Py_DECREF(substr);
2456 return result;
2457}
2458
2459static
2460int tailmatch(PyUnicodeObject *self,
2461 PyUnicodeObject *substring,
2462 int start,
2463 int end,
2464 int direction)
2465{
2466 if (start < 0)
2467 start += self->length;
2468 if (start < 0)
2469 start = 0;
2470
2471 if (substring->length == 0)
2472 return 1;
2473
2474 if (end > self->length)
2475 end = self->length;
2476 if (end < 0)
2477 end += self->length;
2478 if (end < 0)
2479 end = 0;
2480
2481 end -= substring->length;
2482 if (end < start)
2483 return 0;
2484
2485 if (direction > 0) {
2486 if (Py_UNICODE_MATCH(self, end, substring))
2487 return 1;
2488 } else {
2489 if (Py_UNICODE_MATCH(self, start, substring))
2490 return 1;
2491 }
2492
2493 return 0;
2494}
2495
2496int PyUnicode_Tailmatch(PyObject *str,
2497 PyObject *substr,
2498 int start,
2499 int end,
2500 int direction)
2501{
2502 int result;
2503
2504 str = PyUnicode_FromObject(str);
2505 if (str == NULL)
2506 return -1;
2507 substr = PyUnicode_FromObject(substr);
2508 if (substr == NULL) {
2509 Py_DECREF(substr);
2510 return -1;
2511 }
2512
2513 result = tailmatch((PyUnicodeObject *)str,
2514 (PyUnicodeObject *)substr,
2515 start, end, direction);
2516 Py_DECREF(str);
2517 Py_DECREF(substr);
2518 return result;
2519}
2520
2521static
2522const Py_UNICODE *findchar(const Py_UNICODE *s,
2523 int size,
2524 Py_UNICODE ch)
2525{
2526 /* like wcschr, but doesn't stop at NULL characters */
2527
2528 while (size-- > 0) {
2529 if (*s == ch)
2530 return s;
2531 s++;
2532 }
2533
2534 return NULL;
2535}
2536
2537/* Apply fixfct filter to the Unicode object self and return a
2538 reference to the modified object */
2539
2540static
2541PyObject *fixup(PyUnicodeObject *self,
2542 int (*fixfct)(PyUnicodeObject *s))
2543{
2544
2545 PyUnicodeObject *u;
2546
2547 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2548 self->length);
2549 if (u == NULL)
2550 return NULL;
2551 if (!fixfct(u)) {
2552 /* fixfct should return TRUE if it modified the buffer. If
2553 FALSE, return a reference to the original buffer instead
2554 (to save space, not time) */
2555 Py_INCREF(self);
2556 Py_DECREF(u);
2557 return (PyObject*) self;
2558 }
2559 return (PyObject*) u;
2560}
2561
2562static
2563int fixupper(PyUnicodeObject *self)
2564{
2565 int len = self->length;
2566 Py_UNICODE *s = self->str;
2567 int status = 0;
2568
2569 while (len-- > 0) {
2570 register Py_UNICODE ch;
2571
2572 ch = Py_UNICODE_TOUPPER(*s);
2573 if (ch != *s) {
2574 status = 1;
2575 *s = ch;
2576 }
2577 s++;
2578 }
2579
2580 return status;
2581}
2582
2583static
2584int fixlower(PyUnicodeObject *self)
2585{
2586 int len = self->length;
2587 Py_UNICODE *s = self->str;
2588 int status = 0;
2589
2590 while (len-- > 0) {
2591 register Py_UNICODE ch;
2592
2593 ch = Py_UNICODE_TOLOWER(*s);
2594 if (ch != *s) {
2595 status = 1;
2596 *s = ch;
2597 }
2598 s++;
2599 }
2600
2601 return status;
2602}
2603
2604static
2605int fixswapcase(PyUnicodeObject *self)
2606{
2607 int len = self->length;
2608 Py_UNICODE *s = self->str;
2609 int status = 0;
2610
2611 while (len-- > 0) {
2612 if (Py_UNICODE_ISUPPER(*s)) {
2613 *s = Py_UNICODE_TOLOWER(*s);
2614 status = 1;
2615 } else if (Py_UNICODE_ISLOWER(*s)) {
2616 *s = Py_UNICODE_TOUPPER(*s);
2617 status = 1;
2618 }
2619 s++;
2620 }
2621
2622 return status;
2623}
2624
2625static
2626int fixcapitalize(PyUnicodeObject *self)
2627{
2628 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2629 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2630 return 1;
2631 }
2632 return 0;
2633}
2634
2635static
2636int fixtitle(PyUnicodeObject *self)
2637{
2638 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2639 register Py_UNICODE *e;
2640 int previous_is_cased;
2641
2642 /* Shortcut for single character strings */
2643 if (PyUnicode_GET_SIZE(self) == 1) {
2644 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2645 if (*p != ch) {
2646 *p = ch;
2647 return 1;
2648 }
2649 else
2650 return 0;
2651 }
2652
2653 e = p + PyUnicode_GET_SIZE(self);
2654 previous_is_cased = 0;
2655 for (; p < e; p++) {
2656 register const Py_UNICODE ch = *p;
2657
2658 if (previous_is_cased)
2659 *p = Py_UNICODE_TOLOWER(ch);
2660 else
2661 *p = Py_UNICODE_TOTITLE(ch);
2662
2663 if (Py_UNICODE_ISLOWER(ch) ||
2664 Py_UNICODE_ISUPPER(ch) ||
2665 Py_UNICODE_ISTITLE(ch))
2666 previous_is_cased = 1;
2667 else
2668 previous_is_cased = 0;
2669 }
2670 return 1;
2671}
2672
2673PyObject *PyUnicode_Join(PyObject *separator,
2674 PyObject *seq)
2675{
2676 Py_UNICODE *sep;
2677 int seplen;
2678 PyUnicodeObject *res = NULL;
2679 int reslen = 0;
2680 Py_UNICODE *p;
2681 int seqlen = 0;
2682 int sz = 100;
2683 int i;
2684
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002685 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 if (seqlen < 0 && PyErr_Occurred())
2687 return NULL;
2688
2689 if (separator == NULL) {
2690 Py_UNICODE blank = ' ';
2691 sep = &blank;
2692 seplen = 1;
2693 }
2694 else {
2695 separator = PyUnicode_FromObject(separator);
2696 if (separator == NULL)
2697 return NULL;
2698 sep = PyUnicode_AS_UNICODE(separator);
2699 seplen = PyUnicode_GET_SIZE(separator);
2700 }
2701
2702 res = _PyUnicode_New(sz);
2703 if (res == NULL)
2704 goto onError;
2705 p = PyUnicode_AS_UNICODE(res);
2706 reslen = 0;
2707
2708 for (i = 0; i < seqlen; i++) {
2709 int itemlen;
2710 PyObject *item;
2711
2712 item = PySequence_GetItem(seq, i);
2713 if (item == NULL)
2714 goto onError;
2715 if (!PyUnicode_Check(item)) {
2716 PyObject *v;
2717 v = PyUnicode_FromObject(item);
2718 Py_DECREF(item);
2719 item = v;
2720 if (item == NULL)
2721 goto onError;
2722 }
2723 itemlen = PyUnicode_GET_SIZE(item);
2724 while (reslen + itemlen + seplen >= sz) {
2725 if (_PyUnicode_Resize(res, sz*2))
2726 goto onError;
2727 sz *= 2;
2728 p = PyUnicode_AS_UNICODE(res) + reslen;
2729 }
2730 if (i > 0) {
2731 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2732 p += seplen;
2733 reslen += seplen;
2734 }
2735 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2736 p += itemlen;
2737 reslen += itemlen;
2738 Py_DECREF(item);
2739 }
2740 if (_PyUnicode_Resize(res, reslen))
2741 goto onError;
2742
2743 Py_XDECREF(separator);
2744 return (PyObject *)res;
2745
2746 onError:
2747 Py_XDECREF(separator);
2748 Py_DECREF(res);
2749 return NULL;
2750}
2751
2752static
2753PyUnicodeObject *pad(PyUnicodeObject *self,
2754 int left,
2755 int right,
2756 Py_UNICODE fill)
2757{
2758 PyUnicodeObject *u;
2759
2760 if (left < 0)
2761 left = 0;
2762 if (right < 0)
2763 right = 0;
2764
2765 if (left == 0 && right == 0) {
2766 Py_INCREF(self);
2767 return self;
2768 }
2769
2770 u = _PyUnicode_New(left + self->length + right);
2771 if (u) {
2772 if (left)
2773 Py_UNICODE_FILL(u->str, fill, left);
2774 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2775 if (right)
2776 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2777 }
2778
2779 return u;
2780}
2781
/* Append the slice data[left:right] to list as a new Unicode object.

   The macro relies on names supplied by the enclosing function: a
   PyObject *str scratch variable, the PyObject *list being filled and
   an onError label, which is jumped to when the slice cannot be created
   or appended.  It expands to a single statement; write it with a
   trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2792
2793static
2794PyObject *split_whitespace(PyUnicodeObject *self,
2795 PyObject *list,
2796 int maxcount)
2797{
2798 register int i;
2799 register int j;
2800 int len = self->length;
2801 PyObject *str;
2802
2803 for (i = j = 0; i < len; ) {
2804 /* find a token */
2805 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2806 i++;
2807 j = i;
2808 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2809 i++;
2810 if (j < i) {
2811 if (maxcount-- <= 0)
2812 break;
2813 SPLIT_APPEND(self->str, j, i);
2814 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2815 i++;
2816 j = i;
2817 }
2818 }
2819 if (j < len) {
2820 SPLIT_APPEND(self->str, j, len);
2821 }
2822 return list;
2823
2824 onError:
2825 Py_DECREF(list);
2826 return NULL;
2827}
2828
2829PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002830 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831{
2832 register int i;
2833 register int j;
2834 int len;
2835 PyObject *list;
2836 PyObject *str;
2837 Py_UNICODE *data;
2838
2839 string = PyUnicode_FromObject(string);
2840 if (string == NULL)
2841 return NULL;
2842 data = PyUnicode_AS_UNICODE(string);
2843 len = PyUnicode_GET_SIZE(string);
2844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 list = PyList_New(0);
2846 if (!list)
2847 goto onError;
2848
2849 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002850 int eol;
2851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 /* Find a line and append it */
2853 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2854 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855
2856 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002857 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 if (i < len) {
2859 if (data[i] == '\r' && i + 1 < len &&
2860 data[i+1] == '\n')
2861 i += 2;
2862 else
2863 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002864 if (keepends)
2865 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 }
Guido van Rossum86662912000-04-11 15:38:46 +00002867 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 j = i;
2869 }
2870 if (j < len) {
2871 SPLIT_APPEND(data, j, len);
2872 }
2873
2874 Py_DECREF(string);
2875 return list;
2876
2877 onError:
2878 Py_DECREF(list);
2879 Py_DECREF(string);
2880 return NULL;
2881}
2882
2883static
2884PyObject *split_char(PyUnicodeObject *self,
2885 PyObject *list,
2886 Py_UNICODE ch,
2887 int maxcount)
2888{
2889 register int i;
2890 register int j;
2891 int len = self->length;
2892 PyObject *str;
2893
2894 for (i = j = 0; i < len; ) {
2895 if (self->str[i] == ch) {
2896 if (maxcount-- <= 0)
2897 break;
2898 SPLIT_APPEND(self->str, j, i);
2899 i = j = i + 1;
2900 } else
2901 i++;
2902 }
2903 if (j <= len) {
2904 SPLIT_APPEND(self->str, j, len);
2905 }
2906 return list;
2907
2908 onError:
2909 Py_DECREF(list);
2910 return NULL;
2911}
2912
2913static
2914PyObject *split_substring(PyUnicodeObject *self,
2915 PyObject *list,
2916 PyUnicodeObject *substring,
2917 int maxcount)
2918{
2919 register int i;
2920 register int j;
2921 int len = self->length;
2922 int sublen = substring->length;
2923 PyObject *str;
2924
2925 for (i = j = 0; i < len - sublen; ) {
2926 if (Py_UNICODE_MATCH(self, i, substring)) {
2927 if (maxcount-- <= 0)
2928 break;
2929 SPLIT_APPEND(self->str, j, i);
2930 i = j = i + sublen;
2931 } else
2932 i++;
2933 }
2934 if (j <= len) {
2935 SPLIT_APPEND(self->str, j, len);
2936 }
2937 return list;
2938
2939 onError:
2940 Py_DECREF(list);
2941 return NULL;
2942}
2943
2944#undef SPLIT_APPEND
2945
2946static
2947PyObject *split(PyUnicodeObject *self,
2948 PyUnicodeObject *substring,
2949 int maxcount)
2950{
2951 PyObject *list;
2952
2953 if (maxcount < 0)
2954 maxcount = INT_MAX;
2955
2956 list = PyList_New(0);
2957 if (!list)
2958 return NULL;
2959
2960 if (substring == NULL)
2961 return split_whitespace(self,list,maxcount);
2962
2963 else if (substring->length == 1)
2964 return split_char(self,list,substring->str[0],maxcount);
2965
2966 else if (substring->length == 0) {
2967 Py_DECREF(list);
2968 PyErr_SetString(PyExc_ValueError, "empty separator");
2969 return NULL;
2970 }
2971 else
2972 return split_substring(self,list,substring,maxcount);
2973}
2974
2975static
2976PyObject *strip(PyUnicodeObject *self,
2977 int left,
2978 int right)
2979{
2980 Py_UNICODE *p = self->str;
2981 int start = 0;
2982 int end = self->length;
2983
2984 if (left)
2985 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2986 start++;
2987
2988 if (right)
2989 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2990 end--;
2991
2992 if (start == 0 && end == self->length) {
2993 /* couldn't strip anything off, return original string */
2994 Py_INCREF(self);
2995 return (PyObject*) self;
2996 }
2997
2998 return (PyObject*) PyUnicode_FromUnicode(
2999 self->str + start,
3000 end - start
3001 );
3002}
3003
3004static
3005PyObject *replace(PyUnicodeObject *self,
3006 PyUnicodeObject *str1,
3007 PyUnicodeObject *str2,
3008 int maxcount)
3009{
3010 PyUnicodeObject *u;
3011
3012 if (maxcount < 0)
3013 maxcount = INT_MAX;
3014
3015 if (str1->length == 1 && str2->length == 1) {
3016 int i;
3017
3018 /* replace characters */
3019 if (!findchar(self->str, self->length, str1->str[0])) {
3020 /* nothing to replace, return original string */
3021 Py_INCREF(self);
3022 u = self;
3023 } else {
3024 Py_UNICODE u1 = str1->str[0];
3025 Py_UNICODE u2 = str2->str[0];
3026
3027 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3028 self->str,
3029 self->length
3030 );
3031 if (u)
3032 for (i = 0; i < u->length; i++)
3033 if (u->str[i] == u1) {
3034 if (--maxcount < 0)
3035 break;
3036 u->str[i] = u2;
3037 }
3038 }
3039
3040 } else {
3041 int n, i;
3042 Py_UNICODE *p;
3043
3044 /* replace strings */
3045 n = count(self, 0, self->length, str1);
3046 if (n > maxcount)
3047 n = maxcount;
3048 if (n == 0) {
3049 /* nothing to replace, return original string */
3050 Py_INCREF(self);
3051 u = self;
3052 } else {
3053 u = _PyUnicode_New(
3054 self->length + n * (str2->length - str1->length));
3055 if (u) {
3056 i = 0;
3057 p = u->str;
3058 while (i <= self->length - str1->length)
3059 if (Py_UNICODE_MATCH(self, i, str1)) {
3060 /* replace string segment */
3061 Py_UNICODE_COPY(p, str2->str, str2->length);
3062 p += str2->length;
3063 i += str1->length;
3064 if (--n <= 0) {
3065 /* copy remaining part */
3066 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3067 break;
3068 }
3069 } else
3070 *p++ = self->str[i++];
3071 }
3072 }
3073 }
3074
3075 return (PyObject *) u;
3076}
3077
/* --- Unicode Object Methods --------------------------------------------- */
3079
3080static char title__doc__[] =
3081"S.title() -> unicode\n\
3082\n\
3083Return a titlecased version of S, i.e. words start with title case\n\
3084characters, all remaining cased characters have lower case.";
3085
3086static PyObject*
3087unicode_title(PyUnicodeObject *self, PyObject *args)
3088{
3089 if (!PyArg_NoArgs(args))
3090 return NULL;
3091 return fixup(self, fixtitle);
3092}
3093
3094static char capitalize__doc__[] =
3095"S.capitalize() -> unicode\n\
3096\n\
3097Return a capitalized version of S, i.e. make the first character\n\
3098have upper case.";
3099
3100static PyObject*
3101unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3102{
3103 if (!PyArg_NoArgs(args))
3104 return NULL;
3105 return fixup(self, fixcapitalize);
3106}
3107
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (compiled out): split self at whitespace,
   capitalize every word via fixup()/fixcapitalize and join the words
   with single blanks.  Returns NULL on any failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined = NULL;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped;

        capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
3148
3149static char center__doc__[] =
3150"S.center(width) -> unicode\n\
3151\n\
3152Return S centered in a Unicode string of length width. Padding is done\n\
3153using spaces.";
3154
3155static PyObject *
3156unicode_center(PyUnicodeObject *self, PyObject *args)
3157{
3158 int marg, left;
3159 int width;
3160
3161 if (!PyArg_ParseTuple(args, "i:center", &width))
3162 return NULL;
3163
3164 if (self->length >= width) {
3165 Py_INCREF(self);
3166 return (PyObject*) self;
3167 }
3168
3169 marg = width - self->length;
3170 left = marg / 2 + (marg & width & 1);
3171
3172 return (PyObject*) pad(self, left, marg - left, ' ');
3173}
3174
Marc-André Lemburge5034372000-08-08 08:04:29 +00003175#if 0
3176
3177/* This code should go into some future Unicode collation support
3178 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003179 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003180
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003181/* speedy UTF-16 code point order comparison */
3182/* gleaned from: */
3183/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3184
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003185static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003186{
3187 0, 0, 0, 0, 0, 0, 0, 0,
3188 0, 0, 0, 0, 0, 0, 0, 0,
3189 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003190 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003191};
3192
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193static int
3194unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3195{
3196 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003197
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 Py_UNICODE *s1 = str1->str;
3199 Py_UNICODE *s2 = str2->str;
3200
3201 len1 = str1->length;
3202 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003203
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003205 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003206 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003207
3208 c1 = *s1++;
3209 c2 = *s2++;
3210 if (c1 > (1<<11) * 26)
3211 c1 += utf16Fixup[c1>>11];
3212 if (c2 > (1<<11) * 26)
3213 c2 += utf16Fixup[c2>>11];
3214
3215 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003216 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003217 if (diff)
3218 return (diff < 0) ? -1 : (diff != 0);
3219 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 }
3221
3222 return (len1 < len2) ? -1 : (len1 != len2);
3223}
3224
Marc-André Lemburge5034372000-08-08 08:04:29 +00003225#else
3226
3227static int
3228unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3229{
3230 register int len1, len2;
3231
3232 Py_UNICODE *s1 = str1->str;
3233 Py_UNICODE *s2 = str2->str;
3234
3235 len1 = str1->length;
3236 len2 = str2->length;
3237
3238 while (len1 > 0 && len2 > 0) {
3239 register long diff;
3240
3241 diff = (long)*s1++ - (long)*s2++;
3242 if (diff)
3243 return (diff < 0) ? -1 : (diff != 0);
3244 len1--; len2--;
3245 }
3246
3247 return (len1 < len2) ? -1 : (len1 != len2);
3248}
3249
3250#endif
3251
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252int PyUnicode_Compare(PyObject *left,
3253 PyObject *right)
3254{
3255 PyUnicodeObject *u = NULL, *v = NULL;
3256 int result;
3257
3258 /* Coerce the two arguments */
3259 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3260 if (u == NULL)
3261 goto onError;
3262 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3263 if (v == NULL)
3264 goto onError;
3265
Thomas Wouters7e474022000-07-16 12:04:32 +00003266 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 if (v == u) {
3268 Py_DECREF(u);
3269 Py_DECREF(v);
3270 return 0;
3271 }
3272
3273 result = unicode_compare(u, v);
3274
3275 Py_DECREF(u);
3276 Py_DECREF(v);
3277 return result;
3278
3279onError:
3280 Py_XDECREF(u);
3281 Py_XDECREF(v);
3282 return -1;
3283}
3284
Guido van Rossum403d68b2000-03-13 15:55:09 +00003285int PyUnicode_Contains(PyObject *container,
3286 PyObject *element)
3287{
3288 PyUnicodeObject *u = NULL, *v = NULL;
3289 int result;
3290 register const Py_UNICODE *p, *e;
3291 register Py_UNICODE ch;
3292
3293 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003294 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003295 if (v == NULL) {
3296 PyErr_SetString(PyExc_TypeError,
3297 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003298 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003299 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003300 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3301 if (u == NULL) {
3302 Py_DECREF(v);
3303 goto onError;
3304 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003305
3306 /* Check v in u */
3307 if (PyUnicode_GET_SIZE(v) != 1) {
3308 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003309 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003310 goto onError;
3311 }
3312 ch = *PyUnicode_AS_UNICODE(v);
3313 p = PyUnicode_AS_UNICODE(u);
3314 e = p + PyUnicode_GET_SIZE(u);
3315 result = 0;
3316 while (p < e) {
3317 if (*p++ == ch) {
3318 result = 1;
3319 break;
3320 }
3321 }
3322
3323 Py_DECREF(u);
3324 Py_DECREF(v);
3325 return result;
3326
3327onError:
3328 Py_XDECREF(u);
3329 Py_XDECREF(v);
3330 return -1;
3331}
3332
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333/* Concat to string or Unicode object giving a new Unicode object. */
3334
3335PyObject *PyUnicode_Concat(PyObject *left,
3336 PyObject *right)
3337{
3338 PyUnicodeObject *u = NULL, *v = NULL, *w;
3339
3340 /* Coerce the two arguments */
3341 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3342 if (u == NULL)
3343 goto onError;
3344 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3345 if (v == NULL)
3346 goto onError;
3347
3348 /* Shortcuts */
3349 if (v == unicode_empty) {
3350 Py_DECREF(v);
3351 return (PyObject *)u;
3352 }
3353 if (u == unicode_empty) {
3354 Py_DECREF(u);
3355 return (PyObject *)v;
3356 }
3357
3358 /* Concat the two Unicode strings */
3359 w = _PyUnicode_New(u->length + v->length);
3360 if (w == NULL)
3361 goto onError;
3362 Py_UNICODE_COPY(w->str, u->str, u->length);
3363 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3364
3365 Py_DECREF(u);
3366 Py_DECREF(v);
3367 return (PyObject *)w;
3368
3369onError:
3370 Py_XDECREF(u);
3371 Py_XDECREF(v);
3372 return NULL;
3373}
3374
3375static char count__doc__[] =
3376"S.count(sub[, start[, end]]) -> int\n\
3377\n\
3378Return the number of occurrences of substring sub in Unicode string\n\
3379S[start:end]. Optional arguments start and end are\n\
3380interpreted as in slice notation.";
3381
3382static PyObject *
3383unicode_count(PyUnicodeObject *self, PyObject *args)
3384{
3385 PyUnicodeObject *substring;
3386 int start = 0;
3387 int end = INT_MAX;
3388 PyObject *result;
3389
Guido van Rossumb8872e62000-05-09 14:14:27 +00003390 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3391 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 return NULL;
3393
3394 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3395 (PyObject *)substring);
3396 if (substring == NULL)
3397 return NULL;
3398
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399 if (start < 0)
3400 start += self->length;
3401 if (start < 0)
3402 start = 0;
3403 if (end > self->length)
3404 end = self->length;
3405 if (end < 0)
3406 end += self->length;
3407 if (end < 0)
3408 end = 0;
3409
3410 result = PyInt_FromLong((long) count(self, start, end, substring));
3411
3412 Py_DECREF(substring);
3413 return result;
3414}
3415
3416static char encode__doc__[] =
3417"S.encode([encoding[,errors]]) -> string\n\
3418\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003419Return an encoded string version of S. Default encoding is the current\n\
3420default string encoding. errors may be given to set a different error\n\
3421handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3422a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423
3424static PyObject *
3425unicode_encode(PyUnicodeObject *self, PyObject *args)
3426{
3427 char *encoding = NULL;
3428 char *errors = NULL;
3429 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3430 return NULL;
3431 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3432}
3433
3434static char expandtabs__doc__[] =
3435"S.expandtabs([tabsize]) -> unicode\n\
3436\n\
3437Return a copy of S where all tab characters are expanded using spaces.\n\
3438If tabsize is not given, a tab size of 8 characters is assumed.";
3439
3440static PyObject*
3441unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3442{
3443 Py_UNICODE *e;
3444 Py_UNICODE *p;
3445 Py_UNICODE *q;
3446 int i, j;
3447 PyUnicodeObject *u;
3448 int tabsize = 8;
3449
3450 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3451 return NULL;
3452
Thomas Wouters7e474022000-07-16 12:04:32 +00003453 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 i = j = 0;
3455 e = self->str + self->length;
3456 for (p = self->str; p < e; p++)
3457 if (*p == '\t') {
3458 if (tabsize > 0)
3459 j += tabsize - (j % tabsize);
3460 }
3461 else {
3462 j++;
3463 if (*p == '\n' || *p == '\r') {
3464 i += j;
3465 j = 0;
3466 }
3467 }
3468
3469 /* Second pass: create output string and fill it */
3470 u = _PyUnicode_New(i + j);
3471 if (!u)
3472 return NULL;
3473
3474 j = 0;
3475 q = u->str;
3476
3477 for (p = self->str; p < e; p++)
3478 if (*p == '\t') {
3479 if (tabsize > 0) {
3480 i = tabsize - (j % tabsize);
3481 j += i;
3482 while (i--)
3483 *q++ = ' ';
3484 }
3485 }
3486 else {
3487 j++;
3488 *q++ = *p;
3489 if (*p == '\n' || *p == '\r')
3490 j = 0;
3491 }
3492
3493 return (PyObject*) u;
3494}
3495
3496static char find__doc__[] =
3497"S.find(sub [,start [,end]]) -> int\n\
3498\n\
3499Return the lowest index in S where substring sub is found,\n\
3500such that sub is contained within s[start,end]. Optional\n\
3501arguments start and end are interpreted as in slice notation.\n\
3502\n\
3503Return -1 on failure.";
3504
3505static PyObject *
3506unicode_find(PyUnicodeObject *self, PyObject *args)
3507{
3508 PyUnicodeObject *substring;
3509 int start = 0;
3510 int end = INT_MAX;
3511 PyObject *result;
3512
Guido van Rossumb8872e62000-05-09 14:14:27 +00003513 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3514 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 return NULL;
3516 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3517 (PyObject *)substring);
3518 if (substring == NULL)
3519 return NULL;
3520
3521 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3522
3523 Py_DECREF(substring);
3524 return result;
3525}
3526
3527static PyObject *
3528unicode_getitem(PyUnicodeObject *self, int index)
3529{
3530 if (index < 0 || index >= self->length) {
3531 PyErr_SetString(PyExc_IndexError, "string index out of range");
3532 return NULL;
3533 }
3534
3535 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3536}
3537
3538static long
3539unicode_hash(PyUnicodeObject *self)
3540{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003541 /* Since Unicode objects compare equal to their ASCII string
3542 counterparts, they should use the individual character values
3543 as basis for their hash value. This is needed to assure that
3544 strings and Unicode objects behave in the same way as
3545 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546
Fredrik Lundhdde61642000-07-10 18:27:47 +00003547 register int len;
3548 register Py_UNICODE *p;
3549 register long x;
3550
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 if (self->hash != -1)
3552 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003553 len = PyUnicode_GET_SIZE(self);
3554 p = PyUnicode_AS_UNICODE(self);
3555 x = *p << 7;
3556 while (--len >= 0)
3557 x = (1000003*x) ^ *p++;
3558 x ^= PyUnicode_GET_SIZE(self);
3559 if (x == -1)
3560 x = -2;
3561 self->hash = x;
3562 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563}
3564
3565static char index__doc__[] =
3566"S.index(sub [,start [,end]]) -> int\n\
3567\n\
3568Like S.find() but raise ValueError when the substring is not found.";
3569
3570static PyObject *
3571unicode_index(PyUnicodeObject *self, PyObject *args)
3572{
3573 int result;
3574 PyUnicodeObject *substring;
3575 int start = 0;
3576 int end = INT_MAX;
3577
Guido van Rossumb8872e62000-05-09 14:14:27 +00003578 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3579 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 return NULL;
3581
3582 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3583 (PyObject *)substring);
3584 if (substring == NULL)
3585 return NULL;
3586
3587 result = findstring(self, substring, start, end, 1);
3588
3589 Py_DECREF(substring);
3590 if (result < 0) {
3591 PyErr_SetString(PyExc_ValueError, "substring not found");
3592 return NULL;
3593 }
3594 return PyInt_FromLong(result);
3595}
3596
3597static char islower__doc__[] =
3598"S.islower() -> int\n\
3599\n\
3600Return 1 if all cased characters in S are lowercase and there is\n\
3601at least one cased character in S, 0 otherwise.";
3602
3603static PyObject*
3604unicode_islower(PyUnicodeObject *self, PyObject *args)
3605{
3606 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3607 register const Py_UNICODE *e;
3608 int cased;
3609
3610 if (!PyArg_NoArgs(args))
3611 return NULL;
3612
3613 /* Shortcut for single character strings */
3614 if (PyUnicode_GET_SIZE(self) == 1)
3615 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3616
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003617 /* Special case for empty strings */
3618 if (PyString_GET_SIZE(self) == 0)
3619 return PyInt_FromLong(0);
3620
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 e = p + PyUnicode_GET_SIZE(self);
3622 cased = 0;
3623 for (; p < e; p++) {
3624 register const Py_UNICODE ch = *p;
3625
3626 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3627 return PyInt_FromLong(0);
3628 else if (!cased && Py_UNICODE_ISLOWER(ch))
3629 cased = 1;
3630 }
3631 return PyInt_FromLong(cased);
3632}
3633
3634static char isupper__doc__[] =
3635"S.isupper() -> int\n\
3636\n\
3637Return 1 if all cased characters in S are uppercase and there is\n\
3638at least one cased character in S, 0 otherwise.";
3639
3640static PyObject*
3641unicode_isupper(PyUnicodeObject *self, PyObject *args)
3642{
3643 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3644 register const Py_UNICODE *e;
3645 int cased;
3646
3647 if (!PyArg_NoArgs(args))
3648 return NULL;
3649
3650 /* Shortcut for single character strings */
3651 if (PyUnicode_GET_SIZE(self) == 1)
3652 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3653
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003654 /* Special case for empty strings */
3655 if (PyString_GET_SIZE(self) == 0)
3656 return PyInt_FromLong(0);
3657
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 e = p + PyUnicode_GET_SIZE(self);
3659 cased = 0;
3660 for (; p < e; p++) {
3661 register const Py_UNICODE ch = *p;
3662
3663 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3664 return PyInt_FromLong(0);
3665 else if (!cased && Py_UNICODE_ISUPPER(ch))
3666 cased = 1;
3667 }
3668 return PyInt_FromLong(cased);
3669}
3670
3671static char istitle__doc__[] =
3672"S.istitle() -> int\n\
3673\n\
3674Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3675may only follow uncased characters and lowercase characters only cased\n\
3676ones. Return 0 otherwise.";
3677
3678static PyObject*
3679unicode_istitle(PyUnicodeObject *self, PyObject *args)
3680{
3681 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3682 register const Py_UNICODE *e;
3683 int cased, previous_is_cased;
3684
3685 if (!PyArg_NoArgs(args))
3686 return NULL;
3687
3688 /* Shortcut for single character strings */
3689 if (PyUnicode_GET_SIZE(self) == 1)
3690 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3691 (Py_UNICODE_ISUPPER(*p) != 0));
3692
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003693 /* Special case for empty strings */
3694 if (PyString_GET_SIZE(self) == 0)
3695 return PyInt_FromLong(0);
3696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 e = p + PyUnicode_GET_SIZE(self);
3698 cased = 0;
3699 previous_is_cased = 0;
3700 for (; p < e; p++) {
3701 register const Py_UNICODE ch = *p;
3702
3703 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3704 if (previous_is_cased)
3705 return PyInt_FromLong(0);
3706 previous_is_cased = 1;
3707 cased = 1;
3708 }
3709 else if (Py_UNICODE_ISLOWER(ch)) {
3710 if (!previous_is_cased)
3711 return PyInt_FromLong(0);
3712 previous_is_cased = 1;
3713 cased = 1;
3714 }
3715 else
3716 previous_is_cased = 0;
3717 }
3718 return PyInt_FromLong(cased);
3719}
3720
3721static char isspace__doc__[] =
3722"S.isspace() -> int\n\
3723\n\
3724Return 1 if there are only whitespace characters in S,\n\
37250 otherwise.";
3726
3727static PyObject*
3728unicode_isspace(PyUnicodeObject *self, PyObject *args)
3729{
3730 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3731 register const Py_UNICODE *e;
3732
3733 if (!PyArg_NoArgs(args))
3734 return NULL;
3735
3736 /* Shortcut for single character strings */
3737 if (PyUnicode_GET_SIZE(self) == 1 &&
3738 Py_UNICODE_ISSPACE(*p))
3739 return PyInt_FromLong(1);
3740
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003741 /* Special case for empty strings */
3742 if (PyString_GET_SIZE(self) == 0)
3743 return PyInt_FromLong(0);
3744
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 e = p + PyUnicode_GET_SIZE(self);
3746 for (; p < e; p++) {
3747 if (!Py_UNICODE_ISSPACE(*p))
3748 return PyInt_FromLong(0);
3749 }
3750 return PyInt_FromLong(1);
3751}
3752
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003753static char isalpha__doc__[] =
3754"S.isalpha() -> int\n\
3755\n\
3756Return 1 if all characters in S are alphabetic\n\
3757and there is at least one character in S, 0 otherwise.";
3758
3759static PyObject*
3760unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3761{
3762 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3763 register const Py_UNICODE *e;
3764
3765 if (!PyArg_NoArgs(args))
3766 return NULL;
3767
3768 /* Shortcut for single character strings */
3769 if (PyUnicode_GET_SIZE(self) == 1 &&
3770 Py_UNICODE_ISALPHA(*p))
3771 return PyInt_FromLong(1);
3772
3773 /* Special case for empty strings */
3774 if (PyString_GET_SIZE(self) == 0)
3775 return PyInt_FromLong(0);
3776
3777 e = p + PyUnicode_GET_SIZE(self);
3778 for (; p < e; p++) {
3779 if (!Py_UNICODE_ISALPHA(*p))
3780 return PyInt_FromLong(0);
3781 }
3782 return PyInt_FromLong(1);
3783}
3784
3785static char isalnum__doc__[] =
3786"S.isalnum() -> int\n\
3787\n\
3788Return 1 if all characters in S are alphanumeric\n\
3789and there is at least one character in S, 0 otherwise.";
3790
3791static PyObject*
3792unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3793{
3794 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3795 register const Py_UNICODE *e;
3796
3797 if (!PyArg_NoArgs(args))
3798 return NULL;
3799
3800 /* Shortcut for single character strings */
3801 if (PyUnicode_GET_SIZE(self) == 1 &&
3802 Py_UNICODE_ISALNUM(*p))
3803 return PyInt_FromLong(1);
3804
3805 /* Special case for empty strings */
3806 if (PyString_GET_SIZE(self) == 0)
3807 return PyInt_FromLong(0);
3808
3809 e = p + PyUnicode_GET_SIZE(self);
3810 for (; p < e; p++) {
3811 if (!Py_UNICODE_ISALNUM(*p))
3812 return PyInt_FromLong(0);
3813 }
3814 return PyInt_FromLong(1);
3815}
3816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817static char isdecimal__doc__[] =
3818"S.isdecimal() -> int\n\
3819\n\
3820Return 1 if there are only decimal characters in S,\n\
38210 otherwise.";
3822
3823static PyObject*
3824unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3825{
3826 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3827 register const Py_UNICODE *e;
3828
3829 if (!PyArg_NoArgs(args))
3830 return NULL;
3831
3832 /* Shortcut for single character strings */
3833 if (PyUnicode_GET_SIZE(self) == 1 &&
3834 Py_UNICODE_ISDECIMAL(*p))
3835 return PyInt_FromLong(1);
3836
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003837 /* Special case for empty strings */
3838 if (PyString_GET_SIZE(self) == 0)
3839 return PyInt_FromLong(0);
3840
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 e = p + PyUnicode_GET_SIZE(self);
3842 for (; p < e; p++) {
3843 if (!Py_UNICODE_ISDECIMAL(*p))
3844 return PyInt_FromLong(0);
3845 }
3846 return PyInt_FromLong(1);
3847}
3848
3849static char isdigit__doc__[] =
3850"S.isdigit() -> int\n\
3851\n\
3852Return 1 if there are only digit characters in S,\n\
38530 otherwise.";
3854
3855static PyObject*
3856unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3857{
3858 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3859 register const Py_UNICODE *e;
3860
3861 if (!PyArg_NoArgs(args))
3862 return NULL;
3863
3864 /* Shortcut for single character strings */
3865 if (PyUnicode_GET_SIZE(self) == 1 &&
3866 Py_UNICODE_ISDIGIT(*p))
3867 return PyInt_FromLong(1);
3868
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003869 /* Special case for empty strings */
3870 if (PyString_GET_SIZE(self) == 0)
3871 return PyInt_FromLong(0);
3872
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873 e = p + PyUnicode_GET_SIZE(self);
3874 for (; p < e; p++) {
3875 if (!Py_UNICODE_ISDIGIT(*p))
3876 return PyInt_FromLong(0);
3877 }
3878 return PyInt_FromLong(1);
3879}
3880
3881static char isnumeric__doc__[] =
3882"S.isnumeric() -> int\n\
3883\n\
3884Return 1 if there are only numeric characters in S,\n\
38850 otherwise.";
3886
3887static PyObject*
3888unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3889{
3890 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3891 register const Py_UNICODE *e;
3892
3893 if (!PyArg_NoArgs(args))
3894 return NULL;
3895
3896 /* Shortcut for single character strings */
3897 if (PyUnicode_GET_SIZE(self) == 1 &&
3898 Py_UNICODE_ISNUMERIC(*p))
3899 return PyInt_FromLong(1);
3900
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003901 /* Special case for empty strings */
3902 if (PyString_GET_SIZE(self) == 0)
3903 return PyInt_FromLong(0);
3904
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 e = p + PyUnicode_GET_SIZE(self);
3906 for (; p < e; p++) {
3907 if (!Py_UNICODE_ISNUMERIC(*p))
3908 return PyInt_FromLong(0);
3909 }
3910 return PyInt_FromLong(1);
3911}
3912
3913static char join__doc__[] =
3914"S.join(sequence) -> unicode\n\
3915\n\
3916Return a string which is the concatenation of the strings in the\n\
3917sequence. The separator between elements is S.";
3918
3919static PyObject*
3920unicode_join(PyUnicodeObject *self, PyObject *args)
3921{
3922 PyObject *data;
3923 if (!PyArg_ParseTuple(args, "O:join", &data))
3924 return NULL;
3925
3926 return PyUnicode_Join((PyObject *)self, data);
3927}
3928
3929static int
3930unicode_length(PyUnicodeObject *self)
3931{
3932 return self->length;
3933}
3934
3935static char ljust__doc__[] =
3936"S.ljust(width) -> unicode\n\
3937\n\
3938Return S left justified in a Unicode string of length width. Padding is\n\
3939done using spaces.";
3940
3941static PyObject *
3942unicode_ljust(PyUnicodeObject *self, PyObject *args)
3943{
3944 int width;
3945 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3946 return NULL;
3947
3948 if (self->length >= width) {
3949 Py_INCREF(self);
3950 return (PyObject*) self;
3951 }
3952
3953 return (PyObject*) pad(self, 0, width - self->length, ' ');
3954}
3955
3956static char lower__doc__[] =
3957"S.lower() -> unicode\n\
3958\n\
3959Return a copy of the string S converted to lowercase.";
3960
3961static PyObject*
3962unicode_lower(PyUnicodeObject *self, PyObject *args)
3963{
3964 if (!PyArg_NoArgs(args))
3965 return NULL;
3966 return fixup(self, fixlower);
3967}
3968
3969static char lstrip__doc__[] =
3970"S.lstrip() -> unicode\n\
3971\n\
3972Return a copy of the string S with leading whitespace removed.";
3973
3974static PyObject *
3975unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3976{
3977 if (!PyArg_NoArgs(args))
3978 return NULL;
3979 return strip(self, 1, 0);
3980}
3981
3982static PyObject*
3983unicode_repeat(PyUnicodeObject *str, int len)
3984{
3985 PyUnicodeObject *u;
3986 Py_UNICODE *p;
3987
3988 if (len < 0)
3989 len = 0;
3990
3991 if (len == 1) {
3992 /* no repeat, return original string */
3993 Py_INCREF(str);
3994 return (PyObject*) str;
3995 }
3996
3997 u = _PyUnicode_New(len * str->length);
3998 if (!u)
3999 return NULL;
4000
4001 p = u->str;
4002
4003 while (len-- > 0) {
4004 Py_UNICODE_COPY(p, str->str, str->length);
4005 p += str->length;
4006 }
4007
4008 return (PyObject*) u;
4009}
4010
4011PyObject *PyUnicode_Replace(PyObject *obj,
4012 PyObject *subobj,
4013 PyObject *replobj,
4014 int maxcount)
4015{
4016 PyObject *self;
4017 PyObject *str1;
4018 PyObject *str2;
4019 PyObject *result;
4020
4021 self = PyUnicode_FromObject(obj);
4022 if (self == NULL)
4023 return NULL;
4024 str1 = PyUnicode_FromObject(subobj);
4025 if (str1 == NULL) {
4026 Py_DECREF(self);
4027 return NULL;
4028 }
4029 str2 = PyUnicode_FromObject(replobj);
4030 if (str2 == NULL) {
4031 Py_DECREF(self);
4032 Py_DECREF(str1);
4033 return NULL;
4034 }
4035 result = replace((PyUnicodeObject *)self,
4036 (PyUnicodeObject *)str1,
4037 (PyUnicodeObject *)str2,
4038 maxcount);
4039 Py_DECREF(self);
4040 Py_DECREF(str1);
4041 Py_DECREF(str2);
4042 return result;
4043}
4044
4045static char replace__doc__[] =
4046"S.replace (old, new[, maxsplit]) -> unicode\n\
4047\n\
4048Return a copy of S with all occurrences of substring\n\
4049old replaced by new. If the optional argument maxsplit is\n\
4050given, only the first maxsplit occurrences are replaced.";
4051
4052static PyObject*
4053unicode_replace(PyUnicodeObject *self, PyObject *args)
4054{
4055 PyUnicodeObject *str1;
4056 PyUnicodeObject *str2;
4057 int maxcount = -1;
4058 PyObject *result;
4059
4060 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4061 return NULL;
4062 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4063 if (str1 == NULL)
4064 return NULL;
4065 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4066 if (str2 == NULL)
4067 return NULL;
4068
4069 result = replace(self, str1, str2, maxcount);
4070
4071 Py_DECREF(str1);
4072 Py_DECREF(str2);
4073 return result;
4074}
4075
4076static
4077PyObject *unicode_repr(PyObject *unicode)
4078{
4079 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4080 PyUnicode_GET_SIZE(unicode),
4081 1);
4082}
4083
4084static char rfind__doc__[] =
4085"S.rfind(sub [,start [,end]]) -> int\n\
4086\n\
4087Return the highest index in S where substring sub is found,\n\
4088such that sub is contained within s[start,end]. Optional\n\
4089arguments start and end are interpreted as in slice notation.\n\
4090\n\
4091Return -1 on failure.";
4092
4093static PyObject *
4094unicode_rfind(PyUnicodeObject *self, PyObject *args)
4095{
4096 PyUnicodeObject *substring;
4097 int start = 0;
4098 int end = INT_MAX;
4099 PyObject *result;
4100
Guido van Rossumb8872e62000-05-09 14:14:27 +00004101 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4102 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 return NULL;
4104 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4105 (PyObject *)substring);
4106 if (substring == NULL)
4107 return NULL;
4108
4109 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4110
4111 Py_DECREF(substring);
4112 return result;
4113}
4114
4115static char rindex__doc__[] =
4116"S.rindex(sub [,start [,end]]) -> int\n\
4117\n\
4118Like S.rfind() but raise ValueError when the substring is not found.";
4119
4120static PyObject *
4121unicode_rindex(PyUnicodeObject *self, PyObject *args)
4122{
4123 int result;
4124 PyUnicodeObject *substring;
4125 int start = 0;
4126 int end = INT_MAX;
4127
Guido van Rossumb8872e62000-05-09 14:14:27 +00004128 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4129 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 return NULL;
4131 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4132 (PyObject *)substring);
4133 if (substring == NULL)
4134 return NULL;
4135
4136 result = findstring(self, substring, start, end, -1);
4137
4138 Py_DECREF(substring);
4139 if (result < 0) {
4140 PyErr_SetString(PyExc_ValueError, "substring not found");
4141 return NULL;
4142 }
4143 return PyInt_FromLong(result);
4144}
4145
4146static char rjust__doc__[] =
4147"S.rjust(width) -> unicode\n\
4148\n\
4149Return S right justified in a Unicode string of length width. Padding is\n\
4150done using spaces.";
4151
4152static PyObject *
4153unicode_rjust(PyUnicodeObject *self, PyObject *args)
4154{
4155 int width;
4156 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4157 return NULL;
4158
4159 if (self->length >= width) {
4160 Py_INCREF(self);
4161 return (PyObject*) self;
4162 }
4163
4164 return (PyObject*) pad(self, width - self->length, 0, ' ');
4165}
4166
4167static char rstrip__doc__[] =
4168"S.rstrip() -> unicode\n\
4169\n\
4170Return a copy of the string S with trailing whitespace removed.";
4171
4172static PyObject *
4173unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4174{
4175 if (!PyArg_NoArgs(args))
4176 return NULL;
4177 return strip(self, 0, 1);
4178}
4179
4180static PyObject*
4181unicode_slice(PyUnicodeObject *self, int start, int end)
4182{
4183 /* standard clamping */
4184 if (start < 0)
4185 start = 0;
4186 if (end < 0)
4187 end = 0;
4188 if (end > self->length)
4189 end = self->length;
4190 if (start == 0 && end == self->length) {
4191 /* full slice, return original string */
4192 Py_INCREF(self);
4193 return (PyObject*) self;
4194 }
4195 if (start > end)
4196 start = end;
4197 /* copy slice */
4198 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4199 end - start);
4200}
4201
4202PyObject *PyUnicode_Split(PyObject *s,
4203 PyObject *sep,
4204 int maxsplit)
4205{
4206 PyObject *result;
4207
4208 s = PyUnicode_FromObject(s);
4209 if (s == NULL)
4210 return NULL;
4211 if (sep != NULL) {
4212 sep = PyUnicode_FromObject(sep);
4213 if (sep == NULL) {
4214 Py_DECREF(s);
4215 return NULL;
4216 }
4217 }
4218
4219 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4220
4221 Py_DECREF(s);
4222 Py_XDECREF(sep);
4223 return result;
4224}
4225
4226static char split__doc__[] =
4227"S.split([sep [,maxsplit]]) -> list of strings\n\
4228\n\
4229Return a list of the words in S, using sep as the\n\
4230delimiter string. If maxsplit is given, at most maxsplit\n\
4231splits are done. If sep is not specified, any whitespace string\n\
4232is a separator.";
4233
4234static PyObject*
4235unicode_split(PyUnicodeObject *self, PyObject *args)
4236{
4237 PyObject *substring = Py_None;
4238 int maxcount = -1;
4239
4240 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4241 return NULL;
4242
4243 if (substring == Py_None)
4244 return split(self, NULL, maxcount);
4245 else if (PyUnicode_Check(substring))
4246 return split(self, (PyUnicodeObject *)substring, maxcount);
4247 else
4248 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4249}
4250
4251static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004252"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253\n\
4254Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004255Line breaks are not included in the resulting list unless keepends\n\
4256is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257
4258static PyObject*
4259unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4260{
Guido van Rossum86662912000-04-11 15:38:46 +00004261 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262
Guido van Rossum86662912000-04-11 15:38:46 +00004263 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 return NULL;
4265
Guido van Rossum86662912000-04-11 15:38:46 +00004266 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267}
4268
4269static
4270PyObject *unicode_str(PyUnicodeObject *self)
4271{
Fred Drakee4315f52000-05-09 19:53:39 +00004272 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273}
4274
4275static char strip__doc__[] =
4276"S.strip() -> unicode\n\
4277\n\
4278Return a copy of S with leading and trailing whitespace removed.";
4279
4280static PyObject *
4281unicode_strip(PyUnicodeObject *self, PyObject *args)
4282{
4283 if (!PyArg_NoArgs(args))
4284 return NULL;
4285 return strip(self, 1, 1);
4286}
4287
4288static char swapcase__doc__[] =
4289"S.swapcase() -> unicode\n\
4290\n\
4291Return a copy of S with uppercase characters converted to lowercase\n\
4292and vice versa.";
4293
4294static PyObject*
4295unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4296{
4297 if (!PyArg_NoArgs(args))
4298 return NULL;
4299 return fixup(self, fixswapcase);
4300}
4301
4302static char translate__doc__[] =
4303"S.translate(table) -> unicode\n\
4304\n\
4305Return a copy of the string S, where all characters have been mapped\n\
4306through the given translation table, which must be a mapping of\n\
4307Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4308are left untouched. Characters mapped to None are deleted.";
4309
4310static PyObject*
4311unicode_translate(PyUnicodeObject *self, PyObject *args)
4312{
4313 PyObject *table;
4314
4315 if (!PyArg_ParseTuple(args, "O:translate", &table))
4316 return NULL;
4317 return PyUnicode_TranslateCharmap(self->str,
4318 self->length,
4319 table,
4320 "ignore");
4321}
4322
4323static char upper__doc__[] =
4324"S.upper() -> unicode\n\
4325\n\
4326Return a copy of S converted to uppercase.";
4327
4328static PyObject*
4329unicode_upper(PyUnicodeObject *self, PyObject *args)
4330{
4331 if (!PyArg_NoArgs(args))
4332 return NULL;
4333 return fixup(self, fixupper);
4334}
4335
4336#if 0
4337static char zfill__doc__[] =
4338"S.zfill(width) -> unicode\n\
4339\n\
4340Pad a numeric string x with zeros on the left, to fill a field\n\
4341of the specified width. The string x is never truncated.";
4342
4343static PyObject *
4344unicode_zfill(PyUnicodeObject *self, PyObject *args)
4345{
4346 int fill;
4347 PyUnicodeObject *u;
4348
4349 int width;
4350 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4351 return NULL;
4352
4353 if (self->length >= width) {
4354 Py_INCREF(self);
4355 return (PyObject*) self;
4356 }
4357
4358 fill = width - self->length;
4359
4360 u = pad(self, fill, 0, '0');
4361
4362 if (u->str[fill] == '+' || u->str[fill] == '-') {
4363 /* move sign to beginning of string */
4364 u->str[0] = u->str[fill];
4365 u->str[fill] = '0';
4366 }
4367
4368 return (PyObject*) u;
4369}
4370#endif
4371
#if 0
/* Debugging aid (compiled out): report the value of
   unicode_freelist_size as a Python int.  Accepts no arguments. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
4381
/* Docstring shown for unicode.startswith(). */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S starts with the specified prefix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4388
4389static PyObject *
4390unicode_startswith(PyUnicodeObject *self,
4391 PyObject *args)
4392{
4393 PyUnicodeObject *substring;
4394 int start = 0;
4395 int end = INT_MAX;
4396 PyObject *result;
4397
Guido van Rossumb8872e62000-05-09 14:14:27 +00004398 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4399 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 return NULL;
4401 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4402 (PyObject *)substring);
4403 if (substring == NULL)
4404 return NULL;
4405
4406 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4407
4408 Py_DECREF(substring);
4409 return result;
4410}
4411
4412
/* Docstring shown for unicode.endswith(). */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S ends with the specified suffix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4419
4420static PyObject *
4421unicode_endswith(PyUnicodeObject *self,
4422 PyObject *args)
4423{
4424 PyUnicodeObject *substring;
4425 int start = 0;
4426 int end = INT_MAX;
4427 PyObject *result;
4428
Guido van Rossumb8872e62000-05-09 14:14:27 +00004429 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4430 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 return NULL;
4432 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4433 (PyObject *)substring);
4434 if (substring == NULL)
4435 return NULL;
4436
4437 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4438
4439 Py_DECREF(substring);
4440 return result;
4441}
4442
4443
4444static PyMethodDef unicode_methods[] = {
4445
4446 /* Order is according to common usage: often used methods should
4447 appear first, since lookup is done sequentially. */
4448
4449 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4450 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4451 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4452 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4453 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4454 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4455 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4456 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4457 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4458 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4459 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4460 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4461 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4462 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4463/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4464 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4465 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4466 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4467 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4468 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4469 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4470 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4471 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4472 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4473 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4474 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4475 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4476 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4477 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4478 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4479 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4480 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4481 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004482 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4483 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484#if 0
4485 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4486 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4487#endif
4488
4489#if 0
4490 /* This one is just used for debugging the implementation. */
4491 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4492#endif
4493
4494 {NULL, NULL}
4495};
4496
4497static PyObject *
4498unicode_getattr(PyUnicodeObject *self, char *name)
4499{
4500 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4501}
4502
4503static PySequenceMethods unicode_as_sequence = {
4504 (inquiry) unicode_length, /* sq_length */
4505 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4506 (intargfunc) unicode_repeat, /* sq_repeat */
4507 (intargfunc) unicode_getitem, /* sq_item */
4508 (intintargfunc) unicode_slice, /* sq_slice */
4509 0, /* sq_ass_item */
4510 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004511 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512};
4513
4514static int
4515unicode_buffer_getreadbuf(PyUnicodeObject *self,
4516 int index,
4517 const void **ptr)
4518{
4519 if (index != 0) {
4520 PyErr_SetString(PyExc_SystemError,
4521 "accessing non-existent unicode segment");
4522 return -1;
4523 }
4524 *ptr = (void *) self->str;
4525 return PyUnicode_GET_DATA_SIZE(self);
4526}
4527
4528static int
4529unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4530 const void **ptr)
4531{
4532 PyErr_SetString(PyExc_TypeError,
4533 "cannot use unicode as modifyable buffer");
4534 return -1;
4535}
4536
4537static int
4538unicode_buffer_getsegcount(PyUnicodeObject *self,
4539 int *lenp)
4540{
4541 if (lenp)
4542 *lenp = PyUnicode_GET_DATA_SIZE(self);
4543 return 1;
4544}
4545
4546static int
4547unicode_buffer_getcharbuf(PyUnicodeObject *self,
4548 int index,
4549 const void **ptr)
4550{
4551 PyObject *str;
4552
4553 if (index != 0) {
4554 PyErr_SetString(PyExc_SystemError,
4555 "accessing non-existent unicode segment");
4556 return -1;
4557 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004558 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559 if (str == NULL)
4560 return -1;
4561 *ptr = (void *) PyString_AS_STRING(str);
4562 return PyString_GET_SIZE(str);
4563}
4564
4565/* Helpers for PyUnicode_Format() */
4566
4567static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004568getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569{
4570 int argidx = *p_argidx;
4571 if (argidx < arglen) {
4572 (*p_argidx)++;
4573 if (arglen < 0)
4574 return args;
4575 else
4576 return PyTuple_GetItem(args, argidx);
4577 }
4578 PyErr_SetString(PyExc_TypeError,
4579 "not enough arguments for format string");
4580 return NULL;
4581}
4582
/* Bit flags recording which conversion-flag characters were seen. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4588
4589static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591{
4592 register int i;
4593 int len;
4594 va_list va;
4595 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597
4598 /* First, format the string as char array, then expand to Py_UNICODE
4599 array. */
4600 charbuffer = (char *)buffer;
4601 len = vsprintf(charbuffer, format, va);
4602 for (i = len - 1; i >= 0; i--)
4603 buffer[i] = (Py_UNICODE) charbuffer[i];
4604
4605 va_end(va);
4606 return len;
4607}
4608
4609static int
4610formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004611 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 int flags,
4613 int prec,
4614 int type,
4615 PyObject *v)
4616{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004617 /* fmt = '%#.' + `prec` + `type`
4618 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619 char fmt[20];
4620 double x;
4621
4622 x = PyFloat_AsDouble(v);
4623 if (x == -1.0 && PyErr_Occurred())
4624 return -1;
4625 if (prec < 0)
4626 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4628 type = 'g';
4629 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004630 /* worst case length calc to ensure no buffer overrun:
4631 fmt = %#.<prec>g
4632 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4633 for any double rep.)
4634 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4635 If prec=0 the effective precision is 1 (the leading digit is
4636 always given), therefore increase by one to 10+prec. */
4637 if (buflen <= (size_t)10 + (size_t)prec) {
4638 PyErr_SetString(PyExc_OverflowError,
4639 "formatted float is too long (precision too long?)");
4640 return -1;
4641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642 return usprintf(buf, fmt, x);
4643}
4644
4645static int
4646formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004647 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 int flags,
4649 int prec,
4650 int type,
4651 PyObject *v)
4652{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004653 /* fmt = '%#.' + `prec` + 'l' + `type`
4654 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655 char fmt[20];
4656 long x;
4657
4658 x = PyInt_AsLong(v);
4659 if (x == -1 && PyErr_Occurred())
4660 return -1;
4661 if (prec < 0)
4662 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004663 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4664 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4665 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4666 PyErr_SetString(PyExc_OverflowError,
4667 "formatted integer is too long (precision too long?)");
4668 return -1;
4669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4671 return usprintf(buf, fmt, x);
4672}
4673
4674static int
4675formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004676 size_t buflen,
4677 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004679 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004680 if (PyUnicode_Check(v)) {
4681 if (PyUnicode_GET_SIZE(v) != 1)
4682 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004686 else if (PyString_Check(v)) {
4687 if (PyString_GET_SIZE(v) != 1)
4688 goto onError;
4689 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691
4692 else {
4693 /* Integer input truncated to a character */
4694 long x;
4695 x = PyInt_AsLong(v);
4696 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 buf[0] = (char) x;
4699 }
4700 buf[1] = '\0';
4701 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004702
4703 onError:
4704 PyErr_SetString(PyExc_TypeError,
4705 "%c requires int or char");
4706 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707}
4708
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
4718
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719PyObject *PyUnicode_Format(PyObject *format,
4720 PyObject *args)
4721{
4722 Py_UNICODE *fmt, *res;
4723 int fmtcnt, rescnt, reslen, arglen, argidx;
4724 int args_owned = 0;
4725 PyUnicodeObject *result = NULL;
4726 PyObject *dict = NULL;
4727 PyObject *uformat;
4728
4729 if (format == NULL || args == NULL) {
4730 PyErr_BadInternalCall();
4731 return NULL;
4732 }
4733 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004734 if (uformat == NULL)
4735 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 fmt = PyUnicode_AS_UNICODE(uformat);
4737 fmtcnt = PyUnicode_GET_SIZE(uformat);
4738
4739 reslen = rescnt = fmtcnt + 100;
4740 result = _PyUnicode_New(reslen);
4741 if (result == NULL)
4742 goto onError;
4743 res = PyUnicode_AS_UNICODE(result);
4744
4745 if (PyTuple_Check(args)) {
4746 arglen = PyTuple_Size(args);
4747 argidx = 0;
4748 }
4749 else {
4750 arglen = -1;
4751 argidx = -2;
4752 }
4753 if (args->ob_type->tp_as_mapping)
4754 dict = args;
4755
4756 while (--fmtcnt >= 0) {
4757 if (*fmt != '%') {
4758 if (--rescnt < 0) {
4759 rescnt = fmtcnt + 100;
4760 reslen += rescnt;
4761 if (_PyUnicode_Resize(result, reslen) < 0)
4762 return NULL;
4763 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4764 --rescnt;
4765 }
4766 *res++ = *fmt++;
4767 }
4768 else {
4769 /* Got a format specifier */
4770 int flags = 0;
4771 int width = -1;
4772 int prec = -1;
4773 int size = 0;
4774 Py_UNICODE c = '\0';
4775 Py_UNICODE fill;
4776 PyObject *v = NULL;
4777 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004778 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 Py_UNICODE sign;
4780 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004781 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782
4783 fmt++;
4784 if (*fmt == '(') {
4785 Py_UNICODE *keystart;
4786 int keylen;
4787 PyObject *key;
4788 int pcount = 1;
4789
4790 if (dict == NULL) {
4791 PyErr_SetString(PyExc_TypeError,
4792 "format requires a mapping");
4793 goto onError;
4794 }
4795 ++fmt;
4796 --fmtcnt;
4797 keystart = fmt;
4798 /* Skip over balanced parentheses */
4799 while (pcount > 0 && --fmtcnt >= 0) {
4800 if (*fmt == ')')
4801 --pcount;
4802 else if (*fmt == '(')
4803 ++pcount;
4804 fmt++;
4805 }
4806 keylen = fmt - keystart - 1;
4807 if (fmtcnt < 0 || pcount > 0) {
4808 PyErr_SetString(PyExc_ValueError,
4809 "incomplete format key");
4810 goto onError;
4811 }
Fred Drakee4315f52000-05-09 19:53:39 +00004812 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 then looked up since Python uses strings to hold
4814 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004815 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 key = PyUnicode_EncodeUTF8(keystart,
4817 keylen,
4818 NULL);
4819 if (key == NULL)
4820 goto onError;
4821 if (args_owned) {
4822 Py_DECREF(args);
4823 args_owned = 0;
4824 }
4825 args = PyObject_GetItem(dict, key);
4826 Py_DECREF(key);
4827 if (args == NULL) {
4828 goto onError;
4829 }
4830 args_owned = 1;
4831 arglen = -1;
4832 argidx = -2;
4833 }
4834 while (--fmtcnt >= 0) {
4835 switch (c = *fmt++) {
4836 case '-': flags |= F_LJUST; continue;
4837 case '+': flags |= F_SIGN; continue;
4838 case ' ': flags |= F_BLANK; continue;
4839 case '#': flags |= F_ALT; continue;
4840 case '0': flags |= F_ZERO; continue;
4841 }
4842 break;
4843 }
4844 if (c == '*') {
4845 v = getnextarg(args, arglen, &argidx);
4846 if (v == NULL)
4847 goto onError;
4848 if (!PyInt_Check(v)) {
4849 PyErr_SetString(PyExc_TypeError,
4850 "* wants int");
4851 goto onError;
4852 }
4853 width = PyInt_AsLong(v);
4854 if (width < 0) {
4855 flags |= F_LJUST;
4856 width = -width;
4857 }
4858 if (--fmtcnt >= 0)
4859 c = *fmt++;
4860 }
4861 else if (c >= '0' && c <= '9') {
4862 width = c - '0';
4863 while (--fmtcnt >= 0) {
4864 c = *fmt++;
4865 if (c < '0' || c > '9')
4866 break;
4867 if ((width*10) / 10 != width) {
4868 PyErr_SetString(PyExc_ValueError,
4869 "width too big");
4870 goto onError;
4871 }
4872 width = width*10 + (c - '0');
4873 }
4874 }
4875 if (c == '.') {
4876 prec = 0;
4877 if (--fmtcnt >= 0)
4878 c = *fmt++;
4879 if (c == '*') {
4880 v = getnextarg(args, arglen, &argidx);
4881 if (v == NULL)
4882 goto onError;
4883 if (!PyInt_Check(v)) {
4884 PyErr_SetString(PyExc_TypeError,
4885 "* wants int");
4886 goto onError;
4887 }
4888 prec = PyInt_AsLong(v);
4889 if (prec < 0)
4890 prec = 0;
4891 if (--fmtcnt >= 0)
4892 c = *fmt++;
4893 }
4894 else if (c >= '0' && c <= '9') {
4895 prec = c - '0';
4896 while (--fmtcnt >= 0) {
4897 c = Py_CHARMASK(*fmt++);
4898 if (c < '0' || c > '9')
4899 break;
4900 if ((prec*10) / 10 != prec) {
4901 PyErr_SetString(PyExc_ValueError,
4902 "prec too big");
4903 goto onError;
4904 }
4905 prec = prec*10 + (c - '0');
4906 }
4907 }
4908 } /* prec */
4909 if (fmtcnt >= 0) {
4910 if (c == 'h' || c == 'l' || c == 'L') {
4911 size = c;
4912 if (--fmtcnt >= 0)
4913 c = *fmt++;
4914 }
4915 }
4916 if (fmtcnt < 0) {
4917 PyErr_SetString(PyExc_ValueError,
4918 "incomplete format");
4919 goto onError;
4920 }
4921 if (c != '%') {
4922 v = getnextarg(args, arglen, &argidx);
4923 if (v == NULL)
4924 goto onError;
4925 }
4926 sign = 0;
4927 fill = ' ';
4928 switch (c) {
4929
4930 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004931 pbuf = formatbuf;
4932 /* presume that buffer length is at least 1 */
4933 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 len = 1;
4935 break;
4936
4937 case 's':
4938 case 'r':
4939 if (PyUnicode_Check(v) && c == 's') {
4940 temp = v;
4941 Py_INCREF(temp);
4942 }
4943 else {
4944 PyObject *unicode;
4945 if (c == 's')
4946 temp = PyObject_Str(v);
4947 else
4948 temp = PyObject_Repr(v);
4949 if (temp == NULL)
4950 goto onError;
4951 if (!PyString_Check(temp)) {
4952 /* XXX Note: this should never happen, since
4953 PyObject_Repr() and PyObject_Str() assure
4954 this */
4955 Py_DECREF(temp);
4956 PyErr_SetString(PyExc_TypeError,
4957 "%s argument has non-string str()");
4958 goto onError;
4959 }
Fred Drakee4315f52000-05-09 19:53:39 +00004960 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004962 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 "strict");
4964 Py_DECREF(temp);
4965 temp = unicode;
4966 if (temp == NULL)
4967 goto onError;
4968 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004969 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 len = PyUnicode_GET_SIZE(temp);
4971 if (prec >= 0 && len > prec)
4972 len = prec;
4973 break;
4974
4975 case 'i':
4976 case 'd':
4977 case 'u':
4978 case 'o':
4979 case 'x':
4980 case 'X':
4981 if (c == 'i')
4982 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004983 pbuf = formatbuf;
4984 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4985 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 if (len < 0)
4987 goto onError;
4988 sign = (c == 'd');
4989 if (flags & F_ZERO) {
4990 fill = '0';
4991 if ((flags&F_ALT) &&
4992 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004993 pbuf[0] == '0' && pbuf[1] == c) {
4994 *res++ = *pbuf++;
4995 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 rescnt -= 2;
4997 len -= 2;
4998 width -= 2;
4999 if (width < 0)
5000 width = 0;
5001 }
5002 }
5003 break;
5004
5005 case 'e':
5006 case 'E':
5007 case 'f':
5008 case 'g':
5009 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005010 pbuf = formatbuf;
5011 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5012 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 if (len < 0)
5014 goto onError;
5015 sign = 1;
5016 if (flags&F_ZERO)
5017 fill = '0';
5018 break;
5019
5020 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005021 pbuf = formatbuf;
5022 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023 if (len < 0)
5024 goto onError;
5025 break;
5026
5027 default:
5028 PyErr_Format(PyExc_ValueError,
5029 "unsupported format character '%c' (0x%x)",
5030 c, c);
5031 goto onError;
5032 }
5033 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005034 if (*pbuf == '-' || *pbuf == '+') {
5035 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 len--;
5037 }
5038 else if (flags & F_SIGN)
5039 sign = '+';
5040 else if (flags & F_BLANK)
5041 sign = ' ';
5042 else
5043 sign = 0;
5044 }
5045 if (width < len)
5046 width = len;
5047 if (rescnt < width + (sign != 0)) {
5048 reslen -= rescnt;
5049 rescnt = width + fmtcnt + 100;
5050 reslen += rescnt;
5051 if (_PyUnicode_Resize(result, reslen) < 0)
5052 return NULL;
5053 res = PyUnicode_AS_UNICODE(result)
5054 + reslen - rescnt;
5055 }
5056 if (sign) {
5057 if (fill != ' ')
5058 *res++ = sign;
5059 rescnt--;
5060 if (width > len)
5061 width--;
5062 }
5063 if (width > len && !(flags & F_LJUST)) {
5064 do {
5065 --rescnt;
5066 *res++ = fill;
5067 } while (--width > len);
5068 }
5069 if (sign && fill == ' ')
5070 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005071 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 res += len;
5073 rescnt -= len;
5074 while (--width >= len) {
5075 --rescnt;
5076 *res++ = ' ';
5077 }
5078 if (dict && (argidx < arglen) && c != '%') {
5079 PyErr_SetString(PyExc_TypeError,
5080 "not all arguments converted");
5081 goto onError;
5082 }
5083 Py_XDECREF(temp);
5084 } /* '%' */
5085 } /* until end */
5086 if (argidx < arglen && !dict) {
5087 PyErr_SetString(PyExc_TypeError,
5088 "not all arguments converted");
5089 goto onError;
5090 }
5091
5092 if (args_owned) {
5093 Py_DECREF(args);
5094 }
5095 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005096 if (_PyUnicode_Resize(result, reslen - rescnt))
5097 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return (PyObject *)result;
5099
5100 onError:
5101 Py_XDECREF(result);
5102 Py_DECREF(uformat);
5103 if (args_owned) {
5104 Py_DECREF(args);
5105 }
5106 return NULL;
5107}
5108
5109static PyBufferProcs unicode_as_buffer = {
5110 (getreadbufferproc) unicode_buffer_getreadbuf,
5111 (getwritebufferproc) unicode_buffer_getwritebuf,
5112 (getsegcountproc) unicode_buffer_getsegcount,
5113 (getcharbufferproc) unicode_buffer_getcharbuf,
5114};
5115
5116PyTypeObject PyUnicode_Type = {
5117 PyObject_HEAD_INIT(&PyType_Type)
5118 0, /* ob_size */
5119 "unicode", /* tp_name */
5120 sizeof(PyUnicodeObject), /* tp_size */
5121 0, /* tp_itemsize */
5122 /* Slots */
5123 (destructor)_PyUnicode_Free, /* tp_dealloc */
5124 0, /* tp_print */
5125 (getattrfunc)unicode_getattr, /* tp_getattr */
5126 0, /* tp_setattr */
5127 (cmpfunc) unicode_compare, /* tp_compare */
5128 (reprfunc) unicode_repr, /* tp_repr */
5129 0, /* tp_as_number */
5130 &unicode_as_sequence, /* tp_as_sequence */
5131 0, /* tp_as_mapping */
5132 (hashfunc) unicode_hash, /* tp_hash*/
5133 0, /* tp_call*/
5134 (reprfunc) unicode_str, /* tp_str */
5135 (getattrofunc) NULL, /* tp_getattro */
5136 (setattrofunc) NULL, /* tp_setattro */
5137 &unicode_as_buffer, /* tp_as_buffer */
5138 Py_TPFLAGS_DEFAULT, /* tp_flags */
5139};
5140
5141/* Initialize the Unicode implementation */
5142
Thomas Wouters78890102000-07-22 19:25:51 +00005143void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144{
5145 /* Doublecheck the configuration... */
5146 if (sizeof(Py_UNICODE) != 2)
5147 Py_FatalError("Unicode configuration error: "
5148 "sizeof(Py_UNICODE) != 2 bytes");
5149
Fred Drakee4315f52000-05-09 19:53:39 +00005150 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005151 unicode_freelist = NULL;
5152 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005154 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155}
5156
5157/* Finalize the Unicode implementation */
5158
5159void
Thomas Wouters78890102000-07-22 19:25:51 +00005160_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161{
5162 PyUnicodeObject *u = unicode_freelist;
5163
5164 while (u != NULL) {
5165 PyUnicodeObject *v = u;
5166 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005167 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005168 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005169 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005170 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005172 unicode_freelist = NULL;
5173 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005175 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176}