blob: 76bb92a117f041fde0169aea032fa07f680e2bfc [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
84/* Limit for the Unicode object free list stay alive optimization.
85
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
89
Barry Warsaw51ac5802000-03-20 16:36:48 +000090 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000091 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000092 malloc()-overhead) bytes of unused garbage.
93
94 Setting the limit to 0 effectively turns the feature off.
95
Guido van Rossumfd4b9572000-04-10 13:51:10 +000096 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
99*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000111/* --- Globals ------------------------------------------------------------
112
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
115
116*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
118/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000119static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000120
121/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124
Fred Drakee4315f52000-05-09 19:53:39 +0000125/* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
127
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
140{
141 void *oldstr;
142
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000143 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
152 }
153
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
162 }
163 unicode->str[length] = 0;
164 unicode->length = length;
165
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000166 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000168 if (unicode->defenc) {
169 Py_DECREF(unicode->defenc);
170 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000171 }
172 unicode->hash = -1;
173
174 return 0;
175}
176
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000177int PyUnicode_Resize(PyObject **unicode,
178 int length)
179{
180 PyUnicodeObject *v;
181
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
190 }
191 return _PyUnicode_Resize(v, length);
192}
193
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205 register PyUnicodeObject *unicode;
206
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
211 }
212
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000216 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000219 /* Keep-Alive optimization: we only upsize the buffer,
220 never downsize it. */
221 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000223 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000224 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000227 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000229 }
230 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 }
232 else {
233 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
234 if (unicode == NULL)
235 return NULL;
236 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
237 }
238
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000239 if (!unicode->str) {
240 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000241 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 unicode->str[length] = 0;
244 unicode->length = length;
245 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000246 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000248
249 onError:
250 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000251 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253}
254
255static
256void _PyUnicode_Free(register PyUnicodeObject *unicode)
257{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000259 /* Keep-Alive optimization */
260 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = NULL;
263 unicode->length = 0;
264 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000265 if (unicode->defenc) {
266 Py_DECREF(unicode->defenc);
267 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000268 }
269 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 *(PyUnicodeObject **)unicode = unicode_freelist;
271 unicode_freelist = unicode;
272 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 }
274 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000275 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000276 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000277 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279}
280
281PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
282 int size)
283{
284 PyUnicodeObject *unicode;
285
286 unicode = _PyUnicode_New(size);
287 if (!unicode)
288 return NULL;
289
290 /* Copy the Unicode data into the new object */
291 if (u != NULL)
292 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
293
294 return (PyObject *)unicode;
295}
296
297#ifdef HAVE_WCHAR_H
298
299PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
300 int size)
301{
302 PyUnicodeObject *unicode;
303
304 if (w == NULL) {
305 PyErr_BadInternalCall();
306 return NULL;
307 }
308
309 unicode = _PyUnicode_New(size);
310 if (!unicode)
311 return NULL;
312
313 /* Copy the wchar_t data into the new object */
314#ifdef HAVE_USABLE_WCHAR_T
315 memcpy(unicode->str, w, size * sizeof(wchar_t));
316#else
317 {
318 register Py_UNICODE *u;
319 register int i;
320 u = PyUnicode_AS_UNICODE(unicode);
321 for (i = size; i >= 0; i--)
322 *u++ = *w++;
323 }
324#endif
325
326 return (PyObject *)unicode;
327}
328
329int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
330 register wchar_t *w,
331 int size)
332{
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
336 }
337 if (size > PyUnicode_GET_SIZE(unicode))
338 size = PyUnicode_GET_SIZE(unicode);
339#ifdef HAVE_USABLE_WCHAR_T
340 memcpy(w, unicode->str, size * sizeof(wchar_t));
341#else
342 {
343 register Py_UNICODE *u;
344 register int i;
345 u = PyUnicode_AS_UNICODE(unicode);
346 for (i = size; i >= 0; i--)
347 *w++ = *u++;
348 }
349#endif
350
351 return size;
352}
353
354#endif
355
356PyObject *PyUnicode_FromObject(register PyObject *obj)
357{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000358 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
359}
360
361PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
362 const char *encoding,
363 const char *errors)
364{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 const char *s;
366 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000367 int owned = 0;
368 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370 if (obj == NULL) {
371 PyErr_BadInternalCall();
372 return NULL;
373 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000374
375 /* Coerce object */
376 if (PyInstance_Check(obj)) {
377 PyObject *func;
378 func = PyObject_GetAttrString(obj, "__str__");
379 if (func == NULL) {
380 PyErr_SetString(PyExc_TypeError,
381 "coercing to Unicode: instance doesn't define __str__");
382 return NULL;
383 }
384 obj = PyEval_CallObject(func, NULL);
385 Py_DECREF(func);
386 if (obj == NULL)
387 return NULL;
388 owned = 1;
389 }
390 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000392 v = obj;
393 if (encoding) {
394 PyErr_SetString(PyExc_TypeError,
395 "decoding Unicode is not supported");
396 return NULL;
397 }
398 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 }
400 else if (PyString_Check(obj)) {
401 s = PyString_AS_STRING(obj);
402 len = PyString_GET_SIZE(obj);
403 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000404 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
405 /* Overwrite the error message with something more useful in
406 case of a TypeError. */
407 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000408 PyErr_Format(PyExc_TypeError,
409 "coercing to Unicode: need string or buffer, "
410 "%.80s found",
411 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414
415 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 if (len == 0) {
417 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 else
421 v = PyUnicode_Decode(s, len, encoding, errors);
422 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return v;
427
428 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000429 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000430 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000431 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433}
434
435PyObject *PyUnicode_Decode(const char *s,
436 int size,
437 const char *encoding,
438 const char *errors)
439{
440 PyObject *buffer = NULL, *unicode;
441
Fred Drakee4315f52000-05-09 19:53:39 +0000442 if (encoding == NULL)
443 encoding = PyUnicode_GetDefaultEncoding();
444
445 /* Shortcuts for common default encodings */
446 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000448 else if (strcmp(encoding, "latin-1") == 0)
449 return PyUnicode_DecodeLatin1(s, size, errors);
450 else if (strcmp(encoding, "ascii") == 0)
451 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452
453 /* Decode via the codec registry */
454 buffer = PyBuffer_FromMemory((void *)s, size);
455 if (buffer == NULL)
456 goto onError;
457 unicode = PyCodec_Decode(buffer, encoding, errors);
458 if (unicode == NULL)
459 goto onError;
460 if (!PyUnicode_Check(unicode)) {
461 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000462 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 unicode->ob_type->tp_name);
464 Py_DECREF(unicode);
465 goto onError;
466 }
467 Py_DECREF(buffer);
468 return unicode;
469
470 onError:
471 Py_XDECREF(buffer);
472 return NULL;
473}
474
475PyObject *PyUnicode_Encode(const Py_UNICODE *s,
476 int size,
477 const char *encoding,
478 const char *errors)
479{
480 PyObject *v, *unicode;
481
482 unicode = PyUnicode_FromUnicode(s, size);
483 if (unicode == NULL)
484 return NULL;
485 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
486 Py_DECREF(unicode);
487 return v;
488}
489
490PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
491 const char *encoding,
492 const char *errors)
493{
494 PyObject *v;
495
496 if (!PyUnicode_Check(unicode)) {
497 PyErr_BadArgument();
498 goto onError;
499 }
Fred Drakee4315f52000-05-09 19:53:39 +0000500
501 if (encoding == NULL)
502 encoding = PyUnicode_GetDefaultEncoding();
503
504 /* Shortcuts for common default encodings */
505 if (errors == NULL) {
506 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000508 else if (strcmp(encoding, "latin-1") == 0)
509 return PyUnicode_AsLatin1String(unicode);
510 else if (strcmp(encoding, "ascii") == 0)
511 return PyUnicode_AsASCIIString(unicode);
512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 /* Encode via the codec registry */
515 v = PyCodec_Encode(unicode, encoding, errors);
516 if (v == NULL)
517 goto onError;
518 /* XXX Should we really enforce this ? */
519 if (!PyString_Check(v)) {
520 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000521 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522 v->ob_type->tp_name);
523 Py_DECREF(v);
524 goto onError;
525 }
526 return v;
527
528 onError:
529 return NULL;
530}
531
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000532/* Return a Python string holding the default encoded value of the
533 Unicode object.
534
535 The resulting string is cached in the Unicode object for subsequent
536 usage by this function. The cached version is needed to implement
537 the character buffer interface and will live (at least) as long as
538 the Unicode object itself.
539
540 The refcount of the string is *not* incremented.
541
542 *** Exported for internal use by the interpreter only !!! ***
543
544*/
545
546PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
547 const char *errors)
548{
549 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
550
551 if (v)
552 return v;
553 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
554 if (v && errors == NULL)
555 ((PyUnicodeObject *)unicode)->defenc = v;
556 return v;
557}
558
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
560{
561 if (!PyUnicode_Check(unicode)) {
562 PyErr_BadArgument();
563 goto onError;
564 }
565 return PyUnicode_AS_UNICODE(unicode);
566
567 onError:
568 return NULL;
569}
570
571int PyUnicode_GetSize(PyObject *unicode)
572{
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
576 }
577 return PyUnicode_GET_SIZE(unicode);
578
579 onError:
580 return -1;
581}
582
Thomas Wouters78890102000-07-22 19:25:51 +0000583const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000584{
585 return unicode_default_encoding;
586}
587
588int PyUnicode_SetDefaultEncoding(const char *encoding)
589{
590 PyObject *v;
591
592 /* Make sure the encoding is valid. As side effect, this also
593 loads the encoding into the codec registry cache. */
594 v = _PyCodec_Lookup(encoding);
595 if (v == NULL)
596 goto onError;
597 Py_DECREF(v);
598 strncpy(unicode_default_encoding,
599 encoding,
600 sizeof(unicode_default_encoding));
601 return 0;
602
603 onError:
604 return -1;
605}
606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- UTF-8 Codec -------------------------------------------------------- */
608
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero means the byte is not a legal lead byte.  See RFC 2279 for
   details.  The table is never modified, hence const. */

static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four-, five- and six-byte sequences; FE/FF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
630
631static
632int utf8_decoding_error(const char **source,
633 Py_UNICODE **dest,
634 const char *errors,
635 const char *details)
636{
637 if ((errors == NULL) ||
638 (strcmp(errors,"strict") == 0)) {
639 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000640 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641 details);
642 return -1;
643 }
644 else if (strcmp(errors,"ignore") == 0) {
645 (*source)++;
646 return 0;
647 }
648 else if (strcmp(errors,"replace") == 0) {
649 (*source)++;
650 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
651 (*dest)++;
652 return 0;
653 }
654 else {
655 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000656 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 errors);
658 return -1;
659 }
660}
661
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662PyObject *PyUnicode_DecodeUTF8(const char *s,
663 int size,
664 const char *errors)
665{
666 int n;
667 const char *e;
668 PyUnicodeObject *unicode;
669 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000670 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671
672 /* Note: size will always be longer than the resulting Unicode
673 character count */
674 unicode = _PyUnicode_New(size);
675 if (!unicode)
676 return NULL;
677 if (size == 0)
678 return (PyObject *)unicode;
679
680 /* Unpack UTF-8 encoded data */
681 p = unicode->str;
682 e = s + size;
683
684 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000685 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686
687 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000688 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 s++;
690 continue;
691 }
692
693 n = utf8_code_length[ch];
694
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 if (s + n > e) {
696 errmsg = "unexpected end of data";
697 goto utf8Error;
698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699
700 switch (n) {
701
702 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000703 errmsg = "unexpected code byte";
704 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705 break;
706
707 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000708 errmsg = "internal error";
709 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000710 break;
711
712 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 if ((s[1] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 if (ch < 0x80) {
719 errmsg = "illegal encoding";
720 goto utf8Error;
721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000723 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724 break;
725
726 case 3:
727 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 (s[2] & 0xc0) != 0x80) {
729 errmsg = "invalid data";
730 goto utf8Error;
731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000733 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
734 errmsg = "illegal encoding";
735 goto utf8Error;
736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000738 *p++ = (Py_UNICODE)ch;
739 break;
740
741 case 4:
742 if ((s[1] & 0xc0) != 0x80 ||
743 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 (s[3] & 0xc0) != 0x80) {
745 errmsg = "invalid data";
746 goto utf8Error;
747 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
750 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 if ((ch < 0x10000) || /* minimum value allowed for 4
752 byte encoding */
753 (ch > 0x10ffff)) { /* maximum value allowed for
754 UTF-16 */
755 errmsg = "illegal encoding";
756 goto utf8Error;
757 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000758 /* compute and append the two surrogates: */
759
760 /* translate from 10000..10FFFF to 0..FFFF */
761 ch -= 0x10000;
762
763 /* high surrogate = top 10 bits added to D800 */
764 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
765
766 /* low surrogate = bottom 10 bits added to DC00 */
767 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 break;
769
770 default:
771 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000772 errmsg = "unsupported Unicode code range";
773 goto utf8Error;
774 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775 }
776 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000777 continue;
778
779 utf8Error:
780 if (utf8_decoding_error(&s, &p, errors, errmsg))
781 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 }
783
784 /* Adjust length */
785 if (_PyUnicode_Resize(unicode, p - unicode->str))
786 goto onError;
787
788 return (PyObject *)unicode;
789
790onError:
791 Py_DECREF(unicode);
792 return NULL;
793}
794
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000795/* Not used anymore, now that the encoder supports UTF-16
796 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000797#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798static
799int utf8_encoding_error(const Py_UNICODE **source,
800 char **dest,
801 const char *errors,
802 const char *details)
803{
804 if ((errors == NULL) ||
805 (strcmp(errors,"strict") == 0)) {
806 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000807 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 details);
809 return -1;
810 }
811 else if (strcmp(errors,"ignore") == 0) {
812 return 0;
813 }
814 else if (strcmp(errors,"replace") == 0) {
815 **dest = '?';
816 (*dest)++;
817 return 0;
818 }
819 else {
820 PyErr_Format(PyExc_ValueError,
821 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000822 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 errors);
824 return -1;
825 }
826}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000827#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000828
829PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
830 int size,
831 const char *errors)
832{
833 PyObject *v;
834 char *p;
835 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000836 Py_UCS4 ch2;
837 unsigned int cbAllocated = 3 * size;
838 unsigned int cbWritten = 0;
839 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000842 if (v == NULL)
843 return NULL;
844 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000845 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000846
847 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 while (i < size) {
849 Py_UCS4 ch = s[i++];
850 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000852 cbWritten++;
853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 else if (ch < 0x0800) {
855 *p++ = 0xc0 | (ch >> 6);
856 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000857 cbWritten += 2;
858 }
859 else {
860 /* Check for high surrogate */
861 if (0xD800 <= ch && ch <= 0xDBFF) {
862 if (i != size) {
863 ch2 = s[i];
864 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
865
866 if (cbWritten >= (cbAllocated - 4)) {
867 /* Provide enough room for some more
868 surrogates */
869 cbAllocated += 4*10;
870 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000871 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000872 }
873
874 /* combine the two values */
875 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
876
877 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000878 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 i++;
880 cbWritten += 4;
881 }
882 }
883 }
884 else {
885 *p++ = (char)(0xe0 | (ch >> 12));
886 cbWritten += 3;
887 }
888 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
889 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 }
891 }
892 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000893 if (_PyString_Resize(&v, p - q))
894 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895 return v;
896
897 onError:
898 Py_DECREF(v);
899 return NULL;
900}
901
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
903{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 if (!PyUnicode_Check(unicode)) {
905 PyErr_BadArgument();
906 return NULL;
907 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000908 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
909 PyUnicode_GET_SIZE(unicode),
910 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911}
912
913/* --- UTF-16 Codec ------------------------------------------------------- */
914
915static
916int utf16_decoding_error(const Py_UNICODE **source,
917 Py_UNICODE **dest,
918 const char *errors,
919 const char *details)
920{
921 if ((errors == NULL) ||
922 (strcmp(errors,"strict") == 0)) {
923 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000924 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000925 details);
926 return -1;
927 }
928 else if (strcmp(errors,"ignore") == 0) {
929 return 0;
930 }
931 else if (strcmp(errors,"replace") == 0) {
932 if (dest) {
933 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
934 (*dest)++;
935 }
936 return 0;
937 }
938 else {
939 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000940 "UTF-16 decoding error; "
941 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000942 errors);
943 return -1;
944 }
945}
946
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947PyObject *PyUnicode_DecodeUTF16(const char *s,
948 int size,
949 const char *errors,
950 int *byteorder)
951{
952 PyUnicodeObject *unicode;
953 Py_UNICODE *p;
954 const Py_UNICODE *q, *e;
955 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000956 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000957
958 /* size should be an even number */
959 if (size % sizeof(Py_UNICODE) != 0) {
960 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
961 return NULL;
962 /* The remaining input chars are ignored if we fall through
963 here... */
964 }
965
966 /* Note: size will always be longer than the resulting Unicode
967 character count */
968 unicode = _PyUnicode_New(size);
969 if (!unicode)
970 return NULL;
971 if (size == 0)
972 return (PyObject *)unicode;
973
974 /* Unpack UTF-16 encoded data */
975 p = unicode->str;
976 q = (Py_UNICODE *)s;
977 e = q + (size / sizeof(Py_UNICODE));
978
979 if (byteorder)
980 bo = *byteorder;
981
982 while (q < e) {
983 register Py_UNICODE ch = *q++;
984
985 /* Check for BOM marks (U+FEFF) in the input and adjust
986 current byte order setting accordingly. Swap input
987 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
988 !) */
989#ifdef BYTEORDER_IS_LITTLE_ENDIAN
990 if (ch == 0xFEFF) {
991 bo = -1;
992 continue;
993 } else if (ch == 0xFFFE) {
994 bo = 1;
995 continue;
996 }
997 if (bo == 1)
998 ch = (ch >> 8) | (ch << 8);
999#else
1000 if (ch == 0xFEFF) {
1001 bo = 1;
1002 continue;
1003 } else if (ch == 0xFFFE) {
1004 bo = -1;
1005 continue;
1006 }
1007 if (bo == -1)
1008 ch = (ch >> 8) | (ch << 8);
1009#endif
1010 if (ch < 0xD800 || ch > 0xDFFF) {
1011 *p++ = ch;
1012 continue;
1013 }
1014
1015 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001016 if (q >= e) {
1017 errmsg = "unexpected end of data";
1018 goto utf16Error;
1019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020 if (0xDC00 <= *q && *q <= 0xDFFF) {
1021 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001022 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 /* This is valid data (a UTF-16 surrogate pair), but
1024 we are not able to store this information since our
1025 Py_UNICODE type only has 16 bits... this might
1026 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001027 errmsg = "code pairs are not supported";
1028 goto utf16Error;
1029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030 else
1031 continue;
1032 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001033 errmsg = "illegal encoding";
1034 /* Fall through to report the error */
1035
1036 utf16Error:
1037 if (utf16_decoding_error(&q, &p, errors, errmsg))
1038 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 }
1040
1041 if (byteorder)
1042 *byteorder = bo;
1043
1044 /* Adjust length */
1045 if (_PyUnicode_Resize(unicode, p - unicode->str))
1046 goto onError;
1047
1048 return (PyObject *)unicode;
1049
1050onError:
1051 Py_DECREF(unicode);
1052 return NULL;
1053}
1054
1055#undef UTF16_ERROR
1056
1057PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1058 int size,
1059 const char *errors,
1060 int byteorder)
1061{
1062 PyObject *v;
1063 Py_UNICODE *p;
1064 char *q;
1065
1066 /* We don't create UTF-16 pairs... */
1067 v = PyString_FromStringAndSize(NULL,
1068 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1069 if (v == NULL)
1070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001071
1072 q = PyString_AS_STRING(v);
1073 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 if (byteorder == 0)
1075 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001076 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001077 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 if (byteorder == 0 ||
1079#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1080 byteorder == -1
1081#else
1082 byteorder == 1
1083#endif
1084 )
1085 memcpy(p, s, size * sizeof(Py_UNICODE));
1086 else
1087 while (size-- > 0) {
1088 Py_UNICODE ch = *s++;
1089 *p++ = (ch >> 8) | (ch << 8);
1090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 return v;
1092}
1093
1094PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1095{
1096 if (!PyUnicode_Check(unicode)) {
1097 PyErr_BadArgument();
1098 return NULL;
1099 }
1100 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1101 PyUnicode_GET_SIZE(unicode),
1102 NULL,
1103 0);
1104}
1105
1106/* --- Unicode Escape Codec ----------------------------------------------- */
1107
1108static
1109int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001110 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 const char *errors,
1112 const char *details)
1113{
1114 if ((errors == NULL) ||
1115 (strcmp(errors,"strict") == 0)) {
1116 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001117 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 details);
1119 return -1;
1120 }
1121 else if (strcmp(errors,"ignore") == 0) {
1122 return 0;
1123 }
1124 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001125 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126 return 0;
1127 }
1128 else {
1129 PyErr_Format(PyExc_ValueError,
1130 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001131 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 errors);
1133 return -1;
1134 }
1135}
1136
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001137static _Py_UCNHashAPI *pucnHash = NULL;
1138
/* Case-insensitive comparison of at most `count` characters.

   Returns 0 when the compared prefixes are equal ignoring case, and
   otherwise the difference between the first pair of lower-cased
   characters that differ.  A `count` of 0 always compares equal.

   Comparison stops at a NUL terminator shared by both strings, so a
   `count` larger than the strings never reads beyond them. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    if (count == 0)
        return 0;

    do {
        /* tolower() is only defined for unsigned char values (and
           EOF), so plain char must be converted first. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
1158
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1160 int size,
1161 const char *errors)
1162{
1163 PyUnicodeObject *v;
1164 Py_UNICODE *p = NULL, *buf = NULL;
1165 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001166 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167
1168 /* Escaped strings will always be longer than the resulting
1169 Unicode string, so we start with size here and then reduce the
1170 length after conversion to the true value. */
1171 v = _PyUnicode_New(size);
1172 if (v == NULL)
1173 goto onError;
1174 if (size == 0)
1175 return (PyObject *)v;
1176 p = buf = PyUnicode_AS_UNICODE(v);
1177 end = s + size;
1178 while (s < end) {
1179 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001180 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 int i;
1182
1183 /* Non-escape characters are interpreted as Unicode ordinals */
1184 if (*s != '\\') {
1185 *p++ = (unsigned char)*s++;
1186 continue;
1187 }
1188
1189 /* \ - Escapes */
1190 s++;
1191 switch (*s++) {
1192
1193 /* \x escapes */
1194 case '\n': break;
1195 case '\\': *p++ = '\\'; break;
1196 case '\'': *p++ = '\''; break;
1197 case '\"': *p++ = '\"'; break;
1198 case 'b': *p++ = '\b'; break;
1199 case 'f': *p++ = '\014'; break; /* FF */
1200 case 't': *p++ = '\t'; break;
1201 case 'n': *p++ = '\n'; break;
1202 case 'r': *p++ = '\r'; break;
1203 case 'v': *p++ = '\013'; break; /* VT */
1204 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1205
1206 /* \OOO (octal) escapes */
1207 case '0': case '1': case '2': case '3':
1208 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001209 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001211 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001213 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001215 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 break;
1217
Fredrik Lundhdf846752000-09-03 11:29:49 +00001218 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001220 for (x = 0, i = 0; i < 2; i++) {
1221 c = (unsigned char)s[i];
1222 if (!isxdigit(c)) {
1223 if (unicodeescape_decoding_error(&s, &x, errors,
1224 "truncated \\xXX"))
1225 goto onError;
1226 i++;
1227 break;
1228 }
1229 x = (x<<4) & ~0xF;
1230 if (c >= '0' && c <= '9')
1231 x += c - '0';
1232 else if (c >= 'a' && c <= 'f')
1233 x += 10 + c - 'a';
1234 else
1235 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001237 s += i;
1238 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 break;
1240
1241 /* \uXXXX with 4 hex digits */
1242 case 'u':
1243 for (x = 0, i = 0; i < 4; i++) {
1244 c = (unsigned char)s[i];
1245 if (!isxdigit(c)) {
1246 if (unicodeescape_decoding_error(&s, &x, errors,
1247 "truncated \\uXXXX"))
1248 goto onError;
1249 i++;
1250 break;
1251 }
1252 x = (x<<4) & ~0xF;
1253 if (c >= '0' && c <= '9')
1254 x += c - '0';
1255 else if (c >= 'a' && c <= 'f')
1256 x += 10 + c - 'a';
1257 else
1258 x += 10 + c - 'A';
1259 }
1260 s += i;
1261 *p++ = x;
1262 break;
1263
Fredrik Lundhdf846752000-09-03 11:29:49 +00001264 /* \UXXXXXXXX with 8 hex digits */
1265 case 'U':
1266 for (chr = 0, i = 0; i < 8; i++) {
1267 c = (unsigned char)s[i];
1268 if (!isxdigit(c)) {
1269 if (unicodeescape_decoding_error(&s, &x, errors,
1270 "truncated \\uXXXX"))
1271 goto onError;
1272 i++;
1273 break;
1274 }
1275 chr = (chr<<4) & ~0xF;
1276 if (c >= '0' && c <= '9')
1277 chr += c - '0';
1278 else if (c >= 'a' && c <= 'f')
1279 chr += 10 + c - 'a';
1280 else
1281 chr += 10 + c - 'A';
1282 }
1283 s += i;
1284 goto store;
1285
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001286 case 'N':
1287 /* Ok, we need to deal with Unicode Character Names now,
1288 * make sure we've imported the hash table data...
1289 */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001290 if (pucnHash == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001291 PyObject *mod = 0, *v = 0;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001292 mod = PyImport_ImportModule("ucnhash");
1293 if (mod == NULL)
1294 goto onError;
1295 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1296 Py_DECREF(mod);
1297 if (v == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001298 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001299 pucnHash = PyCObject_AsVoidPtr(v);
1300 Py_DECREF(v);
1301 if (pucnHash == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001302 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001303 }
1304
Fredrik Lundhdf846752000-09-03 11:29:49 +00001305 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001306 const char *start = s + 1;
1307 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001308 unsigned long j;
1309
1310 /* look for either the closing brace, or we
1311 * exceed the maximum length of the unicode character names
1312 */
1313 while (*endBrace != '}' &&
1314 (unsigned int)(endBrace - start) <=
1315 pucnHash->cchMax &&
1316 endBrace < end)
1317 {
1318 endBrace++;
1319 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001320 if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001321 j = pucnHash->hash(start, endBrace - start);
1322 if (j > pucnHash->cKeys ||
1323 mystrnicmp(
1324 start,
1325 ((_Py_UnicodeCharacterName *)
1326 (pucnHash->getValue(j)))->pszUCN,
1327 (int)(endBrace - start)) != 0)
1328 {
1329 if (unicodeescape_decoding_error(
1330 &s, &x, errors,
1331 "Invalid Unicode Character Name"))
1332 {
1333 goto onError;
1334 }
1335 goto ucnFallthrough;
1336 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001337 chr = ((_Py_UnicodeCharacterName *)
1338 (pucnHash->getValue(j)))->value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001339 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001340 goto store;
1341 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001342 if (unicodeescape_decoding_error(
1343 &s, &x, errors,
1344 "Unicode name missing closing brace"))
1345 goto onError;
1346 goto ucnFallthrough;
1347 }
1348 break;
1349 }
1350 if (unicodeescape_decoding_error(
1351 &s, &x, errors,
1352 "Missing opening brace for Unicode Character Name escape"))
1353 goto onError;
1354ucnFallthrough:
1355 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001356 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 *p++ = '\\';
1358 *p++ = (unsigned char)s[-1];
1359 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001360store:
1361 /* when we get here, chr is a 32-bit unicode character */
1362 if (chr <= 0xffff)
1363 /* UCS-2 character */
1364 *p++ = (Py_UNICODE) chr;
1365 else if (chr <= 0x10ffff) {
1366 /* UCS-4 character. store as two surrogate characters */
1367 chr -= 0x10000L;
1368 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1369 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1370 } else {
1371 if (unicodeescape_decoding_error(
1372 &s, &x, errors,
1373 "Illegal Unicode character")
1374 )
1375 goto onError;
1376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
1378 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001379 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001380 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 return (PyObject *)v;
1382
1383 onError:
1384 Py_XDECREF(v);
1385 return NULL;
1386}
1387
1388/* Return a Unicode-Escape string version of the Unicode object.
1389
1390 If quotes is true, the string is enclosed in u"" or u'' quotes as
1391 appropriate.
1392
1393*/
1394
Barry Warsaw51ac5802000-03-20 16:36:48 +00001395static const Py_UNICODE *findchar(const Py_UNICODE *s,
1396 int size,
1397 Py_UNICODE ch);
1398
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399static
1400PyObject *unicodeescape_string(const Py_UNICODE *s,
1401 int size,
1402 int quotes)
1403{
1404 PyObject *repr;
1405 char *p;
1406 char *q;
1407
1408 static const char *hexdigit = "0123456789ABCDEF";
1409
1410 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1411 if (repr == NULL)
1412 return NULL;
1413
1414 p = q = PyString_AS_STRING(repr);
1415
1416 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417 *p++ = 'u';
1418 *p++ = (findchar(s, size, '\'') &&
1419 !findchar(s, size, '"')) ? '"' : '\'';
1420 }
1421 while (size-- > 0) {
1422 Py_UNICODE ch = *s++;
1423 /* Escape quotes */
1424 if (quotes && (ch == q[1] || ch == '\\')) {
1425 *p++ = '\\';
1426 *p++ = (char) ch;
1427 }
1428 /* Map 16-bit characters to '\uxxxx' */
1429 else if (ch >= 256) {
1430 *p++ = '\\';
1431 *p++ = 'u';
1432 *p++ = hexdigit[(ch >> 12) & 0xf];
1433 *p++ = hexdigit[(ch >> 8) & 0xf];
1434 *p++ = hexdigit[(ch >> 4) & 0xf];
1435 *p++ = hexdigit[ch & 15];
1436 }
1437 /* Map non-printable US ASCII to '\ooo' */
1438 else if (ch < ' ' || ch >= 128) {
1439 *p++ = '\\';
1440 *p++ = hexdigit[(ch >> 6) & 7];
1441 *p++ = hexdigit[(ch >> 3) & 7];
1442 *p++ = hexdigit[ch & 7];
1443 }
1444 /* Copy everything else as-is */
1445 else
1446 *p++ = (char) ch;
1447 }
1448 if (quotes)
1449 *p++ = q[1];
1450
1451 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001452 if (_PyString_Resize(&repr, p - q))
1453 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454
1455 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001456
1457 onError:
1458 Py_DECREF(repr);
1459 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001460}
1461
1462PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1463 int size)
1464{
1465 return unicodeescape_string(s, size, 0);
1466}
1467
1468PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1469{
1470 if (!PyUnicode_Check(unicode)) {
1471 PyErr_BadArgument();
1472 return NULL;
1473 }
1474 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1475 PyUnicode_GET_SIZE(unicode));
1476}
1477
1478/* --- Raw Unicode Escape Codec ------------------------------------------- */
1479
1480PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1481 int size,
1482 const char *errors)
1483{
1484 PyUnicodeObject *v;
1485 Py_UNICODE *p, *buf;
1486 const char *end;
1487 const char *bs;
1488
1489 /* Escaped strings will always be longer than the resulting
1490 Unicode string, so we start with size here and then reduce the
1491 length after conversion to the true value. */
1492 v = _PyUnicode_New(size);
1493 if (v == NULL)
1494 goto onError;
1495 if (size == 0)
1496 return (PyObject *)v;
1497 p = buf = PyUnicode_AS_UNICODE(v);
1498 end = s + size;
1499 while (s < end) {
1500 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001501 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502 int i;
1503
1504 /* Non-escape characters are interpreted as Unicode ordinals */
1505 if (*s != '\\') {
1506 *p++ = (unsigned char)*s++;
1507 continue;
1508 }
1509
1510 /* \u-escapes are only interpreted iff the number of leading
1511 backslashes if odd */
1512 bs = s;
1513 for (;s < end;) {
1514 if (*s != '\\')
1515 break;
1516 *p++ = (unsigned char)*s++;
1517 }
1518 if (((s - bs) & 1) == 0 ||
1519 s >= end ||
1520 *s != 'u') {
1521 continue;
1522 }
1523 p--;
1524 s++;
1525
1526 /* \uXXXX with 4 hex digits */
1527 for (x = 0, i = 0; i < 4; i++) {
1528 c = (unsigned char)s[i];
1529 if (!isxdigit(c)) {
1530 if (unicodeescape_decoding_error(&s, &x, errors,
1531 "truncated \\uXXXX"))
1532 goto onError;
1533 i++;
1534 break;
1535 }
1536 x = (x<<4) & ~0xF;
1537 if (c >= '0' && c <= '9')
1538 x += c - '0';
1539 else if (c >= 'a' && c <= 'f')
1540 x += 10 + c - 'a';
1541 else
1542 x += 10 + c - 'A';
1543 }
1544 s += i;
1545 *p++ = x;
1546 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001547 if (_PyUnicode_Resize(v, (int)(p - buf)))
1548 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 return (PyObject *)v;
1550
1551 onError:
1552 Py_XDECREF(v);
1553 return NULL;
1554}
1555
1556PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1557 int size)
1558{
1559 PyObject *repr;
1560 char *p;
1561 char *q;
1562
1563 static const char *hexdigit = "0123456789ABCDEF";
1564
1565 repr = PyString_FromStringAndSize(NULL, 6 * size);
1566 if (repr == NULL)
1567 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001568 if (size == 0)
1569 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570
1571 p = q = PyString_AS_STRING(repr);
1572 while (size-- > 0) {
1573 Py_UNICODE ch = *s++;
1574 /* Map 16-bit characters to '\uxxxx' */
1575 if (ch >= 256) {
1576 *p++ = '\\';
1577 *p++ = 'u';
1578 *p++ = hexdigit[(ch >> 12) & 0xf];
1579 *p++ = hexdigit[(ch >> 8) & 0xf];
1580 *p++ = hexdigit[(ch >> 4) & 0xf];
1581 *p++ = hexdigit[ch & 15];
1582 }
1583 /* Copy everything else as-is */
1584 else
1585 *p++ = (char) ch;
1586 }
1587 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001588 if (_PyString_Resize(&repr, p - q))
1589 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590
1591 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001592
1593 onError:
1594 Py_DECREF(repr);
1595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596}
1597
1598PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1599{
1600 if (!PyUnicode_Check(unicode)) {
1601 PyErr_BadArgument();
1602 return NULL;
1603 }
1604 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1605 PyUnicode_GET_SIZE(unicode));
1606}
1607
1608/* --- Latin-1 Codec ------------------------------------------------------ */
1609
1610PyObject *PyUnicode_DecodeLatin1(const char *s,
1611 int size,
1612 const char *errors)
1613{
1614 PyUnicodeObject *v;
1615 Py_UNICODE *p;
1616
1617 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1618 v = _PyUnicode_New(size);
1619 if (v == NULL)
1620 goto onError;
1621 if (size == 0)
1622 return (PyObject *)v;
1623 p = PyUnicode_AS_UNICODE(v);
1624 while (size-- > 0)
1625 *p++ = (unsigned char)*s++;
1626 return (PyObject *)v;
1627
1628 onError:
1629 Py_XDECREF(v);
1630 return NULL;
1631}
1632
1633static
1634int latin1_encoding_error(const Py_UNICODE **source,
1635 char **dest,
1636 const char *errors,
1637 const char *details)
1638{
1639 if ((errors == NULL) ||
1640 (strcmp(errors,"strict") == 0)) {
1641 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001642 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 details);
1644 return -1;
1645 }
1646 else if (strcmp(errors,"ignore") == 0) {
1647 return 0;
1648 }
1649 else if (strcmp(errors,"replace") == 0) {
1650 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001651 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 return 0;
1653 }
1654 else {
1655 PyErr_Format(PyExc_ValueError,
1656 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001657 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658 errors);
1659 return -1;
1660 }
1661}
1662
1663PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1664 int size,
1665 const char *errors)
1666{
1667 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001668 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001669
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 repr = PyString_FromStringAndSize(NULL, size);
1671 if (repr == NULL)
1672 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001673 if (size == 0)
1674 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675
1676 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001677 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678 while (size-- > 0) {
1679 Py_UNICODE ch = *p++;
1680 if (ch >= 256) {
1681 if (latin1_encoding_error(&p, &s, errors,
1682 "ordinal not in range(256)"))
1683 goto onError;
1684 }
1685 else
1686 *s++ = (char)ch;
1687 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001688 /* Resize if error handling skipped some characters */
1689 if (s - start < PyString_GET_SIZE(repr))
1690 if (_PyString_Resize(&repr, s - start))
1691 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 return repr;
1693
1694 onError:
1695 Py_DECREF(repr);
1696 return NULL;
1697}
1698
1699PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1700{
1701 if (!PyUnicode_Check(unicode)) {
1702 PyErr_BadArgument();
1703 return NULL;
1704 }
1705 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1706 PyUnicode_GET_SIZE(unicode),
1707 NULL);
1708}
1709
1710/* --- 7-bit ASCII Codec -------------------------------------------------- */
1711
1712static
1713int ascii_decoding_error(const char **source,
1714 Py_UNICODE **dest,
1715 const char *errors,
1716 const char *details)
1717{
1718 if ((errors == NULL) ||
1719 (strcmp(errors,"strict") == 0)) {
1720 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001721 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 details);
1723 return -1;
1724 }
1725 else if (strcmp(errors,"ignore") == 0) {
1726 return 0;
1727 }
1728 else if (strcmp(errors,"replace") == 0) {
1729 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1730 (*dest)++;
1731 return 0;
1732 }
1733 else {
1734 PyErr_Format(PyExc_ValueError,
1735 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001736 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 errors);
1738 return -1;
1739 }
1740}
1741
1742PyObject *PyUnicode_DecodeASCII(const char *s,
1743 int size,
1744 const char *errors)
1745{
1746 PyUnicodeObject *v;
1747 Py_UNICODE *p;
1748
1749 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1750 v = _PyUnicode_New(size);
1751 if (v == NULL)
1752 goto onError;
1753 if (size == 0)
1754 return (PyObject *)v;
1755 p = PyUnicode_AS_UNICODE(v);
1756 while (size-- > 0) {
1757 register unsigned char c;
1758
1759 c = (unsigned char)*s++;
1760 if (c < 128)
1761 *p++ = c;
1762 else if (ascii_decoding_error(&s, &p, errors,
1763 "ordinal not in range(128)"))
1764 goto onError;
1765 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001766 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1767 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1768 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 return (PyObject *)v;
1770
1771 onError:
1772 Py_XDECREF(v);
1773 return NULL;
1774}
1775
1776static
1777int ascii_encoding_error(const Py_UNICODE **source,
1778 char **dest,
1779 const char *errors,
1780 const char *details)
1781{
1782 if ((errors == NULL) ||
1783 (strcmp(errors,"strict") == 0)) {
1784 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001785 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 details);
1787 return -1;
1788 }
1789 else if (strcmp(errors,"ignore") == 0) {
1790 return 0;
1791 }
1792 else if (strcmp(errors,"replace") == 0) {
1793 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001794 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 return 0;
1796 }
1797 else {
1798 PyErr_Format(PyExc_ValueError,
1799 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001800 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 errors);
1802 return -1;
1803 }
1804}
1805
1806PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1807 int size,
1808 const char *errors)
1809{
1810 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001811 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001812
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813 repr = PyString_FromStringAndSize(NULL, size);
1814 if (repr == NULL)
1815 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001816 if (size == 0)
1817 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818
1819 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001820 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821 while (size-- > 0) {
1822 Py_UNICODE ch = *p++;
1823 if (ch >= 128) {
1824 if (ascii_encoding_error(&p, &s, errors,
1825 "ordinal not in range(128)"))
1826 goto onError;
1827 }
1828 else
1829 *s++ = (char)ch;
1830 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001831 /* Resize if error handling skipped some characters */
1832 if (s - start < PyString_GET_SIZE(repr))
1833 if (_PyString_Resize(&repr, s - start))
1834 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 return repr;
1836
1837 onError:
1838 Py_DECREF(repr);
1839 return NULL;
1840}
1841
1842PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1843{
1844 if (!PyUnicode_Check(unicode)) {
1845 PyErr_BadArgument();
1846 return NULL;
1847 }
1848 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1849 PyUnicode_GET_SIZE(unicode),
1850 NULL);
1851}
1852
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001853#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001854
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001855/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001856
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001857PyObject *PyUnicode_DecodeMBCS(const char *s,
1858 int size,
1859 const char *errors)
1860{
1861 PyUnicodeObject *v;
1862 Py_UNICODE *p;
1863
1864 /* First get the size of the result */
1865 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001866 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001867 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1868
1869 v = _PyUnicode_New(usize);
1870 if (v == NULL)
1871 return NULL;
1872 if (usize == 0)
1873 return (PyObject *)v;
1874 p = PyUnicode_AS_UNICODE(v);
1875 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1876 Py_DECREF(v);
1877 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1878 }
1879
1880 return (PyObject *)v;
1881}
1882
1883PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1884 int size,
1885 const char *errors)
1886{
1887 PyObject *repr;
1888 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001889 DWORD mbcssize;
1890
1891 /* If there are no characters, bail now! */
1892 if (size==0)
1893 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001894
1895 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001896 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001897 if (mbcssize==0)
1898 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1899
1900 repr = PyString_FromStringAndSize(NULL, mbcssize);
1901 if (repr == NULL)
1902 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001903 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001904 return repr;
1905
1906 /* Do the conversion */
1907 s = PyString_AS_STRING(repr);
1908 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1909 Py_DECREF(repr);
1910 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1911 }
1912 return repr;
1913}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001914
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001915#endif /* MS_WIN32 */
1916
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917/* --- Character Mapping Codec -------------------------------------------- */
1918
1919static
1920int charmap_decoding_error(const char **source,
1921 Py_UNICODE **dest,
1922 const char *errors,
1923 const char *details)
1924{
1925 if ((errors == NULL) ||
1926 (strcmp(errors,"strict") == 0)) {
1927 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001928 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 details);
1930 return -1;
1931 }
1932 else if (strcmp(errors,"ignore") == 0) {
1933 return 0;
1934 }
1935 else if (strcmp(errors,"replace") == 0) {
1936 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1937 (*dest)++;
1938 return 0;
1939 }
1940 else {
1941 PyErr_Format(PyExc_ValueError,
1942 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001943 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001944 errors);
1945 return -1;
1946 }
1947}
1948
1949PyObject *PyUnicode_DecodeCharmap(const char *s,
1950 int size,
1951 PyObject *mapping,
1952 const char *errors)
1953{
1954 PyUnicodeObject *v;
1955 Py_UNICODE *p;
1956
1957 /* Default to Latin-1 */
1958 if (mapping == NULL)
1959 return PyUnicode_DecodeLatin1(s, size, errors);
1960
1961 v = _PyUnicode_New(size);
1962 if (v == NULL)
1963 goto onError;
1964 if (size == 0)
1965 return (PyObject *)v;
1966 p = PyUnicode_AS_UNICODE(v);
1967 while (size-- > 0) {
1968 unsigned char ch = *s++;
1969 PyObject *w, *x;
1970
1971 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1972 w = PyInt_FromLong((long)ch);
1973 if (w == NULL)
1974 goto onError;
1975 x = PyObject_GetItem(mapping, w);
1976 Py_DECREF(w);
1977 if (x == NULL) {
1978 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1979 /* No mapping found: default to Latin-1 mapping */
1980 PyErr_Clear();
1981 *p++ = (Py_UNICODE)ch;
1982 continue;
1983 }
1984 goto onError;
1985 }
1986
1987 /* Apply mapping */
1988 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001989 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 if (value < 0 || value > 65535) {
1991 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001992 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 Py_DECREF(x);
1994 goto onError;
1995 }
1996 *p++ = (Py_UNICODE)value;
1997 }
1998 else if (x == Py_None) {
1999 /* undefined mapping */
2000 if (charmap_decoding_error(&s, &p, errors,
2001 "character maps to <undefined>")) {
2002 Py_DECREF(x);
2003 goto onError;
2004 }
2005 }
2006 else if (PyUnicode_Check(x)) {
2007 if (PyUnicode_GET_SIZE(x) != 1) {
2008 /* 1-n mapping */
2009 PyErr_SetString(PyExc_NotImplementedError,
2010 "1-n mappings are currently not implemented");
2011 Py_DECREF(x);
2012 goto onError;
2013 }
2014 *p++ = *PyUnicode_AS_UNICODE(x);
2015 }
2016 else {
2017 /* wrong return value */
2018 PyErr_SetString(PyExc_TypeError,
2019 "character mapping must return integer, None or unicode");
2020 Py_DECREF(x);
2021 goto onError;
2022 }
2023 Py_DECREF(x);
2024 }
2025 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2026 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2027 goto onError;
2028 return (PyObject *)v;
2029
2030 onError:
2031 Py_XDECREF(v);
2032 return NULL;
2033}
2034
2035static
2036int charmap_encoding_error(const Py_UNICODE **source,
2037 char **dest,
2038 const char *errors,
2039 const char *details)
2040{
2041 if ((errors == NULL) ||
2042 (strcmp(errors,"strict") == 0)) {
2043 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002044 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 details);
2046 return -1;
2047 }
2048 else if (strcmp(errors,"ignore") == 0) {
2049 return 0;
2050 }
2051 else if (strcmp(errors,"replace") == 0) {
2052 **dest = '?';
2053 (*dest)++;
2054 return 0;
2055 }
2056 else {
2057 PyErr_Format(PyExc_ValueError,
2058 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002059 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 errors);
2061 return -1;
2062 }
2063}
2064
2065PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2066 int size,
2067 PyObject *mapping,
2068 const char *errors)
2069{
2070 PyObject *v;
2071 char *s;
2072
2073 /* Default to Latin-1 */
2074 if (mapping == NULL)
2075 return PyUnicode_EncodeLatin1(p, size, errors);
2076
2077 v = PyString_FromStringAndSize(NULL, size);
2078 if (v == NULL)
2079 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002080 if (size == 0)
2081 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 s = PyString_AS_STRING(v);
2083 while (size-- > 0) {
2084 Py_UNICODE ch = *p++;
2085 PyObject *w, *x;
2086
2087 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2088 w = PyInt_FromLong((long)ch);
2089 if (w == NULL)
2090 goto onError;
2091 x = PyObject_GetItem(mapping, w);
2092 Py_DECREF(w);
2093 if (x == NULL) {
2094 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2095 /* No mapping found: default to Latin-1 mapping if possible */
2096 PyErr_Clear();
2097 if (ch < 256) {
2098 *s++ = (char)ch;
2099 continue;
2100 }
2101 else if (!charmap_encoding_error(&p, &s, errors,
2102 "missing character mapping"))
2103 continue;
2104 }
2105 goto onError;
2106 }
2107
2108 /* Apply mapping */
2109 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002110 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 if (value < 0 || value > 255) {
2112 PyErr_SetString(PyExc_TypeError,
2113 "character mapping must be in range(256)");
2114 Py_DECREF(x);
2115 goto onError;
2116 }
2117 *s++ = (char)value;
2118 }
2119 else if (x == Py_None) {
2120 /* undefined mapping */
2121 if (charmap_encoding_error(&p, &s, errors,
2122 "character maps to <undefined>")) {
2123 Py_DECREF(x);
2124 goto onError;
2125 }
2126 }
2127 else if (PyString_Check(x)) {
2128 if (PyString_GET_SIZE(x) != 1) {
2129 /* 1-n mapping */
2130 PyErr_SetString(PyExc_NotImplementedError,
2131 "1-n mappings are currently not implemented");
2132 Py_DECREF(x);
2133 goto onError;
2134 }
2135 *s++ = *PyString_AS_STRING(x);
2136 }
2137 else {
2138 /* wrong return value */
2139 PyErr_SetString(PyExc_TypeError,
2140 "character mapping must return integer, None or unicode");
2141 Py_DECREF(x);
2142 goto onError;
2143 }
2144 Py_DECREF(x);
2145 }
2146 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2147 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2148 goto onError;
2149 return v;
2150
2151 onError:
2152 Py_DECREF(v);
2153 return NULL;
2154}
2155
2156PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2157 PyObject *mapping)
2158{
2159 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2160 PyErr_BadArgument();
2161 return NULL;
2162 }
2163 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2164 PyUnicode_GET_SIZE(unicode),
2165 mapping,
2166 NULL);
2167}
2168
2169static
2170int translate_error(const Py_UNICODE **source,
2171 Py_UNICODE **dest,
2172 const char *errors,
2173 const char *details)
2174{
2175 if ((errors == NULL) ||
2176 (strcmp(errors,"strict") == 0)) {
2177 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002178 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179 details);
2180 return -1;
2181 }
2182 else if (strcmp(errors,"ignore") == 0) {
2183 return 0;
2184 }
2185 else if (strcmp(errors,"replace") == 0) {
2186 **dest = '?';
2187 (*dest)++;
2188 return 0;
2189 }
2190 else {
2191 PyErr_Format(PyExc_ValueError,
2192 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002193 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 errors);
2195 return -1;
2196 }
2197}
2198
2199PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2200 int size,
2201 PyObject *mapping,
2202 const char *errors)
2203{
2204 PyUnicodeObject *v;
2205 Py_UNICODE *p;
2206
2207 if (mapping == NULL) {
2208 PyErr_BadArgument();
2209 return NULL;
2210 }
2211
2212 /* Output will never be longer than input */
2213 v = _PyUnicode_New(size);
2214 if (v == NULL)
2215 goto onError;
2216 if (size == 0)
2217 goto done;
2218 p = PyUnicode_AS_UNICODE(v);
2219 while (size-- > 0) {
2220 Py_UNICODE ch = *s++;
2221 PyObject *w, *x;
2222
2223 /* Get mapping */
2224 w = PyInt_FromLong(ch);
2225 if (w == NULL)
2226 goto onError;
2227 x = PyObject_GetItem(mapping, w);
2228 Py_DECREF(w);
2229 if (x == NULL) {
2230 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2231 /* No mapping found: default to 1-1 mapping */
2232 PyErr_Clear();
2233 *p++ = ch;
2234 continue;
2235 }
2236 goto onError;
2237 }
2238
2239 /* Apply mapping */
2240 if (PyInt_Check(x))
2241 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2242 else if (x == Py_None) {
2243 /* undefined mapping */
2244 if (translate_error(&s, &p, errors,
2245 "character maps to <undefined>")) {
2246 Py_DECREF(x);
2247 goto onError;
2248 }
2249 }
2250 else if (PyUnicode_Check(x)) {
2251 if (PyUnicode_GET_SIZE(x) != 1) {
2252 /* 1-n mapping */
2253 PyErr_SetString(PyExc_NotImplementedError,
2254 "1-n mappings are currently not implemented");
2255 Py_DECREF(x);
2256 goto onError;
2257 }
2258 *p++ = *PyUnicode_AS_UNICODE(x);
2259 }
2260 else {
2261 /* wrong return value */
2262 PyErr_SetString(PyExc_TypeError,
2263 "translate mapping must return integer, None or unicode");
2264 Py_DECREF(x);
2265 goto onError;
2266 }
2267 Py_DECREF(x);
2268 }
2269 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002270 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2271 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272
2273 done:
2274 return (PyObject *)v;
2275
2276 onError:
2277 Py_XDECREF(v);
2278 return NULL;
2279}
2280
2281PyObject *PyUnicode_Translate(PyObject *str,
2282 PyObject *mapping,
2283 const char *errors)
2284{
2285 PyObject *result;
2286
2287 str = PyUnicode_FromObject(str);
2288 if (str == NULL)
2289 goto onError;
2290 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2291 PyUnicode_GET_SIZE(str),
2292 mapping,
2293 errors);
2294 Py_DECREF(str);
2295 return result;
2296
2297 onError:
2298 Py_XDECREF(str);
2299 return NULL;
2300}
2301
Guido van Rossum9e896b32000-04-05 20:11:21 +00002302/* --- Decimal Encoder ---------------------------------------------------- */
2303
2304int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2305 int length,
2306 char *output,
2307 const char *errors)
2308{
2309 Py_UNICODE *p, *end;
2310
2311 if (output == NULL) {
2312 PyErr_BadArgument();
2313 return -1;
2314 }
2315
2316 p = s;
2317 end = s + length;
2318 while (p < end) {
2319 register Py_UNICODE ch = *p++;
2320 int decimal;
2321
2322 if (Py_UNICODE_ISSPACE(ch)) {
2323 *output++ = ' ';
2324 continue;
2325 }
2326 decimal = Py_UNICODE_TODECIMAL(ch);
2327 if (decimal >= 0) {
2328 *output++ = '0' + decimal;
2329 continue;
2330 }
Guido van Rossumba477042000-04-06 18:18:10 +00002331 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002332 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002333 continue;
2334 }
2335 /* All other characters are considered invalid */
2336 if (errors == NULL || strcmp(errors, "strict") == 0) {
2337 PyErr_SetString(PyExc_ValueError,
2338 "invalid decimal Unicode string");
2339 goto onError;
2340 }
2341 else if (strcmp(errors, "ignore") == 0)
2342 continue;
2343 else if (strcmp(errors, "replace") == 0) {
2344 *output++ = '?';
2345 continue;
2346 }
2347 }
2348 /* 0-terminate the output string */
2349 *output++ = '\0';
2350 return 0;
2351
2352 onError:
2353 return -1;
2354}
2355
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356/* --- Helpers ------------------------------------------------------------ */
2357
2358static
2359int count(PyUnicodeObject *self,
2360 int start,
2361 int end,
2362 PyUnicodeObject *substring)
2363{
2364 int count = 0;
2365
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002366 if (substring->length == 0)
2367 return (end - start + 1);
2368
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369 end -= substring->length;
2370
2371 while (start <= end)
2372 if (Py_UNICODE_MATCH(self, start, substring)) {
2373 count++;
2374 start += substring->length;
2375 } else
2376 start++;
2377
2378 return count;
2379}
2380
2381int PyUnicode_Count(PyObject *str,
2382 PyObject *substr,
2383 int start,
2384 int end)
2385{
2386 int result;
2387
2388 str = PyUnicode_FromObject(str);
2389 if (str == NULL)
2390 return -1;
2391 substr = PyUnicode_FromObject(substr);
2392 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002393 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 return -1;
2395 }
2396
2397 result = count((PyUnicodeObject *)str,
2398 start, end,
2399 (PyUnicodeObject *)substr);
2400
2401 Py_DECREF(str);
2402 Py_DECREF(substr);
2403 return result;
2404}
2405
2406static
2407int findstring(PyUnicodeObject *self,
2408 PyUnicodeObject *substring,
2409 int start,
2410 int end,
2411 int direction)
2412{
2413 if (start < 0)
2414 start += self->length;
2415 if (start < 0)
2416 start = 0;
2417
2418 if (substring->length == 0)
2419 return start;
2420
2421 if (end > self->length)
2422 end = self->length;
2423 if (end < 0)
2424 end += self->length;
2425 if (end < 0)
2426 end = 0;
2427
2428 end -= substring->length;
2429
2430 if (direction < 0) {
2431 for (; end >= start; end--)
2432 if (Py_UNICODE_MATCH(self, end, substring))
2433 return end;
2434 } else {
2435 for (; start <= end; start++)
2436 if (Py_UNICODE_MATCH(self, start, substring))
2437 return start;
2438 }
2439
2440 return -1;
2441}
2442
2443int PyUnicode_Find(PyObject *str,
2444 PyObject *substr,
2445 int start,
2446 int end,
2447 int direction)
2448{
2449 int result;
2450
2451 str = PyUnicode_FromObject(str);
2452 if (str == NULL)
2453 return -1;
2454 substr = PyUnicode_FromObject(substr);
2455 if (substr == NULL) {
2456 Py_DECREF(substr);
2457 return -1;
2458 }
2459
2460 result = findstring((PyUnicodeObject *)str,
2461 (PyUnicodeObject *)substr,
2462 start, end, direction);
2463 Py_DECREF(str);
2464 Py_DECREF(substr);
2465 return result;
2466}
2467
2468static
2469int tailmatch(PyUnicodeObject *self,
2470 PyUnicodeObject *substring,
2471 int start,
2472 int end,
2473 int direction)
2474{
2475 if (start < 0)
2476 start += self->length;
2477 if (start < 0)
2478 start = 0;
2479
2480 if (substring->length == 0)
2481 return 1;
2482
2483 if (end > self->length)
2484 end = self->length;
2485 if (end < 0)
2486 end += self->length;
2487 if (end < 0)
2488 end = 0;
2489
2490 end -= substring->length;
2491 if (end < start)
2492 return 0;
2493
2494 if (direction > 0) {
2495 if (Py_UNICODE_MATCH(self, end, substring))
2496 return 1;
2497 } else {
2498 if (Py_UNICODE_MATCH(self, start, substring))
2499 return 1;
2500 }
2501
2502 return 0;
2503}
2504
2505int PyUnicode_Tailmatch(PyObject *str,
2506 PyObject *substr,
2507 int start,
2508 int end,
2509 int direction)
2510{
2511 int result;
2512
2513 str = PyUnicode_FromObject(str);
2514 if (str == NULL)
2515 return -1;
2516 substr = PyUnicode_FromObject(substr);
2517 if (substr == NULL) {
2518 Py_DECREF(substr);
2519 return -1;
2520 }
2521
2522 result = tailmatch((PyUnicodeObject *)str,
2523 (PyUnicodeObject *)substr,
2524 start, end, direction);
2525 Py_DECREF(str);
2526 Py_DECREF(substr);
2527 return result;
2528}
2529
2530static
2531const Py_UNICODE *findchar(const Py_UNICODE *s,
2532 int size,
2533 Py_UNICODE ch)
2534{
2535 /* like wcschr, but doesn't stop at NULL characters */
2536
2537 while (size-- > 0) {
2538 if (*s == ch)
2539 return s;
2540 s++;
2541 }
2542
2543 return NULL;
2544}
2545
2546/* Apply fixfct filter to the Unicode object self and return a
2547 reference to the modified object */
2548
2549static
2550PyObject *fixup(PyUnicodeObject *self,
2551 int (*fixfct)(PyUnicodeObject *s))
2552{
2553
2554 PyUnicodeObject *u;
2555
2556 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2557 self->length);
2558 if (u == NULL)
2559 return NULL;
2560 if (!fixfct(u)) {
2561 /* fixfct should return TRUE if it modified the buffer. If
2562 FALSE, return a reference to the original buffer instead
2563 (to save space, not time) */
2564 Py_INCREF(self);
2565 Py_DECREF(u);
2566 return (PyObject*) self;
2567 }
2568 return (PyObject*) u;
2569}
2570
2571static
2572int fixupper(PyUnicodeObject *self)
2573{
2574 int len = self->length;
2575 Py_UNICODE *s = self->str;
2576 int status = 0;
2577
2578 while (len-- > 0) {
2579 register Py_UNICODE ch;
2580
2581 ch = Py_UNICODE_TOUPPER(*s);
2582 if (ch != *s) {
2583 status = 1;
2584 *s = ch;
2585 }
2586 s++;
2587 }
2588
2589 return status;
2590}
2591
2592static
2593int fixlower(PyUnicodeObject *self)
2594{
2595 int len = self->length;
2596 Py_UNICODE *s = self->str;
2597 int status = 0;
2598
2599 while (len-- > 0) {
2600 register Py_UNICODE ch;
2601
2602 ch = Py_UNICODE_TOLOWER(*s);
2603 if (ch != *s) {
2604 status = 1;
2605 *s = ch;
2606 }
2607 s++;
2608 }
2609
2610 return status;
2611}
2612
2613static
2614int fixswapcase(PyUnicodeObject *self)
2615{
2616 int len = self->length;
2617 Py_UNICODE *s = self->str;
2618 int status = 0;
2619
2620 while (len-- > 0) {
2621 if (Py_UNICODE_ISUPPER(*s)) {
2622 *s = Py_UNICODE_TOLOWER(*s);
2623 status = 1;
2624 } else if (Py_UNICODE_ISLOWER(*s)) {
2625 *s = Py_UNICODE_TOUPPER(*s);
2626 status = 1;
2627 }
2628 s++;
2629 }
2630
2631 return status;
2632}
2633
2634static
2635int fixcapitalize(PyUnicodeObject *self)
2636{
2637 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2638 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2639 return 1;
2640 }
2641 return 0;
2642}
2643
2644static
2645int fixtitle(PyUnicodeObject *self)
2646{
2647 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2648 register Py_UNICODE *e;
2649 int previous_is_cased;
2650
2651 /* Shortcut for single character strings */
2652 if (PyUnicode_GET_SIZE(self) == 1) {
2653 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2654 if (*p != ch) {
2655 *p = ch;
2656 return 1;
2657 }
2658 else
2659 return 0;
2660 }
2661
2662 e = p + PyUnicode_GET_SIZE(self);
2663 previous_is_cased = 0;
2664 for (; p < e; p++) {
2665 register const Py_UNICODE ch = *p;
2666
2667 if (previous_is_cased)
2668 *p = Py_UNICODE_TOLOWER(ch);
2669 else
2670 *p = Py_UNICODE_TOTITLE(ch);
2671
2672 if (Py_UNICODE_ISLOWER(ch) ||
2673 Py_UNICODE_ISUPPER(ch) ||
2674 Py_UNICODE_ISTITLE(ch))
2675 previous_is_cased = 1;
2676 else
2677 previous_is_cased = 0;
2678 }
2679 return 1;
2680}
2681
2682PyObject *PyUnicode_Join(PyObject *separator,
2683 PyObject *seq)
2684{
2685 Py_UNICODE *sep;
2686 int seplen;
2687 PyUnicodeObject *res = NULL;
2688 int reslen = 0;
2689 Py_UNICODE *p;
2690 int seqlen = 0;
2691 int sz = 100;
2692 int i;
2693
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002694 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 if (seqlen < 0 && PyErr_Occurred())
2696 return NULL;
2697
2698 if (separator == NULL) {
2699 Py_UNICODE blank = ' ';
2700 sep = &blank;
2701 seplen = 1;
2702 }
2703 else {
2704 separator = PyUnicode_FromObject(separator);
2705 if (separator == NULL)
2706 return NULL;
2707 sep = PyUnicode_AS_UNICODE(separator);
2708 seplen = PyUnicode_GET_SIZE(separator);
2709 }
2710
2711 res = _PyUnicode_New(sz);
2712 if (res == NULL)
2713 goto onError;
2714 p = PyUnicode_AS_UNICODE(res);
2715 reslen = 0;
2716
2717 for (i = 0; i < seqlen; i++) {
2718 int itemlen;
2719 PyObject *item;
2720
2721 item = PySequence_GetItem(seq, i);
2722 if (item == NULL)
2723 goto onError;
2724 if (!PyUnicode_Check(item)) {
2725 PyObject *v;
2726 v = PyUnicode_FromObject(item);
2727 Py_DECREF(item);
2728 item = v;
2729 if (item == NULL)
2730 goto onError;
2731 }
2732 itemlen = PyUnicode_GET_SIZE(item);
2733 while (reslen + itemlen + seplen >= sz) {
2734 if (_PyUnicode_Resize(res, sz*2))
2735 goto onError;
2736 sz *= 2;
2737 p = PyUnicode_AS_UNICODE(res) + reslen;
2738 }
2739 if (i > 0) {
2740 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2741 p += seplen;
2742 reslen += seplen;
2743 }
2744 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2745 p += itemlen;
2746 reslen += itemlen;
2747 Py_DECREF(item);
2748 }
2749 if (_PyUnicode_Resize(res, reslen))
2750 goto onError;
2751
2752 Py_XDECREF(separator);
2753 return (PyObject *)res;
2754
2755 onError:
2756 Py_XDECREF(separator);
2757 Py_DECREF(res);
2758 return NULL;
2759}
2760
2761static
2762PyUnicodeObject *pad(PyUnicodeObject *self,
2763 int left,
2764 int right,
2765 Py_UNICODE fill)
2766{
2767 PyUnicodeObject *u;
2768
2769 if (left < 0)
2770 left = 0;
2771 if (right < 0)
2772 right = 0;
2773
2774 if (left == 0 && right == 0) {
2775 Py_INCREF(self);
2776 return self;
2777 }
2778
2779 u = _PyUnicode_New(left + self->length + right);
2780 if (u) {
2781 if (left)
2782 Py_UNICODE_FILL(u->str, fill, left);
2783 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2784 if (right)
2785 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2786 }
2787
2788 return u;
2789}
2790
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `PyObject *list` and an `onError` label; the label is jumped to when
   creating the slice or appending it fails.

   The body is wrapped in do/while(0) so that an invocation followed by
   ';' is exactly one statement, and the arguments are parenthesized so
   that arbitrary expressions can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2801
2802static
2803PyObject *split_whitespace(PyUnicodeObject *self,
2804 PyObject *list,
2805 int maxcount)
2806{
2807 register int i;
2808 register int j;
2809 int len = self->length;
2810 PyObject *str;
2811
2812 for (i = j = 0; i < len; ) {
2813 /* find a token */
2814 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2815 i++;
2816 j = i;
2817 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2818 i++;
2819 if (j < i) {
2820 if (maxcount-- <= 0)
2821 break;
2822 SPLIT_APPEND(self->str, j, i);
2823 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2824 i++;
2825 j = i;
2826 }
2827 }
2828 if (j < len) {
2829 SPLIT_APPEND(self->str, j, len);
2830 }
2831 return list;
2832
2833 onError:
2834 Py_DECREF(list);
2835 return NULL;
2836}
2837
2838PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002839 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840{
2841 register int i;
2842 register int j;
2843 int len;
2844 PyObject *list;
2845 PyObject *str;
2846 Py_UNICODE *data;
2847
2848 string = PyUnicode_FromObject(string);
2849 if (string == NULL)
2850 return NULL;
2851 data = PyUnicode_AS_UNICODE(string);
2852 len = PyUnicode_GET_SIZE(string);
2853
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 list = PyList_New(0);
2855 if (!list)
2856 goto onError;
2857
2858 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002859 int eol;
2860
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 /* Find a line and append it */
2862 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2863 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864
2865 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002866 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 if (i < len) {
2868 if (data[i] == '\r' && i + 1 < len &&
2869 data[i+1] == '\n')
2870 i += 2;
2871 else
2872 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002873 if (keepends)
2874 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
Guido van Rossum86662912000-04-11 15:38:46 +00002876 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 j = i;
2878 }
2879 if (j < len) {
2880 SPLIT_APPEND(data, j, len);
2881 }
2882
2883 Py_DECREF(string);
2884 return list;
2885
2886 onError:
2887 Py_DECREF(list);
2888 Py_DECREF(string);
2889 return NULL;
2890}
2891
2892static
2893PyObject *split_char(PyUnicodeObject *self,
2894 PyObject *list,
2895 Py_UNICODE ch,
2896 int maxcount)
2897{
2898 register int i;
2899 register int j;
2900 int len = self->length;
2901 PyObject *str;
2902
2903 for (i = j = 0; i < len; ) {
2904 if (self->str[i] == ch) {
2905 if (maxcount-- <= 0)
2906 break;
2907 SPLIT_APPEND(self->str, j, i);
2908 i = j = i + 1;
2909 } else
2910 i++;
2911 }
2912 if (j <= len) {
2913 SPLIT_APPEND(self->str, j, len);
2914 }
2915 return list;
2916
2917 onError:
2918 Py_DECREF(list);
2919 return NULL;
2920}
2921
2922static
2923PyObject *split_substring(PyUnicodeObject *self,
2924 PyObject *list,
2925 PyUnicodeObject *substring,
2926 int maxcount)
2927{
2928 register int i;
2929 register int j;
2930 int len = self->length;
2931 int sublen = substring->length;
2932 PyObject *str;
2933
2934 for (i = j = 0; i < len - sublen; ) {
2935 if (Py_UNICODE_MATCH(self, i, substring)) {
2936 if (maxcount-- <= 0)
2937 break;
2938 SPLIT_APPEND(self->str, j, i);
2939 i = j = i + sublen;
2940 } else
2941 i++;
2942 }
2943 if (j <= len) {
2944 SPLIT_APPEND(self->str, j, len);
2945 }
2946 return list;
2947
2948 onError:
2949 Py_DECREF(list);
2950 return NULL;
2951}
2952
2953#undef SPLIT_APPEND
2954
2955static
2956PyObject *split(PyUnicodeObject *self,
2957 PyUnicodeObject *substring,
2958 int maxcount)
2959{
2960 PyObject *list;
2961
2962 if (maxcount < 0)
2963 maxcount = INT_MAX;
2964
2965 list = PyList_New(0);
2966 if (!list)
2967 return NULL;
2968
2969 if (substring == NULL)
2970 return split_whitespace(self,list,maxcount);
2971
2972 else if (substring->length == 1)
2973 return split_char(self,list,substring->str[0],maxcount);
2974
2975 else if (substring->length == 0) {
2976 Py_DECREF(list);
2977 PyErr_SetString(PyExc_ValueError, "empty separator");
2978 return NULL;
2979 }
2980 else
2981 return split_substring(self,list,substring,maxcount);
2982}
2983
2984static
2985PyObject *strip(PyUnicodeObject *self,
2986 int left,
2987 int right)
2988{
2989 Py_UNICODE *p = self->str;
2990 int start = 0;
2991 int end = self->length;
2992
2993 if (left)
2994 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2995 start++;
2996
2997 if (right)
2998 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2999 end--;
3000
3001 if (start == 0 && end == self->length) {
3002 /* couldn't strip anything off, return original string */
3003 Py_INCREF(self);
3004 return (PyObject*) self;
3005 }
3006
3007 return (PyObject*) PyUnicode_FromUnicode(
3008 self->str + start,
3009 end - start
3010 );
3011}
3012
3013static
3014PyObject *replace(PyUnicodeObject *self,
3015 PyUnicodeObject *str1,
3016 PyUnicodeObject *str2,
3017 int maxcount)
3018{
3019 PyUnicodeObject *u;
3020
3021 if (maxcount < 0)
3022 maxcount = INT_MAX;
3023
3024 if (str1->length == 1 && str2->length == 1) {
3025 int i;
3026
3027 /* replace characters */
3028 if (!findchar(self->str, self->length, str1->str[0])) {
3029 /* nothing to replace, return original string */
3030 Py_INCREF(self);
3031 u = self;
3032 } else {
3033 Py_UNICODE u1 = str1->str[0];
3034 Py_UNICODE u2 = str2->str[0];
3035
3036 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3037 self->str,
3038 self->length
3039 );
3040 if (u)
3041 for (i = 0; i < u->length; i++)
3042 if (u->str[i] == u1) {
3043 if (--maxcount < 0)
3044 break;
3045 u->str[i] = u2;
3046 }
3047 }
3048
3049 } else {
3050 int n, i;
3051 Py_UNICODE *p;
3052
3053 /* replace strings */
3054 n = count(self, 0, self->length, str1);
3055 if (n > maxcount)
3056 n = maxcount;
3057 if (n == 0) {
3058 /* nothing to replace, return original string */
3059 Py_INCREF(self);
3060 u = self;
3061 } else {
3062 u = _PyUnicode_New(
3063 self->length + n * (str2->length - str1->length));
3064 if (u) {
3065 i = 0;
3066 p = u->str;
3067 while (i <= self->length - str1->length)
3068 if (Py_UNICODE_MATCH(self, i, str1)) {
3069 /* replace string segment */
3070 Py_UNICODE_COPY(p, str2->str, str2->length);
3071 p += str2->length;
3072 i += str1->length;
3073 if (--n <= 0) {
3074 /* copy remaining part */
3075 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3076 break;
3077 }
3078 } else
3079 *p++ = self->str[i++];
3080 }
3081 }
3082 }
3083
3084 return (PyObject *) u;
3085}
3086
3087/* --- Unicode Object Methods --------------------------------------------- */
3088
3089static char title__doc__[] =
3090"S.title() -> unicode\n\
3091\n\
3092Return a titlecased version of S, i.e. words start with title case\n\
3093characters, all remaining cased characters have lower case.";
3094
3095static PyObject*
3096unicode_title(PyUnicodeObject *self, PyObject *args)
3097{
3098 if (!PyArg_NoArgs(args))
3099 return NULL;
3100 return fixup(self, fixtitle);
3101}
3102
3103static char capitalize__doc__[] =
3104"S.capitalize() -> unicode\n\
3105\n\
3106Return a capitalized version of S, i.e. make the first character\n\
3107have upper case.";
3108
3109static PyObject*
3110unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3111{
3112 if (!PyArg_NoArgs(args))
3113 return NULL;
3114 return fixup(self, fixcapitalize);
3115}
3116
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split at whitespace,
   capitalize each word with fixup()/fixcapitalize and join the words
   with the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the list slot over to the capitalized word */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
3157
3158static char center__doc__[] =
3159"S.center(width) -> unicode\n\
3160\n\
3161Return S centered in a Unicode string of length width. Padding is done\n\
3162using spaces.";
3163
3164static PyObject *
3165unicode_center(PyUnicodeObject *self, PyObject *args)
3166{
3167 int marg, left;
3168 int width;
3169
3170 if (!PyArg_ParseTuple(args, "i:center", &width))
3171 return NULL;
3172
3173 if (self->length >= width) {
3174 Py_INCREF(self);
3175 return (PyObject*) self;
3176 }
3177
3178 marg = width - self->length;
3179 left = marg / 2 + (marg & width & 1);
3180
3181 return (PyObject*) pad(self, left, marg - left, ' ');
3182}
3183
Marc-André Lemburge5034372000-08-08 08:04:29 +00003184#if 0
3185
3186/* This code should go into some future Unicode collation support
3187 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003188 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003189
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003190/* speedy UTF-16 code point order comparison */
3191/* gleaned from: */
3192/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3193
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003194static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003195{
3196 0, 0, 0, 0, 0, 0, 0, 0,
3197 0, 0, 0, 0, 0, 0, 0, 0,
3198 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003199 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003200};
3201
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202static int
3203unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3204{
3205 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003206
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 Py_UNICODE *s1 = str1->str;
3208 Py_UNICODE *s2 = str2->str;
3209
3210 len1 = str1->length;
3211 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003212
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003214 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003215 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003216
3217 c1 = *s1++;
3218 c2 = *s2++;
3219 if (c1 > (1<<11) * 26)
3220 c1 += utf16Fixup[c1>>11];
3221 if (c2 > (1<<11) * 26)
3222 c2 += utf16Fixup[c2>>11];
3223
3224 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003225 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003226 if (diff)
3227 return (diff < 0) ? -1 : (diff != 0);
3228 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 }
3230
3231 return (len1 < len2) ? -1 : (len1 != len2);
3232}
3233
Marc-André Lemburge5034372000-08-08 08:04:29 +00003234#else
3235
3236static int
3237unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3238{
3239 register int len1, len2;
3240
3241 Py_UNICODE *s1 = str1->str;
3242 Py_UNICODE *s2 = str2->str;
3243
3244 len1 = str1->length;
3245 len2 = str2->length;
3246
3247 while (len1 > 0 && len2 > 0) {
3248 register long diff;
3249
3250 diff = (long)*s1++ - (long)*s2++;
3251 if (diff)
3252 return (diff < 0) ? -1 : (diff != 0);
3253 len1--; len2--;
3254 }
3255
3256 return (len1 < len2) ? -1 : (len1 != len2);
3257}
3258
3259#endif
3260
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261int PyUnicode_Compare(PyObject *left,
3262 PyObject *right)
3263{
3264 PyUnicodeObject *u = NULL, *v = NULL;
3265 int result;
3266
3267 /* Coerce the two arguments */
3268 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3269 if (u == NULL)
3270 goto onError;
3271 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3272 if (v == NULL)
3273 goto onError;
3274
Thomas Wouters7e474022000-07-16 12:04:32 +00003275 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 if (v == u) {
3277 Py_DECREF(u);
3278 Py_DECREF(v);
3279 return 0;
3280 }
3281
3282 result = unicode_compare(u, v);
3283
3284 Py_DECREF(u);
3285 Py_DECREF(v);
3286 return result;
3287
3288onError:
3289 Py_XDECREF(u);
3290 Py_XDECREF(v);
3291 return -1;
3292}
3293
Guido van Rossum403d68b2000-03-13 15:55:09 +00003294int PyUnicode_Contains(PyObject *container,
3295 PyObject *element)
3296{
3297 PyUnicodeObject *u = NULL, *v = NULL;
3298 int result;
3299 register const Py_UNICODE *p, *e;
3300 register Py_UNICODE ch;
3301
3302 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003303 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003304 if (v == NULL) {
3305 PyErr_SetString(PyExc_TypeError,
3306 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003307 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003308 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003309 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3310 if (u == NULL) {
3311 Py_DECREF(v);
3312 goto onError;
3313 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003314
3315 /* Check v in u */
3316 if (PyUnicode_GET_SIZE(v) != 1) {
3317 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003318 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003319 goto onError;
3320 }
3321 ch = *PyUnicode_AS_UNICODE(v);
3322 p = PyUnicode_AS_UNICODE(u);
3323 e = p + PyUnicode_GET_SIZE(u);
3324 result = 0;
3325 while (p < e) {
3326 if (*p++ == ch) {
3327 result = 1;
3328 break;
3329 }
3330 }
3331
3332 Py_DECREF(u);
3333 Py_DECREF(v);
3334 return result;
3335
3336onError:
3337 Py_XDECREF(u);
3338 Py_XDECREF(v);
3339 return -1;
3340}
3341
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342/* Concat to string or Unicode object giving a new Unicode object. */
3343
3344PyObject *PyUnicode_Concat(PyObject *left,
3345 PyObject *right)
3346{
3347 PyUnicodeObject *u = NULL, *v = NULL, *w;
3348
3349 /* Coerce the two arguments */
3350 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3351 if (u == NULL)
3352 goto onError;
3353 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3354 if (v == NULL)
3355 goto onError;
3356
3357 /* Shortcuts */
3358 if (v == unicode_empty) {
3359 Py_DECREF(v);
3360 return (PyObject *)u;
3361 }
3362 if (u == unicode_empty) {
3363 Py_DECREF(u);
3364 return (PyObject *)v;
3365 }
3366
3367 /* Concat the two Unicode strings */
3368 w = _PyUnicode_New(u->length + v->length);
3369 if (w == NULL)
3370 goto onError;
3371 Py_UNICODE_COPY(w->str, u->str, u->length);
3372 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3373
3374 Py_DECREF(u);
3375 Py_DECREF(v);
3376 return (PyObject *)w;
3377
3378onError:
3379 Py_XDECREF(u);
3380 Py_XDECREF(v);
3381 return NULL;
3382}
3383
3384static char count__doc__[] =
3385"S.count(sub[, start[, end]]) -> int\n\
3386\n\
3387Return the number of occurrences of substring sub in Unicode string\n\
3388S[start:end]. Optional arguments start and end are\n\
3389interpreted as in slice notation.";
3390
3391static PyObject *
3392unicode_count(PyUnicodeObject *self, PyObject *args)
3393{
3394 PyUnicodeObject *substring;
3395 int start = 0;
3396 int end = INT_MAX;
3397 PyObject *result;
3398
Guido van Rossumb8872e62000-05-09 14:14:27 +00003399 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3400 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 return NULL;
3402
3403 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3404 (PyObject *)substring);
3405 if (substring == NULL)
3406 return NULL;
3407
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 if (start < 0)
3409 start += self->length;
3410 if (start < 0)
3411 start = 0;
3412 if (end > self->length)
3413 end = self->length;
3414 if (end < 0)
3415 end += self->length;
3416 if (end < 0)
3417 end = 0;
3418
3419 result = PyInt_FromLong((long) count(self, start, end, substring));
3420
3421 Py_DECREF(substring);
3422 return result;
3423}
3424
3425static char encode__doc__[] =
3426"S.encode([encoding[,errors]]) -> string\n\
3427\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003428Return an encoded string version of S. Default encoding is the current\n\
3429default string encoding. errors may be given to set a different error\n\
3430handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3431a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432
3433static PyObject *
3434unicode_encode(PyUnicodeObject *self, PyObject *args)
3435{
3436 char *encoding = NULL;
3437 char *errors = NULL;
3438 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3439 return NULL;
3440 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3441}
3442
3443static char expandtabs__doc__[] =
3444"S.expandtabs([tabsize]) -> unicode\n\
3445\n\
3446Return a copy of S where all tab characters are expanded using spaces.\n\
3447If tabsize is not given, a tab size of 8 characters is assumed.";
3448
3449static PyObject*
3450unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3451{
3452 Py_UNICODE *e;
3453 Py_UNICODE *p;
3454 Py_UNICODE *q;
3455 int i, j;
3456 PyUnicodeObject *u;
3457 int tabsize = 8;
3458
3459 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3460 return NULL;
3461
Thomas Wouters7e474022000-07-16 12:04:32 +00003462 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 i = j = 0;
3464 e = self->str + self->length;
3465 for (p = self->str; p < e; p++)
3466 if (*p == '\t') {
3467 if (tabsize > 0)
3468 j += tabsize - (j % tabsize);
3469 }
3470 else {
3471 j++;
3472 if (*p == '\n' || *p == '\r') {
3473 i += j;
3474 j = 0;
3475 }
3476 }
3477
3478 /* Second pass: create output string and fill it */
3479 u = _PyUnicode_New(i + j);
3480 if (!u)
3481 return NULL;
3482
3483 j = 0;
3484 q = u->str;
3485
3486 for (p = self->str; p < e; p++)
3487 if (*p == '\t') {
3488 if (tabsize > 0) {
3489 i = tabsize - (j % tabsize);
3490 j += i;
3491 while (i--)
3492 *q++ = ' ';
3493 }
3494 }
3495 else {
3496 j++;
3497 *q++ = *p;
3498 if (*p == '\n' || *p == '\r')
3499 j = 0;
3500 }
3501
3502 return (PyObject*) u;
3503}
3504
3505static char find__doc__[] =
3506"S.find(sub [,start [,end]]) -> int\n\
3507\n\
3508Return the lowest index in S where substring sub is found,\n\
3509such that sub is contained within s[start,end]. Optional\n\
3510arguments start and end are interpreted as in slice notation.\n\
3511\n\
3512Return -1 on failure.";
3513
3514static PyObject *
3515unicode_find(PyUnicodeObject *self, PyObject *args)
3516{
3517 PyUnicodeObject *substring;
3518 int start = 0;
3519 int end = INT_MAX;
3520 PyObject *result;
3521
Guido van Rossumb8872e62000-05-09 14:14:27 +00003522 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3523 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 return NULL;
3525 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3526 (PyObject *)substring);
3527 if (substring == NULL)
3528 return NULL;
3529
3530 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3531
3532 Py_DECREF(substring);
3533 return result;
3534}
3535
3536static PyObject *
3537unicode_getitem(PyUnicodeObject *self, int index)
3538{
3539 if (index < 0 || index >= self->length) {
3540 PyErr_SetString(PyExc_IndexError, "string index out of range");
3541 return NULL;
3542 }
3543
3544 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3545}
3546
3547static long
3548unicode_hash(PyUnicodeObject *self)
3549{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003550 /* Since Unicode objects compare equal to their ASCII string
3551 counterparts, they should use the individual character values
3552 as basis for their hash value. This is needed to assure that
3553 strings and Unicode objects behave in the same way as
3554 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555
Fredrik Lundhdde61642000-07-10 18:27:47 +00003556 register int len;
3557 register Py_UNICODE *p;
3558 register long x;
3559
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 if (self->hash != -1)
3561 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003562 len = PyUnicode_GET_SIZE(self);
3563 p = PyUnicode_AS_UNICODE(self);
3564 x = *p << 7;
3565 while (--len >= 0)
3566 x = (1000003*x) ^ *p++;
3567 x ^= PyUnicode_GET_SIZE(self);
3568 if (x == -1)
3569 x = -2;
3570 self->hash = x;
3571 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572}
3573
3574static char index__doc__[] =
3575"S.index(sub [,start [,end]]) -> int\n\
3576\n\
3577Like S.find() but raise ValueError when the substring is not found.";
3578
3579static PyObject *
3580unicode_index(PyUnicodeObject *self, PyObject *args)
3581{
3582 int result;
3583 PyUnicodeObject *substring;
3584 int start = 0;
3585 int end = INT_MAX;
3586
Guido van Rossumb8872e62000-05-09 14:14:27 +00003587 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3588 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 return NULL;
3590
3591 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3592 (PyObject *)substring);
3593 if (substring == NULL)
3594 return NULL;
3595
3596 result = findstring(self, substring, start, end, 1);
3597
3598 Py_DECREF(substring);
3599 if (result < 0) {
3600 PyErr_SetString(PyExc_ValueError, "substring not found");
3601 return NULL;
3602 }
3603 return PyInt_FromLong(result);
3604}
3605
3606static char islower__doc__[] =
3607"S.islower() -> int\n\
3608\n\
3609Return 1 if all cased characters in S are lowercase and there is\n\
3610at least one cased character in S, 0 otherwise.";
3611
3612static PyObject*
3613unicode_islower(PyUnicodeObject *self, PyObject *args)
3614{
3615 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3616 register const Py_UNICODE *e;
3617 int cased;
3618
3619 if (!PyArg_NoArgs(args))
3620 return NULL;
3621
3622 /* Shortcut for single character strings */
3623 if (PyUnicode_GET_SIZE(self) == 1)
3624 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3625
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003626 /* Special case for empty strings */
3627 if (PyString_GET_SIZE(self) == 0)
3628 return PyInt_FromLong(0);
3629
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 e = p + PyUnicode_GET_SIZE(self);
3631 cased = 0;
3632 for (; p < e; p++) {
3633 register const Py_UNICODE ch = *p;
3634
3635 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3636 return PyInt_FromLong(0);
3637 else if (!cased && Py_UNICODE_ISLOWER(ch))
3638 cased = 1;
3639 }
3640 return PyInt_FromLong(cased);
3641}
3642
3643static char isupper__doc__[] =
3644"S.isupper() -> int\n\
3645\n\
3646Return 1 if all cased characters in S are uppercase and there is\n\
3647at least one cased character in S, 0 otherwise.";
3648
3649static PyObject*
3650unicode_isupper(PyUnicodeObject *self, PyObject *args)
3651{
3652 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3653 register const Py_UNICODE *e;
3654 int cased;
3655
3656 if (!PyArg_NoArgs(args))
3657 return NULL;
3658
3659 /* Shortcut for single character strings */
3660 if (PyUnicode_GET_SIZE(self) == 1)
3661 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3662
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003663 /* Special case for empty strings */
3664 if (PyString_GET_SIZE(self) == 0)
3665 return PyInt_FromLong(0);
3666
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667 e = p + PyUnicode_GET_SIZE(self);
3668 cased = 0;
3669 for (; p < e; p++) {
3670 register const Py_UNICODE ch = *p;
3671
3672 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3673 return PyInt_FromLong(0);
3674 else if (!cased && Py_UNICODE_ISUPPER(ch))
3675 cased = 1;
3676 }
3677 return PyInt_FromLong(cased);
3678}
3679
3680static char istitle__doc__[] =
3681"S.istitle() -> int\n\
3682\n\
3683Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3684may only follow uncased characters and lowercase characters only cased\n\
3685ones. Return 0 otherwise.";
3686
3687static PyObject*
3688unicode_istitle(PyUnicodeObject *self, PyObject *args)
3689{
3690 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3691 register const Py_UNICODE *e;
3692 int cased, previous_is_cased;
3693
3694 if (!PyArg_NoArgs(args))
3695 return NULL;
3696
3697 /* Shortcut for single character strings */
3698 if (PyUnicode_GET_SIZE(self) == 1)
3699 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3700 (Py_UNICODE_ISUPPER(*p) != 0));
3701
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003702 /* Special case for empty strings */
3703 if (PyString_GET_SIZE(self) == 0)
3704 return PyInt_FromLong(0);
3705
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 e = p + PyUnicode_GET_SIZE(self);
3707 cased = 0;
3708 previous_is_cased = 0;
3709 for (; p < e; p++) {
3710 register const Py_UNICODE ch = *p;
3711
3712 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3713 if (previous_is_cased)
3714 return PyInt_FromLong(0);
3715 previous_is_cased = 1;
3716 cased = 1;
3717 }
3718 else if (Py_UNICODE_ISLOWER(ch)) {
3719 if (!previous_is_cased)
3720 return PyInt_FromLong(0);
3721 previous_is_cased = 1;
3722 cased = 1;
3723 }
3724 else
3725 previous_is_cased = 0;
3726 }
3727 return PyInt_FromLong(cased);
3728}
3729
3730static char isspace__doc__[] =
3731"S.isspace() -> int\n\
3732\n\
3733Return 1 if there are only whitespace characters in S,\n\
37340 otherwise.";
3735
3736static PyObject*
3737unicode_isspace(PyUnicodeObject *self, PyObject *args)
3738{
3739 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3740 register const Py_UNICODE *e;
3741
3742 if (!PyArg_NoArgs(args))
3743 return NULL;
3744
3745 /* Shortcut for single character strings */
3746 if (PyUnicode_GET_SIZE(self) == 1 &&
3747 Py_UNICODE_ISSPACE(*p))
3748 return PyInt_FromLong(1);
3749
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003750 /* Special case for empty strings */
3751 if (PyString_GET_SIZE(self) == 0)
3752 return PyInt_FromLong(0);
3753
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754 e = p + PyUnicode_GET_SIZE(self);
3755 for (; p < e; p++) {
3756 if (!Py_UNICODE_ISSPACE(*p))
3757 return PyInt_FromLong(0);
3758 }
3759 return PyInt_FromLong(1);
3760}
3761
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003762static char isalpha__doc__[] =
3763"S.isalpha() -> int\n\
3764\n\
3765Return 1 if all characters in S are alphabetic\n\
3766and there is at least one character in S, 0 otherwise.";
3767
3768static PyObject*
3769unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3770{
3771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3772 register const Py_UNICODE *e;
3773
3774 if (!PyArg_NoArgs(args))
3775 return NULL;
3776
3777 /* Shortcut for single character strings */
3778 if (PyUnicode_GET_SIZE(self) == 1 &&
3779 Py_UNICODE_ISALPHA(*p))
3780 return PyInt_FromLong(1);
3781
3782 /* Special case for empty strings */
3783 if (PyString_GET_SIZE(self) == 0)
3784 return PyInt_FromLong(0);
3785
3786 e = p + PyUnicode_GET_SIZE(self);
3787 for (; p < e; p++) {
3788 if (!Py_UNICODE_ISALPHA(*p))
3789 return PyInt_FromLong(0);
3790 }
3791 return PyInt_FromLong(1);
3792}
3793
3794static char isalnum__doc__[] =
3795"S.isalnum() -> int\n\
3796\n\
3797Return 1 if all characters in S are alphanumeric\n\
3798and there is at least one character in S, 0 otherwise.";
3799
3800static PyObject*
3801unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3802{
3803 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3804 register const Py_UNICODE *e;
3805
3806 if (!PyArg_NoArgs(args))
3807 return NULL;
3808
3809 /* Shortcut for single character strings */
3810 if (PyUnicode_GET_SIZE(self) == 1 &&
3811 Py_UNICODE_ISALNUM(*p))
3812 return PyInt_FromLong(1);
3813
3814 /* Special case for empty strings */
3815 if (PyString_GET_SIZE(self) == 0)
3816 return PyInt_FromLong(0);
3817
3818 e = p + PyUnicode_GET_SIZE(self);
3819 for (; p < e; p++) {
3820 if (!Py_UNICODE_ISALNUM(*p))
3821 return PyInt_FromLong(0);
3822 }
3823 return PyInt_FromLong(1);
3824}
3825
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826static char isdecimal__doc__[] =
3827"S.isdecimal() -> int\n\
3828\n\
3829Return 1 if there are only decimal characters in S,\n\
38300 otherwise.";
3831
3832static PyObject*
3833unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3834{
3835 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3836 register const Py_UNICODE *e;
3837
3838 if (!PyArg_NoArgs(args))
3839 return NULL;
3840
3841 /* Shortcut for single character strings */
3842 if (PyUnicode_GET_SIZE(self) == 1 &&
3843 Py_UNICODE_ISDECIMAL(*p))
3844 return PyInt_FromLong(1);
3845
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003846 /* Special case for empty strings */
3847 if (PyString_GET_SIZE(self) == 0)
3848 return PyInt_FromLong(0);
3849
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 e = p + PyUnicode_GET_SIZE(self);
3851 for (; p < e; p++) {
3852 if (!Py_UNICODE_ISDECIMAL(*p))
3853 return PyInt_FromLong(0);
3854 }
3855 return PyInt_FromLong(1);
3856}
3857
3858static char isdigit__doc__[] =
3859"S.isdigit() -> int\n\
3860\n\
3861Return 1 if there are only digit characters in S,\n\
38620 otherwise.";
3863
3864static PyObject*
3865unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3866{
3867 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3868 register const Py_UNICODE *e;
3869
3870 if (!PyArg_NoArgs(args))
3871 return NULL;
3872
3873 /* Shortcut for single character strings */
3874 if (PyUnicode_GET_SIZE(self) == 1 &&
3875 Py_UNICODE_ISDIGIT(*p))
3876 return PyInt_FromLong(1);
3877
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003878 /* Special case for empty strings */
3879 if (PyString_GET_SIZE(self) == 0)
3880 return PyInt_FromLong(0);
3881
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 e = p + PyUnicode_GET_SIZE(self);
3883 for (; p < e; p++) {
3884 if (!Py_UNICODE_ISDIGIT(*p))
3885 return PyInt_FromLong(0);
3886 }
3887 return PyInt_FromLong(1);
3888}
3889
3890static char isnumeric__doc__[] =
3891"S.isnumeric() -> int\n\
3892\n\
3893Return 1 if there are only numeric characters in S,\n\
38940 otherwise.";
3895
3896static PyObject*
3897unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3898{
3899 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3900 register const Py_UNICODE *e;
3901
3902 if (!PyArg_NoArgs(args))
3903 return NULL;
3904
3905 /* Shortcut for single character strings */
3906 if (PyUnicode_GET_SIZE(self) == 1 &&
3907 Py_UNICODE_ISNUMERIC(*p))
3908 return PyInt_FromLong(1);
3909
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003910 /* Special case for empty strings */
3911 if (PyString_GET_SIZE(self) == 0)
3912 return PyInt_FromLong(0);
3913
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914 e = p + PyUnicode_GET_SIZE(self);
3915 for (; p < e; p++) {
3916 if (!Py_UNICODE_ISNUMERIC(*p))
3917 return PyInt_FromLong(0);
3918 }
3919 return PyInt_FromLong(1);
3920}
3921
3922static char join__doc__[] =
3923"S.join(sequence) -> unicode\n\
3924\n\
3925Return a string which is the concatenation of the strings in the\n\
3926sequence. The separator between elements is S.";
3927
3928static PyObject*
3929unicode_join(PyUnicodeObject *self, PyObject *args)
3930{
3931 PyObject *data;
3932 if (!PyArg_ParseTuple(args, "O:join", &data))
3933 return NULL;
3934
3935 return PyUnicode_Join((PyObject *)self, data);
3936}
3937
3938static int
3939unicode_length(PyUnicodeObject *self)
3940{
3941 return self->length;
3942}
3943
3944static char ljust__doc__[] =
3945"S.ljust(width) -> unicode\n\
3946\n\
3947Return S left justified in a Unicode string of length width. Padding is\n\
3948done using spaces.";
3949
3950static PyObject *
3951unicode_ljust(PyUnicodeObject *self, PyObject *args)
3952{
3953 int width;
3954 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3955 return NULL;
3956
3957 if (self->length >= width) {
3958 Py_INCREF(self);
3959 return (PyObject*) self;
3960 }
3961
3962 return (PyObject*) pad(self, 0, width - self->length, ' ');
3963}
3964
3965static char lower__doc__[] =
3966"S.lower() -> unicode\n\
3967\n\
3968Return a copy of the string S converted to lowercase.";
3969
3970static PyObject*
3971unicode_lower(PyUnicodeObject *self, PyObject *args)
3972{
3973 if (!PyArg_NoArgs(args))
3974 return NULL;
3975 return fixup(self, fixlower);
3976}
3977
3978static char lstrip__doc__[] =
3979"S.lstrip() -> unicode\n\
3980\n\
3981Return a copy of the string S with leading whitespace removed.";
3982
3983static PyObject *
3984unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3985{
3986 if (!PyArg_NoArgs(args))
3987 return NULL;
3988 return strip(self, 1, 0);
3989}
3990
3991static PyObject*
3992unicode_repeat(PyUnicodeObject *str, int len)
3993{
3994 PyUnicodeObject *u;
3995 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003996 int nchars;
3997 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998
3999 if (len < 0)
4000 len = 0;
4001
4002 if (len == 1) {
4003 /* no repeat, return original string */
4004 Py_INCREF(str);
4005 return (PyObject*) str;
4006 }
Tim Peters8f422462000-09-09 06:13:41 +00004007
4008 /* ensure # of chars needed doesn't overflow int and # of bytes
4009 * needed doesn't overflow size_t
4010 */
4011 nchars = len * str->length;
4012 if (len && nchars / len != str->length) {
4013 PyErr_SetString(PyExc_OverflowError,
4014 "repeated string is too long");
4015 return NULL;
4016 }
4017 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4018 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4019 PyErr_SetString(PyExc_OverflowError,
4020 "repeated string is too long");
4021 return NULL;
4022 }
4023 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 if (!u)
4025 return NULL;
4026
4027 p = u->str;
4028
4029 while (len-- > 0) {
4030 Py_UNICODE_COPY(p, str->str, str->length);
4031 p += str->length;
4032 }
4033
4034 return (PyObject*) u;
4035}
4036
4037PyObject *PyUnicode_Replace(PyObject *obj,
4038 PyObject *subobj,
4039 PyObject *replobj,
4040 int maxcount)
4041{
4042 PyObject *self;
4043 PyObject *str1;
4044 PyObject *str2;
4045 PyObject *result;
4046
4047 self = PyUnicode_FromObject(obj);
4048 if (self == NULL)
4049 return NULL;
4050 str1 = PyUnicode_FromObject(subobj);
4051 if (str1 == NULL) {
4052 Py_DECREF(self);
4053 return NULL;
4054 }
4055 str2 = PyUnicode_FromObject(replobj);
4056 if (str2 == NULL) {
4057 Py_DECREF(self);
4058 Py_DECREF(str1);
4059 return NULL;
4060 }
4061 result = replace((PyUnicodeObject *)self,
4062 (PyUnicodeObject *)str1,
4063 (PyUnicodeObject *)str2,
4064 maxcount);
4065 Py_DECREF(self);
4066 Py_DECREF(str1);
4067 Py_DECREF(str2);
4068 return result;
4069}
4070
4071static char replace__doc__[] =
4072"S.replace (old, new[, maxsplit]) -> unicode\n\
4073\n\
4074Return a copy of S with all occurrences of substring\n\
4075old replaced by new. If the optional argument maxsplit is\n\
4076given, only the first maxsplit occurrences are replaced.";
4077
4078static PyObject*
4079unicode_replace(PyUnicodeObject *self, PyObject *args)
4080{
4081 PyUnicodeObject *str1;
4082 PyUnicodeObject *str2;
4083 int maxcount = -1;
4084 PyObject *result;
4085
4086 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4087 return NULL;
4088 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4089 if (str1 == NULL)
4090 return NULL;
4091 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4092 if (str2 == NULL)
4093 return NULL;
4094
4095 result = replace(self, str1, str2, maxcount);
4096
4097 Py_DECREF(str1);
4098 Py_DECREF(str2);
4099 return result;
4100}
4101
4102static
4103PyObject *unicode_repr(PyObject *unicode)
4104{
4105 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4106 PyUnicode_GET_SIZE(unicode),
4107 1);
4108}
4109
4110static char rfind__doc__[] =
4111"S.rfind(sub [,start [,end]]) -> int\n\
4112\n\
4113Return the highest index in S where substring sub is found,\n\
4114such that sub is contained within s[start,end]. Optional\n\
4115arguments start and end are interpreted as in slice notation.\n\
4116\n\
4117Return -1 on failure.";
4118
4119static PyObject *
4120unicode_rfind(PyUnicodeObject *self, PyObject *args)
4121{
4122 PyUnicodeObject *substring;
4123 int start = 0;
4124 int end = INT_MAX;
4125 PyObject *result;
4126
Guido van Rossumb8872e62000-05-09 14:14:27 +00004127 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4128 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 return NULL;
4130 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4131 (PyObject *)substring);
4132 if (substring == NULL)
4133 return NULL;
4134
4135 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4136
4137 Py_DECREF(substring);
4138 return result;
4139}
4140
4141static char rindex__doc__[] =
4142"S.rindex(sub [,start [,end]]) -> int\n\
4143\n\
4144Like S.rfind() but raise ValueError when the substring is not found.";
4145
4146static PyObject *
4147unicode_rindex(PyUnicodeObject *self, PyObject *args)
4148{
4149 int result;
4150 PyUnicodeObject *substring;
4151 int start = 0;
4152 int end = INT_MAX;
4153
Guido van Rossumb8872e62000-05-09 14:14:27 +00004154 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4155 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 return NULL;
4157 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4158 (PyObject *)substring);
4159 if (substring == NULL)
4160 return NULL;
4161
4162 result = findstring(self, substring, start, end, -1);
4163
4164 Py_DECREF(substring);
4165 if (result < 0) {
4166 PyErr_SetString(PyExc_ValueError, "substring not found");
4167 return NULL;
4168 }
4169 return PyInt_FromLong(result);
4170}
4171
4172static char rjust__doc__[] =
4173"S.rjust(width) -> unicode\n\
4174\n\
4175Return S right justified in a Unicode string of length width. Padding is\n\
4176done using spaces.";
4177
4178static PyObject *
4179unicode_rjust(PyUnicodeObject *self, PyObject *args)
4180{
4181 int width;
4182 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4183 return NULL;
4184
4185 if (self->length >= width) {
4186 Py_INCREF(self);
4187 return (PyObject*) self;
4188 }
4189
4190 return (PyObject*) pad(self, width - self->length, 0, ' ');
4191}
4192
4193static char rstrip__doc__[] =
4194"S.rstrip() -> unicode\n\
4195\n\
4196Return a copy of the string S with trailing whitespace removed.";
4197
4198static PyObject *
4199unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4200{
4201 if (!PyArg_NoArgs(args))
4202 return NULL;
4203 return strip(self, 0, 1);
4204}
4205
4206static PyObject*
4207unicode_slice(PyUnicodeObject *self, int start, int end)
4208{
4209 /* standard clamping */
4210 if (start < 0)
4211 start = 0;
4212 if (end < 0)
4213 end = 0;
4214 if (end > self->length)
4215 end = self->length;
4216 if (start == 0 && end == self->length) {
4217 /* full slice, return original string */
4218 Py_INCREF(self);
4219 return (PyObject*) self;
4220 }
4221 if (start > end)
4222 start = end;
4223 /* copy slice */
4224 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4225 end - start);
4226}
4227
4228PyObject *PyUnicode_Split(PyObject *s,
4229 PyObject *sep,
4230 int maxsplit)
4231{
4232 PyObject *result;
4233
4234 s = PyUnicode_FromObject(s);
4235 if (s == NULL)
4236 return NULL;
4237 if (sep != NULL) {
4238 sep = PyUnicode_FromObject(sep);
4239 if (sep == NULL) {
4240 Py_DECREF(s);
4241 return NULL;
4242 }
4243 }
4244
4245 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4246
4247 Py_DECREF(s);
4248 Py_XDECREF(sep);
4249 return result;
4250}
4251
4252static char split__doc__[] =
4253"S.split([sep [,maxsplit]]) -> list of strings\n\
4254\n\
4255Return a list of the words in S, using sep as the\n\
4256delimiter string. If maxsplit is given, at most maxsplit\n\
4257splits are done. If sep is not specified, any whitespace string\n\
4258is a separator.";
4259
4260static PyObject*
4261unicode_split(PyUnicodeObject *self, PyObject *args)
4262{
4263 PyObject *substring = Py_None;
4264 int maxcount = -1;
4265
4266 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4267 return NULL;
4268
4269 if (substring == Py_None)
4270 return split(self, NULL, maxcount);
4271 else if (PyUnicode_Check(substring))
4272 return split(self, (PyUnicodeObject *)substring, maxcount);
4273 else
4274 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4275}
4276
4277static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004278"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279\n\
4280Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004281Line breaks are not included in the resulting list unless keepends\n\
4282is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283
4284static PyObject*
4285unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4286{
Guido van Rossum86662912000-04-11 15:38:46 +00004287 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288
Guido van Rossum86662912000-04-11 15:38:46 +00004289 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290 return NULL;
4291
Guido van Rossum86662912000-04-11 15:38:46 +00004292 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293}
4294
4295static
4296PyObject *unicode_str(PyUnicodeObject *self)
4297{
Fred Drakee4315f52000-05-09 19:53:39 +00004298 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299}
4300
4301static char strip__doc__[] =
4302"S.strip() -> unicode\n\
4303\n\
4304Return a copy of S with leading and trailing whitespace removed.";
4305
4306static PyObject *
4307unicode_strip(PyUnicodeObject *self, PyObject *args)
4308{
4309 if (!PyArg_NoArgs(args))
4310 return NULL;
4311 return strip(self, 1, 1);
4312}
4313
4314static char swapcase__doc__[] =
4315"S.swapcase() -> unicode\n\
4316\n\
4317Return a copy of S with uppercase characters converted to lowercase\n\
4318and vice versa.";
4319
4320static PyObject*
4321unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4322{
4323 if (!PyArg_NoArgs(args))
4324 return NULL;
4325 return fixup(self, fixswapcase);
4326}
4327
4328static char translate__doc__[] =
4329"S.translate(table) -> unicode\n\
4330\n\
4331Return a copy of the string S, where all characters have been mapped\n\
4332through the given translation table, which must be a mapping of\n\
4333Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4334are left untouched. Characters mapped to None are deleted.";
4335
4336static PyObject*
4337unicode_translate(PyUnicodeObject *self, PyObject *args)
4338{
4339 PyObject *table;
4340
4341 if (!PyArg_ParseTuple(args, "O:translate", &table))
4342 return NULL;
4343 return PyUnicode_TranslateCharmap(self->str,
4344 self->length,
4345 table,
4346 "ignore");
4347}
4348
4349static char upper__doc__[] =
4350"S.upper() -> unicode\n\
4351\n\
4352Return a copy of S converted to uppercase.";
4353
4354static PyObject*
4355unicode_upper(PyUnicodeObject *self, PyObject *args)
4356{
4357 if (!PyArg_NoArgs(args))
4358 return NULL;
4359 return fixup(self, fixupper);
4360}
4361
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n"
"\n"
"Pad a numeric string x with zeros on the left, to fill a field\n"
"of the specified width. The string x is never truncated.";

/* Method wrapper for S.zfill(width)  (currently compiled out).

   If self is already at least `width` characters long, a new reference
   to self is returned.  Otherwise self is left-padded with '0' through
   pad(); when the first original character is a sign ('+' or '-') it is
   moved in front of the padding.

   Returns NULL with an exception set if argument parsing fails or if
   pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() builds a new object; do not touch u->str if it failed. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4397
#if 0
/* Debugging aid (currently compiled out): report the current value of
   unicode_freelist_size as a Python integer.  Accepts no arguments. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args)) {
        return PyInt_FromLong(unicode_freelist_size);
    }
    return NULL;
}
#endif
4407
4408static char startswith__doc__[] =
4409"S.startswith(prefix[, start[, end]]) -> int\n\
4410\n\
4411Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4412optional start, test S beginning at that position. With optional end, stop\n\
4413comparing S at that position.";
4414
4415static PyObject *
4416unicode_startswith(PyUnicodeObject *self,
4417 PyObject *args)
4418{
4419 PyUnicodeObject *substring;
4420 int start = 0;
4421 int end = INT_MAX;
4422 PyObject *result;
4423
Guido van Rossumb8872e62000-05-09 14:14:27 +00004424 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4425 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 return NULL;
4427 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4428 (PyObject *)substring);
4429 if (substring == NULL)
4430 return NULL;
4431
4432 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4433
4434 Py_DECREF(substring);
4435 return result;
4436}
4437
4438
4439static char endswith__doc__[] =
4440"S.endswith(suffix[, start[, end]]) -> int\n\
4441\n\
4442Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4443optional start, test S beginning at that position. With optional end, stop\n\
4444comparing S at that position.";
4445
4446static PyObject *
4447unicode_endswith(PyUnicodeObject *self,
4448 PyObject *args)
4449{
4450 PyUnicodeObject *substring;
4451 int start = 0;
4452 int end = INT_MAX;
4453 PyObject *result;
4454
Guido van Rossumb8872e62000-05-09 14:14:27 +00004455 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4456 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 return NULL;
4458 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4459 (PyObject *)substring);
4460 if (substring == NULL)
4461 return NULL;
4462
4463 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4464
4465 Py_DECREF(substring);
4466 return result;
4467}
4468
4469
/* Method table for Unicode objects, consulted by unicode_getattr().

   Each entry is {name, C function, flags, doc string}.  Among the
   functions visible in this part of the file, those flagged 1 parse
   their arguments with PyArg_ParseTuple() and those flagged 0 use
   PyArg_NoArgs(); the remaining entries presumably follow the same
   convention (their definitions are elsewhere in the file). */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled together with their implementations. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
4522
4523static PyObject *
4524unicode_getattr(PyUnicodeObject *self, char *name)
4525{
4526 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4527}
4528
/* Sequence protocol slots for Unicode objects.  The two assignment
   slots are left as 0: no item or slice assignment handler is
   installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /* sq_contains */
};
4539
4540static int
4541unicode_buffer_getreadbuf(PyUnicodeObject *self,
4542 int index,
4543 const void **ptr)
4544{
4545 if (index != 0) {
4546 PyErr_SetString(PyExc_SystemError,
4547 "accessing non-existent unicode segment");
4548 return -1;
4549 }
4550 *ptr = (void *) self->str;
4551 return PyUnicode_GET_DATA_SIZE(self);
4552}
4553
4554static int
4555unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4556 const void **ptr)
4557{
4558 PyErr_SetString(PyExc_TypeError,
4559 "cannot use unicode as modifyable buffer");
4560 return -1;
4561}
4562
4563static int
4564unicode_buffer_getsegcount(PyUnicodeObject *self,
4565 int *lenp)
4566{
4567 if (lenp)
4568 *lenp = PyUnicode_GET_DATA_SIZE(self);
4569 return 1;
4570}
4571
4572static int
4573unicode_buffer_getcharbuf(PyUnicodeObject *self,
4574 int index,
4575 const void **ptr)
4576{
4577 PyObject *str;
4578
4579 if (index != 0) {
4580 PyErr_SetString(PyExc_SystemError,
4581 "accessing non-existent unicode segment");
4582 return -1;
4583 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004584 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 if (str == NULL)
4586 return -1;
4587 *ptr = (void *) PyString_AS_STRING(str);
4588 return PyString_GET_SIZE(str);
4589}
4590
4591/* Helpers for PyUnicode_Format() */
4592
4593static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004594getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595{
4596 int argidx = *p_argidx;
4597 if (argidx < arglen) {
4598 (*p_argidx)++;
4599 if (arglen < 0)
4600 return args;
4601 else
4602 return PyTuple_GetItem(args, argidx);
4603 }
4604 PyErr_SetString(PyExc_TypeError,
4605 "not enough arguments for format string");
4606 return NULL;
4607}
4608
/* Conversion flag bits collected by PyUnicode_Format() while parsing a
   '%' specifier. */
#define F_LJUST (1<<0)  /* '-' flag; also set for a negative '*' width */
#define F_SIGN (1<<1)   /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT (1<<3)    /* '#' flag */
#define F_ZERO (1<<4)   /* '0' flag */
4614
4615static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617{
4618 register int i;
4619 int len;
4620 va_list va;
4621 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623
4624 /* First, format the string as char array, then expand to Py_UNICODE
4625 array. */
4626 charbuffer = (char *)buffer;
4627 len = vsprintf(charbuffer, format, va);
4628 for (i = len - 1; i >= 0; i--)
4629 buffer[i] = (Py_UNICODE) charbuffer[i];
4630
4631 va_end(va);
4632 return len;
4633}
4634
4635static int
4636formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004637 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 int flags,
4639 int prec,
4640 int type,
4641 PyObject *v)
4642{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004643 /* fmt = '%#.' + `prec` + `type`
4644 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 char fmt[20];
4646 double x;
4647
4648 x = PyFloat_AsDouble(v);
4649 if (x == -1.0 && PyErr_Occurred())
4650 return -1;
4651 if (prec < 0)
4652 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4654 type = 'g';
4655 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004656 /* worst case length calc to ensure no buffer overrun:
4657 fmt = %#.<prec>g
4658 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4659 for any double rep.)
4660 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4661 If prec=0 the effective precision is 1 (the leading digit is
4662 always given), therefore increase by one to 10+prec. */
4663 if (buflen <= (size_t)10 + (size_t)prec) {
4664 PyErr_SetString(PyExc_OverflowError,
4665 "formatted float is too long (precision too long?)");
4666 return -1;
4667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 return usprintf(buf, fmt, x);
4669}
4670
4671static int
4672formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004673 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 int flags,
4675 int prec,
4676 int type,
4677 PyObject *v)
4678{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004679 /* fmt = '%#.' + `prec` + 'l' + `type`
4680 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 char fmt[20];
4682 long x;
4683
4684 x = PyInt_AsLong(v);
4685 if (x == -1 && PyErr_Occurred())
4686 return -1;
4687 if (prec < 0)
4688 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004689 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4690 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4691 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4692 PyErr_SetString(PyExc_OverflowError,
4693 "formatted integer is too long (precision too long?)");
4694 return -1;
4695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4697 return usprintf(buf, fmt, x);
4698}
4699
4700static int
4701formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004702 size_t buflen,
4703 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004705 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004706 if (PyUnicode_Check(v)) {
4707 if (PyUnicode_GET_SIZE(v) != 1)
4708 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004712 else if (PyString_Check(v)) {
4713 if (PyString_GET_SIZE(v) != 1)
4714 goto onError;
4715 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717
4718 else {
4719 /* Integer input truncated to a character */
4720 long x;
4721 x = PyInt_AsLong(v);
4722 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004723 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 buf[0] = (char) x;
4725 }
4726 buf[1] = '\0';
4727 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004728
4729 onError:
4730 PyErr_SetString(PyExc_TypeError,
4731 "%c requires int or char");
4732 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733}
4734
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE elements) of the buffer in
   which the floats, ints, & chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
4744
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745PyObject *PyUnicode_Format(PyObject *format,
4746 PyObject *args)
4747{
4748 Py_UNICODE *fmt, *res;
4749 int fmtcnt, rescnt, reslen, arglen, argidx;
4750 int args_owned = 0;
4751 PyUnicodeObject *result = NULL;
4752 PyObject *dict = NULL;
4753 PyObject *uformat;
4754
4755 if (format == NULL || args == NULL) {
4756 PyErr_BadInternalCall();
4757 return NULL;
4758 }
4759 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004760 if (uformat == NULL)
4761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 fmt = PyUnicode_AS_UNICODE(uformat);
4763 fmtcnt = PyUnicode_GET_SIZE(uformat);
4764
4765 reslen = rescnt = fmtcnt + 100;
4766 result = _PyUnicode_New(reslen);
4767 if (result == NULL)
4768 goto onError;
4769 res = PyUnicode_AS_UNICODE(result);
4770
4771 if (PyTuple_Check(args)) {
4772 arglen = PyTuple_Size(args);
4773 argidx = 0;
4774 }
4775 else {
4776 arglen = -1;
4777 argidx = -2;
4778 }
4779 if (args->ob_type->tp_as_mapping)
4780 dict = args;
4781
4782 while (--fmtcnt >= 0) {
4783 if (*fmt != '%') {
4784 if (--rescnt < 0) {
4785 rescnt = fmtcnt + 100;
4786 reslen += rescnt;
4787 if (_PyUnicode_Resize(result, reslen) < 0)
4788 return NULL;
4789 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4790 --rescnt;
4791 }
4792 *res++ = *fmt++;
4793 }
4794 else {
4795 /* Got a format specifier */
4796 int flags = 0;
4797 int width = -1;
4798 int prec = -1;
4799 int size = 0;
4800 Py_UNICODE c = '\0';
4801 Py_UNICODE fill;
4802 PyObject *v = NULL;
4803 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004804 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 Py_UNICODE sign;
4806 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004807 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808
4809 fmt++;
4810 if (*fmt == '(') {
4811 Py_UNICODE *keystart;
4812 int keylen;
4813 PyObject *key;
4814 int pcount = 1;
4815
4816 if (dict == NULL) {
4817 PyErr_SetString(PyExc_TypeError,
4818 "format requires a mapping");
4819 goto onError;
4820 }
4821 ++fmt;
4822 --fmtcnt;
4823 keystart = fmt;
4824 /* Skip over balanced parentheses */
4825 while (pcount > 0 && --fmtcnt >= 0) {
4826 if (*fmt == ')')
4827 --pcount;
4828 else if (*fmt == '(')
4829 ++pcount;
4830 fmt++;
4831 }
4832 keylen = fmt - keystart - 1;
4833 if (fmtcnt < 0 || pcount > 0) {
4834 PyErr_SetString(PyExc_ValueError,
4835 "incomplete format key");
4836 goto onError;
4837 }
Fred Drakee4315f52000-05-09 19:53:39 +00004838 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 then looked up since Python uses strings to hold
4840 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004841 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 key = PyUnicode_EncodeUTF8(keystart,
4843 keylen,
4844 NULL);
4845 if (key == NULL)
4846 goto onError;
4847 if (args_owned) {
4848 Py_DECREF(args);
4849 args_owned = 0;
4850 }
4851 args = PyObject_GetItem(dict, key);
4852 Py_DECREF(key);
4853 if (args == NULL) {
4854 goto onError;
4855 }
4856 args_owned = 1;
4857 arglen = -1;
4858 argidx = -2;
4859 }
4860 while (--fmtcnt >= 0) {
4861 switch (c = *fmt++) {
4862 case '-': flags |= F_LJUST; continue;
4863 case '+': flags |= F_SIGN; continue;
4864 case ' ': flags |= F_BLANK; continue;
4865 case '#': flags |= F_ALT; continue;
4866 case '0': flags |= F_ZERO; continue;
4867 }
4868 break;
4869 }
4870 if (c == '*') {
4871 v = getnextarg(args, arglen, &argidx);
4872 if (v == NULL)
4873 goto onError;
4874 if (!PyInt_Check(v)) {
4875 PyErr_SetString(PyExc_TypeError,
4876 "* wants int");
4877 goto onError;
4878 }
4879 width = PyInt_AsLong(v);
4880 if (width < 0) {
4881 flags |= F_LJUST;
4882 width = -width;
4883 }
4884 if (--fmtcnt >= 0)
4885 c = *fmt++;
4886 }
4887 else if (c >= '0' && c <= '9') {
4888 width = c - '0';
4889 while (--fmtcnt >= 0) {
4890 c = *fmt++;
4891 if (c < '0' || c > '9')
4892 break;
4893 if ((width*10) / 10 != width) {
4894 PyErr_SetString(PyExc_ValueError,
4895 "width too big");
4896 goto onError;
4897 }
4898 width = width*10 + (c - '0');
4899 }
4900 }
4901 if (c == '.') {
4902 prec = 0;
4903 if (--fmtcnt >= 0)
4904 c = *fmt++;
4905 if (c == '*') {
4906 v = getnextarg(args, arglen, &argidx);
4907 if (v == NULL)
4908 goto onError;
4909 if (!PyInt_Check(v)) {
4910 PyErr_SetString(PyExc_TypeError,
4911 "* wants int");
4912 goto onError;
4913 }
4914 prec = PyInt_AsLong(v);
4915 if (prec < 0)
4916 prec = 0;
4917 if (--fmtcnt >= 0)
4918 c = *fmt++;
4919 }
4920 else if (c >= '0' && c <= '9') {
4921 prec = c - '0';
4922 while (--fmtcnt >= 0) {
4923 c = Py_CHARMASK(*fmt++);
4924 if (c < '0' || c > '9')
4925 break;
4926 if ((prec*10) / 10 != prec) {
4927 PyErr_SetString(PyExc_ValueError,
4928 "prec too big");
4929 goto onError;
4930 }
4931 prec = prec*10 + (c - '0');
4932 }
4933 }
4934 } /* prec */
4935 if (fmtcnt >= 0) {
4936 if (c == 'h' || c == 'l' || c == 'L') {
4937 size = c;
4938 if (--fmtcnt >= 0)
4939 c = *fmt++;
4940 }
4941 }
4942 if (fmtcnt < 0) {
4943 PyErr_SetString(PyExc_ValueError,
4944 "incomplete format");
4945 goto onError;
4946 }
4947 if (c != '%') {
4948 v = getnextarg(args, arglen, &argidx);
4949 if (v == NULL)
4950 goto onError;
4951 }
4952 sign = 0;
4953 fill = ' ';
4954 switch (c) {
4955
4956 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004957 pbuf = formatbuf;
4958 /* presume that buffer length is at least 1 */
4959 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 len = 1;
4961 break;
4962
4963 case 's':
4964 case 'r':
4965 if (PyUnicode_Check(v) && c == 's') {
4966 temp = v;
4967 Py_INCREF(temp);
4968 }
4969 else {
4970 PyObject *unicode;
4971 if (c == 's')
4972 temp = PyObject_Str(v);
4973 else
4974 temp = PyObject_Repr(v);
4975 if (temp == NULL)
4976 goto onError;
4977 if (!PyString_Check(temp)) {
4978 /* XXX Note: this should never happen, since
4979 PyObject_Repr() and PyObject_Str() assure
4980 this */
4981 Py_DECREF(temp);
4982 PyErr_SetString(PyExc_TypeError,
4983 "%s argument has non-string str()");
4984 goto onError;
4985 }
Fred Drakee4315f52000-05-09 19:53:39 +00004986 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004988 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 "strict");
4990 Py_DECREF(temp);
4991 temp = unicode;
4992 if (temp == NULL)
4993 goto onError;
4994 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004995 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 len = PyUnicode_GET_SIZE(temp);
4997 if (prec >= 0 && len > prec)
4998 len = prec;
4999 break;
5000
5001 case 'i':
5002 case 'd':
5003 case 'u':
5004 case 'o':
5005 case 'x':
5006 case 'X':
5007 if (c == 'i')
5008 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005009 pbuf = formatbuf;
5010 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5011 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 if (len < 0)
5013 goto onError;
5014 sign = (c == 'd');
5015 if (flags & F_ZERO) {
5016 fill = '0';
5017 if ((flags&F_ALT) &&
5018 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005019 pbuf[0] == '0' && pbuf[1] == c) {
5020 *res++ = *pbuf++;
5021 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 rescnt -= 2;
5023 len -= 2;
5024 width -= 2;
5025 if (width < 0)
5026 width = 0;
5027 }
5028 }
5029 break;
5030
5031 case 'e':
5032 case 'E':
5033 case 'f':
5034 case 'g':
5035 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005036 pbuf = formatbuf;
5037 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5038 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 if (len < 0)
5040 goto onError;
5041 sign = 1;
5042 if (flags&F_ZERO)
5043 fill = '0';
5044 break;
5045
5046 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005047 pbuf = formatbuf;
5048 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 if (len < 0)
5050 goto onError;
5051 break;
5052
5053 default:
5054 PyErr_Format(PyExc_ValueError,
5055 "unsupported format character '%c' (0x%x)",
5056 c, c);
5057 goto onError;
5058 }
5059 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005060 if (*pbuf == '-' || *pbuf == '+') {
5061 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 len--;
5063 }
5064 else if (flags & F_SIGN)
5065 sign = '+';
5066 else if (flags & F_BLANK)
5067 sign = ' ';
5068 else
5069 sign = 0;
5070 }
5071 if (width < len)
5072 width = len;
5073 if (rescnt < width + (sign != 0)) {
5074 reslen -= rescnt;
5075 rescnt = width + fmtcnt + 100;
5076 reslen += rescnt;
5077 if (_PyUnicode_Resize(result, reslen) < 0)
5078 return NULL;
5079 res = PyUnicode_AS_UNICODE(result)
5080 + reslen - rescnt;
5081 }
5082 if (sign) {
5083 if (fill != ' ')
5084 *res++ = sign;
5085 rescnt--;
5086 if (width > len)
5087 width--;
5088 }
5089 if (width > len && !(flags & F_LJUST)) {
5090 do {
5091 --rescnt;
5092 *res++ = fill;
5093 } while (--width > len);
5094 }
5095 if (sign && fill == ' ')
5096 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005097 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 res += len;
5099 rescnt -= len;
5100 while (--width >= len) {
5101 --rescnt;
5102 *res++ = ' ';
5103 }
5104 if (dict && (argidx < arglen) && c != '%') {
5105 PyErr_SetString(PyExc_TypeError,
5106 "not all arguments converted");
5107 goto onError;
5108 }
5109 Py_XDECREF(temp);
5110 } /* '%' */
5111 } /* until end */
5112 if (argidx < arglen && !dict) {
5113 PyErr_SetString(PyExc_TypeError,
5114 "not all arguments converted");
5115 goto onError;
5116 }
5117
5118 if (args_owned) {
5119 Py_DECREF(args);
5120 }
5121 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005122 if (_PyUnicode_Resize(result, reslen - rescnt))
5123 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 return (PyObject *)result;
5125
5126 onError:
5127 Py_XDECREF(result);
5128 Py_DECREF(uformat);
5129 if (args_owned) {
5130 Py_DECREF(args);
5131 }
5132 return NULL;
5133}
5134
/* Buffer protocol slots for Unicode objects; each entry is one of the
   unicode_buffer_* functions defined above, cast to its slot type. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,   /* raw Py_UNICODE data */
    (getwritebufferproc) unicode_buffer_getwritebuf, /* always fails */
    (getsegcountproc) unicode_buffer_getsegcount,    /* always 1 segment */
    (getcharbufferproc) unicode_buffer_getcharbuf,   /* default-encoded string */
};
5141
/* Type object for Unicode strings.  Attribute look-up goes through
   tp_getattr (unicode_getattr); the number and mapping protocols are
   not provided, the sequence and buffer protocols are. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_size */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
5166
5167/* Initialize the Unicode implementation */
5168
Thomas Wouters78890102000-07-22 19:25:51 +00005169void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
5171 /* Doublecheck the configuration... */
5172 if (sizeof(Py_UNICODE) != 2)
5173 Py_FatalError("Unicode configuration error: "
5174 "sizeof(Py_UNICODE) != 2 bytes");
5175
Fred Drakee4315f52000-05-09 19:53:39 +00005176 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005177 unicode_freelist = NULL;
5178 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005180 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181}
5182
5183/* Finalize the Unicode implementation */
5184
5185void
Thomas Wouters78890102000-07-22 19:25:51 +00005186_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187{
5188 PyUnicodeObject *u = unicode_freelist;
5189
5190 while (u != NULL) {
5191 PyUnicodeObject *v = u;
5192 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005193 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005194 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005195 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005196 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005198 unicode_freelist = NULL;
5199 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005201 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202}