blob: b9e457d6a7beb553e60d36ccaef75f0b01b15876 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000070#ifdef MS_WIN32
71#include <windows.h>
72#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   Objects whose length is below this limit keep their character
   buffer while they sit on the free list; _PyUnicode_Free() releases
   the buffer of any object whose length is greater than or equal to
   it.  This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
104
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105/* --- Globals ------------------------------------------------------------
106
107 The globals are initialized by the _PyUnicode_Init() API and should
108 not be used before calling that API.
109
110*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
112/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000113static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000116static PyUnicodeObject *unicode_freelist;
117static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125*/
126
127static char unicode_default_encoding[100];
128
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129/* --- Unicode Object ----------------------------------------------------- */
130
131static
132int _PyUnicode_Resize(register PyUnicodeObject *unicode,
133 int length)
134{
135 void *oldstr;
136
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000137 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000138 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000139 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000140
141 /* Resizing unicode_empty is not allowed. */
142 if (unicode == unicode_empty) {
143 PyErr_SetString(PyExc_SystemError,
144 "can't resize empty unicode object");
145 return -1;
146 }
147
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
153 unicode->str = oldstr;
154 PyErr_NoMemory();
155 return -1;
156 }
157 unicode->str[length] = 0;
158 unicode->length = length;
159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
167
168 return 0;
169}
170
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000171int PyUnicode_Resize(PyObject **unicode,
172 int length)
173{
174 PyUnicodeObject *v;
175
176 if (unicode == NULL) {
177 PyErr_BadInternalCall();
178 return -1;
179 }
180 v = (PyUnicodeObject *)*unicode;
181 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
182 PyErr_BadInternalCall();
183 return -1;
184 }
185 return _PyUnicode_Resize(v, length);
186}
187
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188/* We allocate one more byte to make sure the string is
189 Ux0000 terminated -- XXX is this needed ?
190
191 XXX This allocator could further be enhanced by assuring that the
192 free list never reduces its size below 1.
193
194*/
195
196static
197PyUnicodeObject *_PyUnicode_New(int length)
198{
199 register PyUnicodeObject *unicode;
200
201 /* Optimization for empty strings */
202 if (length == 0 && unicode_empty != NULL) {
203 Py_INCREF(unicode_empty);
204 return unicode_empty;
205 }
206
207 /* Unicode freelist & memory allocation */
208 if (unicode_freelist) {
209 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000210 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 /* Keep-Alive optimization: we only upsize the buffer,
214 never downsize it. */
215 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000217 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 }
220 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000221 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000223 }
224 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 else {
227 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
228 if (unicode == NULL)
229 return NULL;
230 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
231 }
232
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000233 if (!unicode->str) {
234 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000235 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str[length] = 0;
238 unicode->length = length;
239 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000240 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242
243 onError:
244 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247}
248
249static
250void _PyUnicode_Free(register PyUnicodeObject *unicode)
251{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization */
254 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000255 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 unicode->str = NULL;
257 unicode->length = 0;
258 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000259 if (unicode->defenc) {
260 Py_DECREF(unicode->defenc);
261 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 }
263 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 *(PyUnicodeObject **)unicode = unicode_freelist;
265 unicode_freelist = unicode;
266 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000269 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000270 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000271 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273}
274
275PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
276 int size)
277{
278 PyUnicodeObject *unicode;
279
280 unicode = _PyUnicode_New(size);
281 if (!unicode)
282 return NULL;
283
284 /* Copy the Unicode data into the new object */
285 if (u != NULL)
286 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
287
288 return (PyObject *)unicode;
289}
290
291#ifdef HAVE_WCHAR_H
292
293PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
298 if (w == NULL) {
299 PyErr_BadInternalCall();
300 return NULL;
301 }
302
303 unicode = _PyUnicode_New(size);
304 if (!unicode)
305 return NULL;
306
307 /* Copy the wchar_t data into the new object */
308#ifdef HAVE_USABLE_WCHAR_T
309 memcpy(unicode->str, w, size * sizeof(wchar_t));
310#else
311 {
312 register Py_UNICODE *u;
313 register int i;
314 u = PyUnicode_AS_UNICODE(unicode);
315 for (i = size; i >= 0; i--)
316 *u++ = *w++;
317 }
318#endif
319
320 return (PyObject *)unicode;
321}
322
323int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
324 register wchar_t *w,
325 int size)
326{
327 if (unicode == NULL) {
328 PyErr_BadInternalCall();
329 return -1;
330 }
331 if (size > PyUnicode_GET_SIZE(unicode))
332 size = PyUnicode_GET_SIZE(unicode);
333#ifdef HAVE_USABLE_WCHAR_T
334 memcpy(w, unicode->str, size * sizeof(wchar_t));
335#else
336 {
337 register Py_UNICODE *u;
338 register int i;
339 u = PyUnicode_AS_UNICODE(unicode);
340 for (i = size; i >= 0; i--)
341 *w++ = *u++;
342 }
343#endif
344
345 return size;
346}
347
348#endif
349
350PyObject *PyUnicode_FromObject(register PyObject *obj)
351{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000352 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
353}
354
355PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
356 const char *encoding,
357 const char *errors)
358{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 const char *s;
360 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000361 int owned = 0;
362 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363
364 if (obj == NULL) {
365 PyErr_BadInternalCall();
366 return NULL;
367 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368
369 /* Coerce object */
370 if (PyInstance_Check(obj)) {
371 PyObject *func;
372 func = PyObject_GetAttrString(obj, "__str__");
373 if (func == NULL) {
374 PyErr_SetString(PyExc_TypeError,
375 "coercing to Unicode: instance doesn't define __str__");
376 return NULL;
377 }
378 obj = PyEval_CallObject(func, NULL);
379 Py_DECREF(func);
380 if (obj == NULL)
381 return NULL;
382 owned = 1;
383 }
384 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = obj;
387 if (encoding) {
388 PyErr_SetString(PyExc_TypeError,
389 "decoding Unicode is not supported");
390 return NULL;
391 }
392 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 }
394 else if (PyString_Check(obj)) {
395 s = PyString_AS_STRING(obj);
396 len = PyString_GET_SIZE(obj);
397 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000398 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
399 /* Overwrite the error message with something more useful in
400 case of a TypeError. */
401 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000402 PyErr_Format(PyExc_TypeError,
403 "coercing to Unicode: need string or buffer, "
404 "%.80s found",
405 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000406 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000407 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000408
409 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 if (len == 0) {
411 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414 else
415 v = PyUnicode_Decode(s, len, encoding, errors);
416 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000417 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 return v;
421
422 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427}
428
429PyObject *PyUnicode_Decode(const char *s,
430 int size,
431 const char *encoding,
432 const char *errors)
433{
434 PyObject *buffer = NULL, *unicode;
435
Fred Drakee4315f52000-05-09 19:53:39 +0000436 if (encoding == NULL)
437 encoding = PyUnicode_GetDefaultEncoding();
438
439 /* Shortcuts for common default encodings */
440 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000442 else if (strcmp(encoding, "latin-1") == 0)
443 return PyUnicode_DecodeLatin1(s, size, errors);
444 else if (strcmp(encoding, "ascii") == 0)
445 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446
447 /* Decode via the codec registry */
448 buffer = PyBuffer_FromMemory((void *)s, size);
449 if (buffer == NULL)
450 goto onError;
451 unicode = PyCodec_Decode(buffer, encoding, errors);
452 if (unicode == NULL)
453 goto onError;
454 if (!PyUnicode_Check(unicode)) {
455 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000456 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457 unicode->ob_type->tp_name);
458 Py_DECREF(unicode);
459 goto onError;
460 }
461 Py_DECREF(buffer);
462 return unicode;
463
464 onError:
465 Py_XDECREF(buffer);
466 return NULL;
467}
468
469PyObject *PyUnicode_Encode(const Py_UNICODE *s,
470 int size,
471 const char *encoding,
472 const char *errors)
473{
474 PyObject *v, *unicode;
475
476 unicode = PyUnicode_FromUnicode(s, size);
477 if (unicode == NULL)
478 return NULL;
479 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
480 Py_DECREF(unicode);
481 return v;
482}
483
484PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
485 const char *encoding,
486 const char *errors)
487{
488 PyObject *v;
489
490 if (!PyUnicode_Check(unicode)) {
491 PyErr_BadArgument();
492 goto onError;
493 }
Fred Drakee4315f52000-05-09 19:53:39 +0000494
495 if (encoding == NULL)
496 encoding = PyUnicode_GetDefaultEncoding();
497
498 /* Shortcuts for common default encodings */
499 if (errors == NULL) {
500 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000502 else if (strcmp(encoding, "latin-1") == 0)
503 return PyUnicode_AsLatin1String(unicode);
504 else if (strcmp(encoding, "ascii") == 0)
505 return PyUnicode_AsASCIIString(unicode);
506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507
508 /* Encode via the codec registry */
509 v = PyCodec_Encode(unicode, encoding, errors);
510 if (v == NULL)
511 goto onError;
512 /* XXX Should we really enforce this ? */
513 if (!PyString_Check(v)) {
514 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000515 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516 v->ob_type->tp_name);
517 Py_DECREF(v);
518 goto onError;
519 }
520 return v;
521
522 onError:
523 return NULL;
524}
525
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000526/* Return a Python string holding the default encoded value of the
527 Unicode object.
528
529 The resulting string is cached in the Unicode object for subsequent
530 usage by this function. The cached version is needed to implement
531 the character buffer interface and will live (at least) as long as
532 the Unicode object itself.
533
534 The refcount of the string is *not* incremented.
535
536 *** Exported for internal use by the interpreter only !!! ***
537
538*/
539
540PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
541 const char *errors)
542{
543 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
544
545 if (v)
546 return v;
547 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
548 if (v && errors == NULL)
549 ((PyUnicodeObject *)unicode)->defenc = v;
550 return v;
551}
552
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
554{
555 if (!PyUnicode_Check(unicode)) {
556 PyErr_BadArgument();
557 goto onError;
558 }
559 return PyUnicode_AS_UNICODE(unicode);
560
561 onError:
562 return NULL;
563}
564
565int PyUnicode_GetSize(PyObject *unicode)
566{
567 if (!PyUnicode_Check(unicode)) {
568 PyErr_BadArgument();
569 goto onError;
570 }
571 return PyUnicode_GET_SIZE(unicode);
572
573 onError:
574 return -1;
575}
576
Thomas Wouters78890102000-07-22 19:25:51 +0000577const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000578{
579 return unicode_default_encoding;
580}
581
582int PyUnicode_SetDefaultEncoding(const char *encoding)
583{
584 PyObject *v;
585
586 /* Make sure the encoding is valid. As side effect, this also
587 loads the encoding into the codec registry cache. */
588 v = _PyCodec_Lookup(encoding);
589 if (v == NULL)
590 goto onError;
591 Py_DECREF(v);
592 strncpy(unicode_default_encoding,
593 encoding,
594 sizeof(unicode_default_encoding));
595 return 0;
596
597 onError:
598 return -1;
599}
600
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601/* --- UTF-8 Codec -------------------------------------------------------- */
602
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   introduces.  Zero means the byte is not a legal prefix.  See
   RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xf7: four, 0xf8 - 0xfb: five, 0xfc - 0xfd: six bytes;
       0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
624
625static
626int utf8_decoding_error(const char **source,
627 Py_UNICODE **dest,
628 const char *errors,
629 const char *details)
630{
631 if ((errors == NULL) ||
632 (strcmp(errors,"strict") == 0)) {
633 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000634 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 details);
636 return -1;
637 }
638 else if (strcmp(errors,"ignore") == 0) {
639 (*source)++;
640 return 0;
641 }
642 else if (strcmp(errors,"replace") == 0) {
643 (*source)++;
644 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
645 (*dest)++;
646 return 0;
647 }
648 else {
649 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 errors);
652 return -1;
653 }
654}
655
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656PyObject *PyUnicode_DecodeUTF8(const char *s,
657 int size,
658 const char *errors)
659{
660 int n;
661 const char *e;
662 PyUnicodeObject *unicode;
663 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665
666 /* Note: size will always be longer than the resulting Unicode
667 character count */
668 unicode = _PyUnicode_New(size);
669 if (!unicode)
670 return NULL;
671 if (size == 0)
672 return (PyObject *)unicode;
673
674 /* Unpack UTF-8 encoded data */
675 p = unicode->str;
676 e = s + size;
677
678 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000679 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000680
681 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000682 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683 s++;
684 continue;
685 }
686
687 n = utf8_code_length[ch];
688
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000689 if (s + n > e) {
690 errmsg = "unexpected end of data";
691 goto utf8Error;
692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693
694 switch (n) {
695
696 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 errmsg = "unexpected code byte";
698 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699 break;
700
701 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 errmsg = "internal error";
703 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000704 break;
705
706 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000707 if ((s[1] & 0xc0) != 0x80) {
708 errmsg = "invalid data";
709 goto utf8Error;
710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000712 if (ch < 0x80) {
713 errmsg = "illegal encoding";
714 goto utf8Error;
715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000716 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000717 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 break;
719
720 case 3:
721 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000722 (s[2] & 0xc0) != 0x80) {
723 errmsg = "invalid data";
724 goto utf8Error;
725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000727 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
728 errmsg = "illegal encoding";
729 goto utf8Error;
730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000732 *p++ = (Py_UNICODE)ch;
733 break;
734
735 case 4:
736 if ((s[1] & 0xc0) != 0x80 ||
737 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000738 (s[3] & 0xc0) != 0x80) {
739 errmsg = "invalid data";
740 goto utf8Error;
741 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000742 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
743 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
744 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000745 if ((ch < 0x10000) || /* minimum value allowed for 4
746 byte encoding */
747 (ch > 0x10ffff)) { /* maximum value allowed for
748 UTF-16 */
749 errmsg = "illegal encoding";
750 goto utf8Error;
751 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000752 /* compute and append the two surrogates: */
753
754 /* translate from 10000..10FFFF to 0..FFFF */
755 ch -= 0x10000;
756
757 /* high surrogate = top 10 bits added to D800 */
758 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
759
760 /* low surrogate = bottom 10 bits added to DC00 */
761 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 break;
763
764 default:
765 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000766 errmsg = "unsupported Unicode code range";
767 goto utf8Error;
768 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000769 }
770 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000771 continue;
772
773 utf8Error:
774 if (utf8_decoding_error(&s, &p, errors, errmsg))
775 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 }
777
778 /* Adjust length */
779 if (_PyUnicode_Resize(unicode, p - unicode->str))
780 goto onError;
781
782 return (PyObject *)unicode;
783
784onError:
785 Py_DECREF(unicode);
786 return NULL;
787}
788
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000789/* Not used anymore, now that the encoder supports UTF-16
790 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000791#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792static
793int utf8_encoding_error(const Py_UNICODE **source,
794 char **dest,
795 const char *errors,
796 const char *details)
797{
798 if ((errors == NULL) ||
799 (strcmp(errors,"strict") == 0)) {
800 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000801 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802 details);
803 return -1;
804 }
805 else if (strcmp(errors,"ignore") == 0) {
806 return 0;
807 }
808 else if (strcmp(errors,"replace") == 0) {
809 **dest = '?';
810 (*dest)++;
811 return 0;
812 }
813 else {
814 PyErr_Format(PyExc_ValueError,
815 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000816 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817 errors);
818 return -1;
819 }
820}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000821#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822
823PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
824 int size,
825 const char *errors)
826{
827 PyObject *v;
828 char *p;
829 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000830 Py_UCS4 ch2;
831 unsigned int cbAllocated = 3 * size;
832 unsigned int cbWritten = 0;
833 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000835 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 if (v == NULL)
837 return NULL;
838 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000839 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
841 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000842 while (i < size) {
843 Py_UCS4 ch = s[i++];
844 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000846 cbWritten++;
847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 else if (ch < 0x0800) {
849 *p++ = 0xc0 | (ch >> 6);
850 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000851 cbWritten += 2;
852 }
853 else {
854 /* Check for high surrogate */
855 if (0xD800 <= ch && ch <= 0xDBFF) {
856 if (i != size) {
857 ch2 = s[i];
858 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
859
860 if (cbWritten >= (cbAllocated - 4)) {
861 /* Provide enough room for some more
862 surrogates */
863 cbAllocated += 4*10;
864 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000865 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000866 }
867
868 /* combine the two values */
869 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
870
871 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000872 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000873 i++;
874 cbWritten += 4;
875 }
876 }
877 }
878 else {
879 *p++ = (char)(0xe0 | (ch >> 12));
880 cbWritten += 3;
881 }
882 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
883 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 }
885 }
886 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000887 if (_PyString_Resize(&v, p - q))
888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 return v;
890
891 onError:
892 Py_DECREF(v);
893 return NULL;
894}
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
897{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 if (!PyUnicode_Check(unicode)) {
899 PyErr_BadArgument();
900 return NULL;
901 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000902 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
903 PyUnicode_GET_SIZE(unicode),
904 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000905}
906
907/* --- UTF-16 Codec ------------------------------------------------------- */
908
909static
910int utf16_decoding_error(const Py_UNICODE **source,
911 Py_UNICODE **dest,
912 const char *errors,
913 const char *details)
914{
915 if ((errors == NULL) ||
916 (strcmp(errors,"strict") == 0)) {
917 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000918 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000919 details);
920 return -1;
921 }
922 else if (strcmp(errors,"ignore") == 0) {
923 return 0;
924 }
925 else if (strcmp(errors,"replace") == 0) {
926 if (dest) {
927 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
928 (*dest)++;
929 }
930 return 0;
931 }
932 else {
933 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000934 "UTF-16 decoding error; "
935 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 errors);
937 return -1;
938 }
939}
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941PyObject *PyUnicode_DecodeUTF16(const char *s,
942 int size,
943 const char *errors,
944 int *byteorder)
945{
946 PyUnicodeObject *unicode;
947 Py_UNICODE *p;
948 const Py_UNICODE *q, *e;
949 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000950 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951
952 /* size should be an even number */
953 if (size % sizeof(Py_UNICODE) != 0) {
954 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
955 return NULL;
956 /* The remaining input chars are ignored if we fall through
957 here... */
958 }
959
960 /* Note: size will always be longer than the resulting Unicode
961 character count */
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
967
968 /* Unpack UTF-16 encoded data */
969 p = unicode->str;
970 q = (Py_UNICODE *)s;
971 e = q + (size / sizeof(Py_UNICODE));
972
973 if (byteorder)
974 bo = *byteorder;
975
976 while (q < e) {
977 register Py_UNICODE ch = *q++;
978
979 /* Check for BOM marks (U+FEFF) in the input and adjust
980 current byte order setting accordingly. Swap input
981 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
982 !) */
983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
984 if (ch == 0xFEFF) {
985 bo = -1;
986 continue;
987 } else if (ch == 0xFFFE) {
988 bo = 1;
989 continue;
990 }
991 if (bo == 1)
992 ch = (ch >> 8) | (ch << 8);
993#else
994 if (ch == 0xFEFF) {
995 bo = 1;
996 continue;
997 } else if (ch == 0xFFFE) {
998 bo = -1;
999 continue;
1000 }
1001 if (bo == -1)
1002 ch = (ch >> 8) | (ch << 8);
1003#endif
1004 if (ch < 0xD800 || ch > 0xDFFF) {
1005 *p++ = ch;
1006 continue;
1007 }
1008
1009 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001010 if (q >= e) {
1011 errmsg = "unexpected end of data";
1012 goto utf16Error;
1013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 if (0xDC00 <= *q && *q <= 0xDFFF) {
1015 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001016 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017 /* This is valid data (a UTF-16 surrogate pair), but
1018 we are not able to store this information since our
1019 Py_UNICODE type only has 16 bits... this might
1020 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001021 errmsg = "code pairs are not supported";
1022 goto utf16Error;
1023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 else
1025 continue;
1026 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001027 errmsg = "illegal encoding";
1028 /* Fall through to report the error */
1029
1030 utf16Error:
1031 if (utf16_decoding_error(&q, &p, errors, errmsg))
1032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 }
1034
1035 if (byteorder)
1036 *byteorder = bo;
1037
1038 /* Adjust length */
1039 if (_PyUnicode_Resize(unicode, p - unicode->str))
1040 goto onError;
1041
1042 return (PyObject *)unicode;
1043
1044onError:
1045 Py_DECREF(unicode);
1046 return NULL;
1047}
1048
1049#undef UTF16_ERROR
1050
1051PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1052 int size,
1053 const char *errors,
1054 int byteorder)
1055{
1056 PyObject *v;
1057 Py_UNICODE *p;
1058 char *q;
1059
1060 /* We don't create UTF-16 pairs... */
1061 v = PyString_FromStringAndSize(NULL,
1062 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1063 if (v == NULL)
1064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065
1066 q = PyString_AS_STRING(v);
1067 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 if (byteorder == 0)
1069 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001070 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001071 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (byteorder == 0 ||
1073#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1074 byteorder == -1
1075#else
1076 byteorder == 1
1077#endif
1078 )
1079 memcpy(p, s, size * sizeof(Py_UNICODE));
1080 else
1081 while (size-- > 0) {
1082 Py_UNICODE ch = *s++;
1083 *p++ = (ch >> 8) | (ch << 8);
1084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 return v;
1086}
1087
1088PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1089{
1090 if (!PyUnicode_Check(unicode)) {
1091 PyErr_BadArgument();
1092 return NULL;
1093 }
1094 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1095 PyUnicode_GET_SIZE(unicode),
1096 NULL,
1097 0);
1098}
1099
1100/* --- Unicode Escape Codec ----------------------------------------------- */
1101
1102static
1103int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001104 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 const char *errors,
1106 const char *details)
1107{
1108 if ((errors == NULL) ||
1109 (strcmp(errors,"strict") == 0)) {
1110 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001111 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 details);
1113 return -1;
1114 }
1115 else if (strcmp(errors,"ignore") == 0) {
1116 return 0;
1117 }
1118 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001119 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 return 0;
1121 }
1122 else {
1123 PyErr_Format(PyExc_ValueError,
1124 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001125 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126 errors);
1127 return -1;
1128 }
1129}
1130
/* Cached pointer to the Unicode character name hash API.  It stays NULL
   until PyUnicode_DecodeUnicodeEscape() meets its first \N escape and
   loads the pointer from the "ucnhashAPI" attribute of the "ucnhash"
   module. */
static _Py_UCNHashAPI *pucnHash = NULL;
1132
/* Case-insensitive comparison of at most `count` characters of s1 and s2.

   Returns 0 when the compared characters are equal ignoring case (or
   when count is 0); otherwise returns the difference between the first
   pair of lower-cased characters that differ.

   Characters are converted to unsigned char before being handed to
   tolower(), because passing a negative char value to the <ctype.h>
   functions is undefined.  Comparison also stops at a NUL terminator
   shared by both strings so that neither buffer is read past its end. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    char c1, c2;

    if (count == 0)
        return 0;

    do {
        c1 = (char)tolower((unsigned char)*s1++);
        c2 = (char)tolower((unsigned char)*s2++);
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
1152
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1154 int size,
1155 const char *errors)
1156{
1157 PyUnicodeObject *v;
1158 Py_UNICODE *p = NULL, *buf = NULL;
1159 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001160 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161
1162 /* Escaped strings will always be longer than the resulting
1163 Unicode string, so we start with size here and then reduce the
1164 length after conversion to the true value. */
1165 v = _PyUnicode_New(size);
1166 if (v == NULL)
1167 goto onError;
1168 if (size == 0)
1169 return (PyObject *)v;
1170 p = buf = PyUnicode_AS_UNICODE(v);
1171 end = s + size;
1172 while (s < end) {
1173 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001174 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 int i;
1176
1177 /* Non-escape characters are interpreted as Unicode ordinals */
1178 if (*s != '\\') {
1179 *p++ = (unsigned char)*s++;
1180 continue;
1181 }
1182
1183 /* \ - Escapes */
1184 s++;
1185 switch (*s++) {
1186
1187 /* \x escapes */
1188 case '\n': break;
1189 case '\\': *p++ = '\\'; break;
1190 case '\'': *p++ = '\''; break;
1191 case '\"': *p++ = '\"'; break;
1192 case 'b': *p++ = '\b'; break;
1193 case 'f': *p++ = '\014'; break; /* FF */
1194 case 't': *p++ = '\t'; break;
1195 case 'n': *p++ = '\n'; break;
1196 case 'r': *p++ = '\r'; break;
1197 case 'v': *p++ = '\013'; break; /* VT */
1198 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1199
1200 /* \OOO (octal) escapes */
1201 case '0': case '1': case '2': case '3':
1202 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001203 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001205 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001207 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001209 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 break;
1211
Fredrik Lundhdf846752000-09-03 11:29:49 +00001212 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001214 for (x = 0, i = 0; i < 2; i++) {
1215 c = (unsigned char)s[i];
1216 if (!isxdigit(c)) {
1217 if (unicodeescape_decoding_error(&s, &x, errors,
1218 "truncated \\xXX"))
1219 goto onError;
1220 i++;
1221 break;
1222 }
1223 x = (x<<4) & ~0xF;
1224 if (c >= '0' && c <= '9')
1225 x += c - '0';
1226 else if (c >= 'a' && c <= 'f')
1227 x += 10 + c - 'a';
1228 else
1229 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001231 s += i;
1232 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 break;
1234
1235 /* \uXXXX with 4 hex digits */
1236 case 'u':
1237 for (x = 0, i = 0; i < 4; i++) {
1238 c = (unsigned char)s[i];
1239 if (!isxdigit(c)) {
1240 if (unicodeescape_decoding_error(&s, &x, errors,
1241 "truncated \\uXXXX"))
1242 goto onError;
1243 i++;
1244 break;
1245 }
1246 x = (x<<4) & ~0xF;
1247 if (c >= '0' && c <= '9')
1248 x += c - '0';
1249 else if (c >= 'a' && c <= 'f')
1250 x += 10 + c - 'a';
1251 else
1252 x += 10 + c - 'A';
1253 }
1254 s += i;
1255 *p++ = x;
1256 break;
1257
Fredrik Lundhdf846752000-09-03 11:29:49 +00001258 /* \UXXXXXXXX with 8 hex digits */
1259 case 'U':
1260 for (chr = 0, i = 0; i < 8; i++) {
1261 c = (unsigned char)s[i];
1262 if (!isxdigit(c)) {
1263 if (unicodeescape_decoding_error(&s, &x, errors,
1264 "truncated \\uXXXX"))
1265 goto onError;
1266 i++;
1267 break;
1268 }
1269 chr = (chr<<4) & ~0xF;
1270 if (c >= '0' && c <= '9')
1271 chr += c - '0';
1272 else if (c >= 'a' && c <= 'f')
1273 chr += 10 + c - 'a';
1274 else
1275 chr += 10 + c - 'A';
1276 }
1277 s += i;
1278 goto store;
1279
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001280 case 'N':
1281 /* Ok, we need to deal with Unicode Character Names now,
1282 * make sure we've imported the hash table data...
1283 */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001284 if (pucnHash == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001285 PyObject *mod = 0, *v = 0;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001286 mod = PyImport_ImportModule("ucnhash");
1287 if (mod == NULL)
1288 goto onError;
1289 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1290 Py_DECREF(mod);
1291 if (v == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001292 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001293 pucnHash = PyCObject_AsVoidPtr(v);
1294 Py_DECREF(v);
1295 if (pucnHash == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001296 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001297 }
1298
Fredrik Lundhdf846752000-09-03 11:29:49 +00001299 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001300 const char *start = s + 1;
1301 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001302 unsigned long j;
1303
1304 /* look for either the closing brace, or we
1305 * exceed the maximum length of the unicode character names
1306 */
1307 while (*endBrace != '}' &&
1308 (unsigned int)(endBrace - start) <=
1309 pucnHash->cchMax &&
1310 endBrace < end)
1311 {
1312 endBrace++;
1313 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001314 if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001315 j = pucnHash->hash(start, endBrace - start);
1316 if (j > pucnHash->cKeys ||
1317 mystrnicmp(
1318 start,
1319 ((_Py_UnicodeCharacterName *)
1320 (pucnHash->getValue(j)))->pszUCN,
1321 (int)(endBrace - start)) != 0)
1322 {
1323 if (unicodeescape_decoding_error(
1324 &s, &x, errors,
1325 "Invalid Unicode Character Name"))
1326 {
1327 goto onError;
1328 }
1329 goto ucnFallthrough;
1330 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001331 chr = ((_Py_UnicodeCharacterName *)
1332 (pucnHash->getValue(j)))->value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001333 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001334 goto store;
1335 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001336 if (unicodeescape_decoding_error(
1337 &s, &x, errors,
1338 "Unicode name missing closing brace"))
1339 goto onError;
1340 goto ucnFallthrough;
1341 }
1342 break;
1343 }
1344 if (unicodeescape_decoding_error(
1345 &s, &x, errors,
1346 "Missing opening brace for Unicode Character Name escape"))
1347 goto onError;
1348ucnFallthrough:
1349 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001350 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 *p++ = '\\';
1352 *p++ = (unsigned char)s[-1];
1353 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001354store:
1355 /* when we get here, chr is a 32-bit unicode character */
1356 if (chr <= 0xffff)
1357 /* UCS-2 character */
1358 *p++ = (Py_UNICODE) chr;
1359 else if (chr <= 0x10ffff) {
1360 /* UCS-4 character. store as two surrogate characters */
1361 chr -= 0x10000L;
1362 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1363 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1364 } else {
1365 if (unicodeescape_decoding_error(
1366 &s, &x, errors,
1367 "Illegal Unicode character")
1368 )
1369 goto onError;
1370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 }
1372 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001373 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return (PyObject *)v;
1376
1377 onError:
1378 Py_XDECREF(v);
1379 return NULL;
1380}
1381
1382/* Return a Unicode-Escape string version of the Unicode object.
1383
1384 If quotes is true, the string is enclosed in u"" or u'' quotes as
1385 appropriate.
1386
1387*/
1388
/* Forward declaration: unicodeescape_string() calls findchar() to test
   whether a quote character occurs in the buffer.  The definition is
   outside this section of the file. */
static const Py_UNICODE *findchar(const Py_UNICODE *s,
                                  int size,
                                  Py_UNICODE ch);
1392
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393static
1394PyObject *unicodeescape_string(const Py_UNICODE *s,
1395 int size,
1396 int quotes)
1397{
1398 PyObject *repr;
1399 char *p;
1400 char *q;
1401
1402 static const char *hexdigit = "0123456789ABCDEF";
1403
1404 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1405 if (repr == NULL)
1406 return NULL;
1407
1408 p = q = PyString_AS_STRING(repr);
1409
1410 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411 *p++ = 'u';
1412 *p++ = (findchar(s, size, '\'') &&
1413 !findchar(s, size, '"')) ? '"' : '\'';
1414 }
1415 while (size-- > 0) {
1416 Py_UNICODE ch = *s++;
1417 /* Escape quotes */
1418 if (quotes && (ch == q[1] || ch == '\\')) {
1419 *p++ = '\\';
1420 *p++ = (char) ch;
1421 }
1422 /* Map 16-bit characters to '\uxxxx' */
1423 else if (ch >= 256) {
1424 *p++ = '\\';
1425 *p++ = 'u';
1426 *p++ = hexdigit[(ch >> 12) & 0xf];
1427 *p++ = hexdigit[(ch >> 8) & 0xf];
1428 *p++ = hexdigit[(ch >> 4) & 0xf];
1429 *p++ = hexdigit[ch & 15];
1430 }
1431 /* Map non-printable US ASCII to '\ooo' */
1432 else if (ch < ' ' || ch >= 128) {
1433 *p++ = '\\';
1434 *p++ = hexdigit[(ch >> 6) & 7];
1435 *p++ = hexdigit[(ch >> 3) & 7];
1436 *p++ = hexdigit[ch & 7];
1437 }
1438 /* Copy everything else as-is */
1439 else
1440 *p++ = (char) ch;
1441 }
1442 if (quotes)
1443 *p++ = q[1];
1444
1445 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001446 if (_PyString_Resize(&repr, p - q))
1447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
1449 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001450
1451 onError:
1452 Py_DECREF(repr);
1453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454}
1455
1456PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1457 int size)
1458{
1459 return unicodeescape_string(s, size, 0);
1460}
1461
1462PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1463{
1464 if (!PyUnicode_Check(unicode)) {
1465 PyErr_BadArgument();
1466 return NULL;
1467 }
1468 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1469 PyUnicode_GET_SIZE(unicode));
1470}
1471
1472/* --- Raw Unicode Escape Codec ------------------------------------------- */
1473
1474PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1475 int size,
1476 const char *errors)
1477{
1478 PyUnicodeObject *v;
1479 Py_UNICODE *p, *buf;
1480 const char *end;
1481 const char *bs;
1482
1483 /* Escaped strings will always be longer than the resulting
1484 Unicode string, so we start with size here and then reduce the
1485 length after conversion to the true value. */
1486 v = _PyUnicode_New(size);
1487 if (v == NULL)
1488 goto onError;
1489 if (size == 0)
1490 return (PyObject *)v;
1491 p = buf = PyUnicode_AS_UNICODE(v);
1492 end = s + size;
1493 while (s < end) {
1494 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001495 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 int i;
1497
1498 /* Non-escape characters are interpreted as Unicode ordinals */
1499 if (*s != '\\') {
1500 *p++ = (unsigned char)*s++;
1501 continue;
1502 }
1503
1504 /* \u-escapes are only interpreted iff the number of leading
1505 backslashes if odd */
1506 bs = s;
1507 for (;s < end;) {
1508 if (*s != '\\')
1509 break;
1510 *p++ = (unsigned char)*s++;
1511 }
1512 if (((s - bs) & 1) == 0 ||
1513 s >= end ||
1514 *s != 'u') {
1515 continue;
1516 }
1517 p--;
1518 s++;
1519
1520 /* \uXXXX with 4 hex digits */
1521 for (x = 0, i = 0; i < 4; i++) {
1522 c = (unsigned char)s[i];
1523 if (!isxdigit(c)) {
1524 if (unicodeescape_decoding_error(&s, &x, errors,
1525 "truncated \\uXXXX"))
1526 goto onError;
1527 i++;
1528 break;
1529 }
1530 x = (x<<4) & ~0xF;
1531 if (c >= '0' && c <= '9')
1532 x += c - '0';
1533 else if (c >= 'a' && c <= 'f')
1534 x += 10 + c - 'a';
1535 else
1536 x += 10 + c - 'A';
1537 }
1538 s += i;
1539 *p++ = x;
1540 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001541 if (_PyUnicode_Resize(v, (int)(p - buf)))
1542 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543 return (PyObject *)v;
1544
1545 onError:
1546 Py_XDECREF(v);
1547 return NULL;
1548}
1549
1550PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1551 int size)
1552{
1553 PyObject *repr;
1554 char *p;
1555 char *q;
1556
1557 static const char *hexdigit = "0123456789ABCDEF";
1558
1559 repr = PyString_FromStringAndSize(NULL, 6 * size);
1560 if (repr == NULL)
1561 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001562 if (size == 0)
1563 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564
1565 p = q = PyString_AS_STRING(repr);
1566 while (size-- > 0) {
1567 Py_UNICODE ch = *s++;
1568 /* Map 16-bit characters to '\uxxxx' */
1569 if (ch >= 256) {
1570 *p++ = '\\';
1571 *p++ = 'u';
1572 *p++ = hexdigit[(ch >> 12) & 0xf];
1573 *p++ = hexdigit[(ch >> 8) & 0xf];
1574 *p++ = hexdigit[(ch >> 4) & 0xf];
1575 *p++ = hexdigit[ch & 15];
1576 }
1577 /* Copy everything else as-is */
1578 else
1579 *p++ = (char) ch;
1580 }
1581 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001582 if (_PyString_Resize(&repr, p - q))
1583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584
1585 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001586
1587 onError:
1588 Py_DECREF(repr);
1589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590}
1591
1592PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1593{
1594 if (!PyUnicode_Check(unicode)) {
1595 PyErr_BadArgument();
1596 return NULL;
1597 }
1598 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1599 PyUnicode_GET_SIZE(unicode));
1600}
1601
1602/* --- Latin-1 Codec ------------------------------------------------------ */
1603
1604PyObject *PyUnicode_DecodeLatin1(const char *s,
1605 int size,
1606 const char *errors)
1607{
1608 PyUnicodeObject *v;
1609 Py_UNICODE *p;
1610
1611 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1612 v = _PyUnicode_New(size);
1613 if (v == NULL)
1614 goto onError;
1615 if (size == 0)
1616 return (PyObject *)v;
1617 p = PyUnicode_AS_UNICODE(v);
1618 while (size-- > 0)
1619 *p++ = (unsigned char)*s++;
1620 return (PyObject *)v;
1621
1622 onError:
1623 Py_XDECREF(v);
1624 return NULL;
1625}
1626
1627static
1628int latin1_encoding_error(const Py_UNICODE **source,
1629 char **dest,
1630 const char *errors,
1631 const char *details)
1632{
1633 if ((errors == NULL) ||
1634 (strcmp(errors,"strict") == 0)) {
1635 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001636 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637 details);
1638 return -1;
1639 }
1640 else if (strcmp(errors,"ignore") == 0) {
1641 return 0;
1642 }
1643 else if (strcmp(errors,"replace") == 0) {
1644 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001645 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 return 0;
1647 }
1648 else {
1649 PyErr_Format(PyExc_ValueError,
1650 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001651 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 errors);
1653 return -1;
1654 }
1655}
1656
1657PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1658 int size,
1659 const char *errors)
1660{
1661 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001662 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001663
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664 repr = PyString_FromStringAndSize(NULL, size);
1665 if (repr == NULL)
1666 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001667 if (size == 0)
1668 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669
1670 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001671 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672 while (size-- > 0) {
1673 Py_UNICODE ch = *p++;
1674 if (ch >= 256) {
1675 if (latin1_encoding_error(&p, &s, errors,
1676 "ordinal not in range(256)"))
1677 goto onError;
1678 }
1679 else
1680 *s++ = (char)ch;
1681 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001682 /* Resize if error handling skipped some characters */
1683 if (s - start < PyString_GET_SIZE(repr))
1684 if (_PyString_Resize(&repr, s - start))
1685 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return repr;
1687
1688 onError:
1689 Py_DECREF(repr);
1690 return NULL;
1691}
1692
1693PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1694{
1695 if (!PyUnicode_Check(unicode)) {
1696 PyErr_BadArgument();
1697 return NULL;
1698 }
1699 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1700 PyUnicode_GET_SIZE(unicode),
1701 NULL);
1702}
1703
1704/* --- 7-bit ASCII Codec -------------------------------------------------- */
1705
1706static
1707int ascii_decoding_error(const char **source,
1708 Py_UNICODE **dest,
1709 const char *errors,
1710 const char *details)
1711{
1712 if ((errors == NULL) ||
1713 (strcmp(errors,"strict") == 0)) {
1714 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001715 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 details);
1717 return -1;
1718 }
1719 else if (strcmp(errors,"ignore") == 0) {
1720 return 0;
1721 }
1722 else if (strcmp(errors,"replace") == 0) {
1723 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1724 (*dest)++;
1725 return 0;
1726 }
1727 else {
1728 PyErr_Format(PyExc_ValueError,
1729 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001730 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 errors);
1732 return -1;
1733 }
1734}
1735
1736PyObject *PyUnicode_DecodeASCII(const char *s,
1737 int size,
1738 const char *errors)
1739{
1740 PyUnicodeObject *v;
1741 Py_UNICODE *p;
1742
1743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744 v = _PyUnicode_New(size);
1745 if (v == NULL)
1746 goto onError;
1747 if (size == 0)
1748 return (PyObject *)v;
1749 p = PyUnicode_AS_UNICODE(v);
1750 while (size-- > 0) {
1751 register unsigned char c;
1752
1753 c = (unsigned char)*s++;
1754 if (c < 128)
1755 *p++ = c;
1756 else if (ascii_decoding_error(&s, &p, errors,
1757 "ordinal not in range(128)"))
1758 goto onError;
1759 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001760 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1761 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 return (PyObject *)v;
1764
1765 onError:
1766 Py_XDECREF(v);
1767 return NULL;
1768}
1769
1770static
1771int ascii_encoding_error(const Py_UNICODE **source,
1772 char **dest,
1773 const char *errors,
1774 const char *details)
1775{
1776 if ((errors == NULL) ||
1777 (strcmp(errors,"strict") == 0)) {
1778 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001779 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 details);
1781 return -1;
1782 }
1783 else if (strcmp(errors,"ignore") == 0) {
1784 return 0;
1785 }
1786 else if (strcmp(errors,"replace") == 0) {
1787 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001788 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 return 0;
1790 }
1791 else {
1792 PyErr_Format(PyExc_ValueError,
1793 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001794 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 errors);
1796 return -1;
1797 }
1798}
1799
1800PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1801 int size,
1802 const char *errors)
1803{
1804 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001805 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 repr = PyString_FromStringAndSize(NULL, size);
1808 if (repr == NULL)
1809 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001810 if (size == 0)
1811 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812
1813 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001814 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 while (size-- > 0) {
1816 Py_UNICODE ch = *p++;
1817 if (ch >= 128) {
1818 if (ascii_encoding_error(&p, &s, errors,
1819 "ordinal not in range(128)"))
1820 goto onError;
1821 }
1822 else
1823 *s++ = (char)ch;
1824 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001825 /* Resize if error handling skipped some characters */
1826 if (s - start < PyString_GET_SIZE(repr))
1827 if (_PyString_Resize(&repr, s - start))
1828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 return repr;
1830
1831 onError:
1832 Py_DECREF(repr);
1833 return NULL;
1834}
1835
1836PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1837{
1838 if (!PyUnicode_Check(unicode)) {
1839 PyErr_BadArgument();
1840 return NULL;
1841 }
1842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1843 PyUnicode_GET_SIZE(unicode),
1844 NULL);
1845}
1846
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001847#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001848
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001849/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001851PyObject *PyUnicode_DecodeMBCS(const char *s,
1852 int size,
1853 const char *errors)
1854{
1855 PyUnicodeObject *v;
1856 Py_UNICODE *p;
1857
1858 /* First get the size of the result */
1859 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001860 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1862
1863 v = _PyUnicode_New(usize);
1864 if (v == NULL)
1865 return NULL;
1866 if (usize == 0)
1867 return (PyObject *)v;
1868 p = PyUnicode_AS_UNICODE(v);
1869 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1870 Py_DECREF(v);
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872 }
1873
1874 return (PyObject *)v;
1875}
1876
1877PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1878 int size,
1879 const char *errors)
1880{
1881 PyObject *repr;
1882 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001883 DWORD mbcssize;
1884
1885 /* If there are no characters, bail now! */
1886 if (size==0)
1887 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888
1889 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001890 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001891 if (mbcssize==0)
1892 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1893
1894 repr = PyString_FromStringAndSize(NULL, mbcssize);
1895 if (repr == NULL)
1896 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001897 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001898 return repr;
1899
1900 /* Do the conversion */
1901 s = PyString_AS_STRING(repr);
1902 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1903 Py_DECREF(repr);
1904 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1905 }
1906 return repr;
1907}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001908
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001909#endif /* MS_WIN32 */
1910
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911/* --- Character Mapping Codec -------------------------------------------- */
1912
1913static
1914int charmap_decoding_error(const char **source,
1915 Py_UNICODE **dest,
1916 const char *errors,
1917 const char *details)
1918{
1919 if ((errors == NULL) ||
1920 (strcmp(errors,"strict") == 0)) {
1921 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001922 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 details);
1924 return -1;
1925 }
1926 else if (strcmp(errors,"ignore") == 0) {
1927 return 0;
1928 }
1929 else if (strcmp(errors,"replace") == 0) {
1930 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1931 (*dest)++;
1932 return 0;
1933 }
1934 else {
1935 PyErr_Format(PyExc_ValueError,
1936 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001937 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 errors);
1939 return -1;
1940 }
1941}
1942
1943PyObject *PyUnicode_DecodeCharmap(const char *s,
1944 int size,
1945 PyObject *mapping,
1946 const char *errors)
1947{
1948 PyUnicodeObject *v;
1949 Py_UNICODE *p;
1950
1951 /* Default to Latin-1 */
1952 if (mapping == NULL)
1953 return PyUnicode_DecodeLatin1(s, size, errors);
1954
1955 v = _PyUnicode_New(size);
1956 if (v == NULL)
1957 goto onError;
1958 if (size == 0)
1959 return (PyObject *)v;
1960 p = PyUnicode_AS_UNICODE(v);
1961 while (size-- > 0) {
1962 unsigned char ch = *s++;
1963 PyObject *w, *x;
1964
1965 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1966 w = PyInt_FromLong((long)ch);
1967 if (w == NULL)
1968 goto onError;
1969 x = PyObject_GetItem(mapping, w);
1970 Py_DECREF(w);
1971 if (x == NULL) {
1972 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001973 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001975 x = Py_None;
1976 Py_INCREF(x);
1977 } else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978 goto onError;
1979 }
1980
1981 /* Apply mapping */
1982 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001983 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 if (value < 0 || value > 65535) {
1985 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001986 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 Py_DECREF(x);
1988 goto onError;
1989 }
1990 *p++ = (Py_UNICODE)value;
1991 }
1992 else if (x == Py_None) {
1993 /* undefined mapping */
1994 if (charmap_decoding_error(&s, &p, errors,
1995 "character maps to <undefined>")) {
1996 Py_DECREF(x);
1997 goto onError;
1998 }
1999 }
2000 else if (PyUnicode_Check(x)) {
2001 if (PyUnicode_GET_SIZE(x) != 1) {
2002 /* 1-n mapping */
2003 PyErr_SetString(PyExc_NotImplementedError,
2004 "1-n mappings are currently not implemented");
2005 Py_DECREF(x);
2006 goto onError;
2007 }
2008 *p++ = *PyUnicode_AS_UNICODE(x);
2009 }
2010 else {
2011 /* wrong return value */
2012 PyErr_SetString(PyExc_TypeError,
2013 "character mapping must return integer, None or unicode");
2014 Py_DECREF(x);
2015 goto onError;
2016 }
2017 Py_DECREF(x);
2018 }
2019 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2020 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2021 goto onError;
2022 return (PyObject *)v;
2023
2024 onError:
2025 Py_XDECREF(v);
2026 return NULL;
2027}
2028
2029static
2030int charmap_encoding_error(const Py_UNICODE **source,
2031 char **dest,
2032 const char *errors,
2033 const char *details)
2034{
2035 if ((errors == NULL) ||
2036 (strcmp(errors,"strict") == 0)) {
2037 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002038 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 details);
2040 return -1;
2041 }
2042 else if (strcmp(errors,"ignore") == 0) {
2043 return 0;
2044 }
2045 else if (strcmp(errors,"replace") == 0) {
2046 **dest = '?';
2047 (*dest)++;
2048 return 0;
2049 }
2050 else {
2051 PyErr_Format(PyExc_ValueError,
2052 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002053 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 errors);
2055 return -1;
2056 }
2057}
2058
2059PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2060 int size,
2061 PyObject *mapping,
2062 const char *errors)
2063{
2064 PyObject *v;
2065 char *s;
2066
2067 /* Default to Latin-1 */
2068 if (mapping == NULL)
2069 return PyUnicode_EncodeLatin1(p, size, errors);
2070
2071 v = PyString_FromStringAndSize(NULL, size);
2072 if (v == NULL)
2073 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002074 if (size == 0)
2075 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 s = PyString_AS_STRING(v);
2077 while (size-- > 0) {
2078 Py_UNICODE ch = *p++;
2079 PyObject *w, *x;
2080
2081 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2082 w = PyInt_FromLong((long)ch);
2083 if (w == NULL)
2084 goto onError;
2085 x = PyObject_GetItem(mapping, w);
2086 Py_DECREF(w);
2087 if (x == NULL) {
2088 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002089 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002091 x = Py_None;
2092 Py_INCREF(x);
2093 } else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 goto onError;
2095 }
2096
2097 /* Apply mapping */
2098 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002099 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 if (value < 0 || value > 255) {
2101 PyErr_SetString(PyExc_TypeError,
2102 "character mapping must be in range(256)");
2103 Py_DECREF(x);
2104 goto onError;
2105 }
2106 *s++ = (char)value;
2107 }
2108 else if (x == Py_None) {
2109 /* undefined mapping */
2110 if (charmap_encoding_error(&p, &s, errors,
2111 "character maps to <undefined>")) {
2112 Py_DECREF(x);
2113 goto onError;
2114 }
2115 }
2116 else if (PyString_Check(x)) {
2117 if (PyString_GET_SIZE(x) != 1) {
2118 /* 1-n mapping */
2119 PyErr_SetString(PyExc_NotImplementedError,
2120 "1-n mappings are currently not implemented");
2121 Py_DECREF(x);
2122 goto onError;
2123 }
2124 *s++ = *PyString_AS_STRING(x);
2125 }
2126 else {
2127 /* wrong return value */
2128 PyErr_SetString(PyExc_TypeError,
2129 "character mapping must return integer, None or unicode");
2130 Py_DECREF(x);
2131 goto onError;
2132 }
2133 Py_DECREF(x);
2134 }
2135 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2136 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2137 goto onError;
2138 return v;
2139
2140 onError:
2141 Py_DECREF(v);
2142 return NULL;
2143}
2144
2145PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2146 PyObject *mapping)
2147{
2148 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2149 PyErr_BadArgument();
2150 return NULL;
2151 }
2152 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2153 PyUnicode_GET_SIZE(unicode),
2154 mapping,
2155 NULL);
2156}
2157
2158static
2159int translate_error(const Py_UNICODE **source,
2160 Py_UNICODE **dest,
2161 const char *errors,
2162 const char *details)
2163{
2164 if ((errors == NULL) ||
2165 (strcmp(errors,"strict") == 0)) {
2166 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002167 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 details);
2169 return -1;
2170 }
2171 else if (strcmp(errors,"ignore") == 0) {
2172 return 0;
2173 }
2174 else if (strcmp(errors,"replace") == 0) {
2175 **dest = '?';
2176 (*dest)++;
2177 return 0;
2178 }
2179 else {
2180 PyErr_Format(PyExc_ValueError,
2181 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002182 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 errors);
2184 return -1;
2185 }
2186}
2187
2188PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2189 int size,
2190 PyObject *mapping,
2191 const char *errors)
2192{
2193 PyUnicodeObject *v;
2194 Py_UNICODE *p;
2195
2196 if (mapping == NULL) {
2197 PyErr_BadArgument();
2198 return NULL;
2199 }
2200
2201 /* Output will never be longer than input */
2202 v = _PyUnicode_New(size);
2203 if (v == NULL)
2204 goto onError;
2205 if (size == 0)
2206 goto done;
2207 p = PyUnicode_AS_UNICODE(v);
2208 while (size-- > 0) {
2209 Py_UNICODE ch = *s++;
2210 PyObject *w, *x;
2211
2212 /* Get mapping */
2213 w = PyInt_FromLong(ch);
2214 if (w == NULL)
2215 goto onError;
2216 x = PyObject_GetItem(mapping, w);
2217 Py_DECREF(w);
2218 if (x == NULL) {
2219 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2220 /* No mapping found: default to 1-1 mapping */
2221 PyErr_Clear();
2222 *p++ = ch;
2223 continue;
2224 }
2225 goto onError;
2226 }
2227
2228 /* Apply mapping */
2229 if (PyInt_Check(x))
2230 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2231 else if (x == Py_None) {
2232 /* undefined mapping */
2233 if (translate_error(&s, &p, errors,
2234 "character maps to <undefined>")) {
2235 Py_DECREF(x);
2236 goto onError;
2237 }
2238 }
2239 else if (PyUnicode_Check(x)) {
2240 if (PyUnicode_GET_SIZE(x) != 1) {
2241 /* 1-n mapping */
2242 PyErr_SetString(PyExc_NotImplementedError,
2243 "1-n mappings are currently not implemented");
2244 Py_DECREF(x);
2245 goto onError;
2246 }
2247 *p++ = *PyUnicode_AS_UNICODE(x);
2248 }
2249 else {
2250 /* wrong return value */
2251 PyErr_SetString(PyExc_TypeError,
2252 "translate mapping must return integer, None or unicode");
2253 Py_DECREF(x);
2254 goto onError;
2255 }
2256 Py_DECREF(x);
2257 }
2258 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002259 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261
2262 done:
2263 return (PyObject *)v;
2264
2265 onError:
2266 Py_XDECREF(v);
2267 return NULL;
2268}
2269
2270PyObject *PyUnicode_Translate(PyObject *str,
2271 PyObject *mapping,
2272 const char *errors)
2273{
2274 PyObject *result;
2275
2276 str = PyUnicode_FromObject(str);
2277 if (str == NULL)
2278 goto onError;
2279 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2280 PyUnicode_GET_SIZE(str),
2281 mapping,
2282 errors);
2283 Py_DECREF(str);
2284 return result;
2285
2286 onError:
2287 Py_XDECREF(str);
2288 return NULL;
2289}
2290
Guido van Rossum9e896b32000-04-05 20:11:21 +00002291/* --- Decimal Encoder ---------------------------------------------------- */
2292
2293int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2294 int length,
2295 char *output,
2296 const char *errors)
2297{
2298 Py_UNICODE *p, *end;
2299
2300 if (output == NULL) {
2301 PyErr_BadArgument();
2302 return -1;
2303 }
2304
2305 p = s;
2306 end = s + length;
2307 while (p < end) {
2308 register Py_UNICODE ch = *p++;
2309 int decimal;
2310
2311 if (Py_UNICODE_ISSPACE(ch)) {
2312 *output++ = ' ';
2313 continue;
2314 }
2315 decimal = Py_UNICODE_TODECIMAL(ch);
2316 if (decimal >= 0) {
2317 *output++ = '0' + decimal;
2318 continue;
2319 }
Guido van Rossumba477042000-04-06 18:18:10 +00002320 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002321 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002322 continue;
2323 }
2324 /* All other characters are considered invalid */
2325 if (errors == NULL || strcmp(errors, "strict") == 0) {
2326 PyErr_SetString(PyExc_ValueError,
2327 "invalid decimal Unicode string");
2328 goto onError;
2329 }
2330 else if (strcmp(errors, "ignore") == 0)
2331 continue;
2332 else if (strcmp(errors, "replace") == 0) {
2333 *output++ = '?';
2334 continue;
2335 }
2336 }
2337 /* 0-terminate the output string */
2338 *output++ = '\0';
2339 return 0;
2340
2341 onError:
2342 return -1;
2343}
2344
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345/* --- Helpers ------------------------------------------------------------ */
2346
2347static
2348int count(PyUnicodeObject *self,
2349 int start,
2350 int end,
2351 PyUnicodeObject *substring)
2352{
2353 int count = 0;
2354
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002355 if (substring->length == 0)
2356 return (end - start + 1);
2357
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358 end -= substring->length;
2359
2360 while (start <= end)
2361 if (Py_UNICODE_MATCH(self, start, substring)) {
2362 count++;
2363 start += substring->length;
2364 } else
2365 start++;
2366
2367 return count;
2368}
2369
2370int PyUnicode_Count(PyObject *str,
2371 PyObject *substr,
2372 int start,
2373 int end)
2374{
2375 int result;
2376
2377 str = PyUnicode_FromObject(str);
2378 if (str == NULL)
2379 return -1;
2380 substr = PyUnicode_FromObject(substr);
2381 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002382 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383 return -1;
2384 }
2385
2386 result = count((PyUnicodeObject *)str,
2387 start, end,
2388 (PyUnicodeObject *)substr);
2389
2390 Py_DECREF(str);
2391 Py_DECREF(substr);
2392 return result;
2393}
2394
2395static
2396int findstring(PyUnicodeObject *self,
2397 PyUnicodeObject *substring,
2398 int start,
2399 int end,
2400 int direction)
2401{
2402 if (start < 0)
2403 start += self->length;
2404 if (start < 0)
2405 start = 0;
2406
2407 if (substring->length == 0)
2408 return start;
2409
2410 if (end > self->length)
2411 end = self->length;
2412 if (end < 0)
2413 end += self->length;
2414 if (end < 0)
2415 end = 0;
2416
2417 end -= substring->length;
2418
2419 if (direction < 0) {
2420 for (; end >= start; end--)
2421 if (Py_UNICODE_MATCH(self, end, substring))
2422 return end;
2423 } else {
2424 for (; start <= end; start++)
2425 if (Py_UNICODE_MATCH(self, start, substring))
2426 return start;
2427 }
2428
2429 return -1;
2430}
2431
2432int PyUnicode_Find(PyObject *str,
2433 PyObject *substr,
2434 int start,
2435 int end,
2436 int direction)
2437{
2438 int result;
2439
2440 str = PyUnicode_FromObject(str);
2441 if (str == NULL)
2442 return -1;
2443 substr = PyUnicode_FromObject(substr);
2444 if (substr == NULL) {
2445 Py_DECREF(substr);
2446 return -1;
2447 }
2448
2449 result = findstring((PyUnicodeObject *)str,
2450 (PyUnicodeObject *)substr,
2451 start, end, direction);
2452 Py_DECREF(str);
2453 Py_DECREF(substr);
2454 return result;
2455}
2456
2457static
2458int tailmatch(PyUnicodeObject *self,
2459 PyUnicodeObject *substring,
2460 int start,
2461 int end,
2462 int direction)
2463{
2464 if (start < 0)
2465 start += self->length;
2466 if (start < 0)
2467 start = 0;
2468
2469 if (substring->length == 0)
2470 return 1;
2471
2472 if (end > self->length)
2473 end = self->length;
2474 if (end < 0)
2475 end += self->length;
2476 if (end < 0)
2477 end = 0;
2478
2479 end -= substring->length;
2480 if (end < start)
2481 return 0;
2482
2483 if (direction > 0) {
2484 if (Py_UNICODE_MATCH(self, end, substring))
2485 return 1;
2486 } else {
2487 if (Py_UNICODE_MATCH(self, start, substring))
2488 return 1;
2489 }
2490
2491 return 0;
2492}
2493
2494int PyUnicode_Tailmatch(PyObject *str,
2495 PyObject *substr,
2496 int start,
2497 int end,
2498 int direction)
2499{
2500 int result;
2501
2502 str = PyUnicode_FromObject(str);
2503 if (str == NULL)
2504 return -1;
2505 substr = PyUnicode_FromObject(substr);
2506 if (substr == NULL) {
2507 Py_DECREF(substr);
2508 return -1;
2509 }
2510
2511 result = tailmatch((PyUnicodeObject *)str,
2512 (PyUnicodeObject *)substr,
2513 start, end, direction);
2514 Py_DECREF(str);
2515 Py_DECREF(substr);
2516 return result;
2517}
2518
2519static
2520const Py_UNICODE *findchar(const Py_UNICODE *s,
2521 int size,
2522 Py_UNICODE ch)
2523{
2524 /* like wcschr, but doesn't stop at NULL characters */
2525
2526 while (size-- > 0) {
2527 if (*s == ch)
2528 return s;
2529 s++;
2530 }
2531
2532 return NULL;
2533}
2534
2535/* Apply fixfct filter to the Unicode object self and return a
2536 reference to the modified object */
2537
2538static
2539PyObject *fixup(PyUnicodeObject *self,
2540 int (*fixfct)(PyUnicodeObject *s))
2541{
2542
2543 PyUnicodeObject *u;
2544
2545 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2546 self->length);
2547 if (u == NULL)
2548 return NULL;
2549 if (!fixfct(u)) {
2550 /* fixfct should return TRUE if it modified the buffer. If
2551 FALSE, return a reference to the original buffer instead
2552 (to save space, not time) */
2553 Py_INCREF(self);
2554 Py_DECREF(u);
2555 return (PyObject*) self;
2556 }
2557 return (PyObject*) u;
2558}
2559
2560static
2561int fixupper(PyUnicodeObject *self)
2562{
2563 int len = self->length;
2564 Py_UNICODE *s = self->str;
2565 int status = 0;
2566
2567 while (len-- > 0) {
2568 register Py_UNICODE ch;
2569
2570 ch = Py_UNICODE_TOUPPER(*s);
2571 if (ch != *s) {
2572 status = 1;
2573 *s = ch;
2574 }
2575 s++;
2576 }
2577
2578 return status;
2579}
2580
2581static
2582int fixlower(PyUnicodeObject *self)
2583{
2584 int len = self->length;
2585 Py_UNICODE *s = self->str;
2586 int status = 0;
2587
2588 while (len-- > 0) {
2589 register Py_UNICODE ch;
2590
2591 ch = Py_UNICODE_TOLOWER(*s);
2592 if (ch != *s) {
2593 status = 1;
2594 *s = ch;
2595 }
2596 s++;
2597 }
2598
2599 return status;
2600}
2601
2602static
2603int fixswapcase(PyUnicodeObject *self)
2604{
2605 int len = self->length;
2606 Py_UNICODE *s = self->str;
2607 int status = 0;
2608
2609 while (len-- > 0) {
2610 if (Py_UNICODE_ISUPPER(*s)) {
2611 *s = Py_UNICODE_TOLOWER(*s);
2612 status = 1;
2613 } else if (Py_UNICODE_ISLOWER(*s)) {
2614 *s = Py_UNICODE_TOUPPER(*s);
2615 status = 1;
2616 }
2617 s++;
2618 }
2619
2620 return status;
2621}
2622
2623static
2624int fixcapitalize(PyUnicodeObject *self)
2625{
2626 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2627 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2628 return 1;
2629 }
2630 return 0;
2631}
2632
2633static
2634int fixtitle(PyUnicodeObject *self)
2635{
2636 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2637 register Py_UNICODE *e;
2638 int previous_is_cased;
2639
2640 /* Shortcut for single character strings */
2641 if (PyUnicode_GET_SIZE(self) == 1) {
2642 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2643 if (*p != ch) {
2644 *p = ch;
2645 return 1;
2646 }
2647 else
2648 return 0;
2649 }
2650
2651 e = p + PyUnicode_GET_SIZE(self);
2652 previous_is_cased = 0;
2653 for (; p < e; p++) {
2654 register const Py_UNICODE ch = *p;
2655
2656 if (previous_is_cased)
2657 *p = Py_UNICODE_TOLOWER(ch);
2658 else
2659 *p = Py_UNICODE_TOTITLE(ch);
2660
2661 if (Py_UNICODE_ISLOWER(ch) ||
2662 Py_UNICODE_ISUPPER(ch) ||
2663 Py_UNICODE_ISTITLE(ch))
2664 previous_is_cased = 1;
2665 else
2666 previous_is_cased = 0;
2667 }
2668 return 1;
2669}
2670
2671PyObject *PyUnicode_Join(PyObject *separator,
2672 PyObject *seq)
2673{
2674 Py_UNICODE *sep;
2675 int seplen;
2676 PyUnicodeObject *res = NULL;
2677 int reslen = 0;
2678 Py_UNICODE *p;
2679 int seqlen = 0;
2680 int sz = 100;
2681 int i;
2682
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002683 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 if (seqlen < 0 && PyErr_Occurred())
2685 return NULL;
2686
2687 if (separator == NULL) {
2688 Py_UNICODE blank = ' ';
2689 sep = &blank;
2690 seplen = 1;
2691 }
2692 else {
2693 separator = PyUnicode_FromObject(separator);
2694 if (separator == NULL)
2695 return NULL;
2696 sep = PyUnicode_AS_UNICODE(separator);
2697 seplen = PyUnicode_GET_SIZE(separator);
2698 }
2699
2700 res = _PyUnicode_New(sz);
2701 if (res == NULL)
2702 goto onError;
2703 p = PyUnicode_AS_UNICODE(res);
2704 reslen = 0;
2705
2706 for (i = 0; i < seqlen; i++) {
2707 int itemlen;
2708 PyObject *item;
2709
2710 item = PySequence_GetItem(seq, i);
2711 if (item == NULL)
2712 goto onError;
2713 if (!PyUnicode_Check(item)) {
2714 PyObject *v;
2715 v = PyUnicode_FromObject(item);
2716 Py_DECREF(item);
2717 item = v;
2718 if (item == NULL)
2719 goto onError;
2720 }
2721 itemlen = PyUnicode_GET_SIZE(item);
2722 while (reslen + itemlen + seplen >= sz) {
2723 if (_PyUnicode_Resize(res, sz*2))
2724 goto onError;
2725 sz *= 2;
2726 p = PyUnicode_AS_UNICODE(res) + reslen;
2727 }
2728 if (i > 0) {
2729 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2730 p += seplen;
2731 reslen += seplen;
2732 }
2733 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2734 p += itemlen;
2735 reslen += itemlen;
2736 Py_DECREF(item);
2737 }
2738 if (_PyUnicode_Resize(res, reslen))
2739 goto onError;
2740
2741 Py_XDECREF(separator);
2742 return (PyObject *)res;
2743
2744 onError:
2745 Py_XDECREF(separator);
2746 Py_DECREF(res);
2747 return NULL;
2748}
2749
2750static
2751PyUnicodeObject *pad(PyUnicodeObject *self,
2752 int left,
2753 int right,
2754 Py_UNICODE fill)
2755{
2756 PyUnicodeObject *u;
2757
2758 if (left < 0)
2759 left = 0;
2760 if (right < 0)
2761 right = 0;
2762
2763 if (left == 0 && right == 0) {
2764 Py_INCREF(self);
2765 return self;
2766 }
2767
2768 u = _PyUnicode_New(left + self->length + right);
2769 if (u) {
2770 if (left)
2771 Py_UNICODE_FILL(u->str, fill, left);
2772 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2773 if (right)
2774 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2775 }
2776
2777 return u;
2778}
2779
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `PyObject *list` and an `onError` label; jumps to onError
   if creating the string or appending it fails.  Wrapped in
   do { } while (0) so that it expands to exactly one statement, with the
   arguments parenthesized so that any expression can be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2790
2791static
2792PyObject *split_whitespace(PyUnicodeObject *self,
2793 PyObject *list,
2794 int maxcount)
2795{
2796 register int i;
2797 register int j;
2798 int len = self->length;
2799 PyObject *str;
2800
2801 for (i = j = 0; i < len; ) {
2802 /* find a token */
2803 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2804 i++;
2805 j = i;
2806 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2807 i++;
2808 if (j < i) {
2809 if (maxcount-- <= 0)
2810 break;
2811 SPLIT_APPEND(self->str, j, i);
2812 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2813 i++;
2814 j = i;
2815 }
2816 }
2817 if (j < len) {
2818 SPLIT_APPEND(self->str, j, len);
2819 }
2820 return list;
2821
2822 onError:
2823 Py_DECREF(list);
2824 return NULL;
2825}
2826
2827PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002828 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829{
2830 register int i;
2831 register int j;
2832 int len;
2833 PyObject *list;
2834 PyObject *str;
2835 Py_UNICODE *data;
2836
2837 string = PyUnicode_FromObject(string);
2838 if (string == NULL)
2839 return NULL;
2840 data = PyUnicode_AS_UNICODE(string);
2841 len = PyUnicode_GET_SIZE(string);
2842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 list = PyList_New(0);
2844 if (!list)
2845 goto onError;
2846
2847 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002848 int eol;
2849
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 /* Find a line and append it */
2851 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2852 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853
2854 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002855 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 if (i < len) {
2857 if (data[i] == '\r' && i + 1 < len &&
2858 data[i+1] == '\n')
2859 i += 2;
2860 else
2861 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002862 if (keepends)
2863 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 }
Guido van Rossum86662912000-04-11 15:38:46 +00002865 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 j = i;
2867 }
2868 if (j < len) {
2869 SPLIT_APPEND(data, j, len);
2870 }
2871
2872 Py_DECREF(string);
2873 return list;
2874
2875 onError:
2876 Py_DECREF(list);
2877 Py_DECREF(string);
2878 return NULL;
2879}
2880
2881static
2882PyObject *split_char(PyUnicodeObject *self,
2883 PyObject *list,
2884 Py_UNICODE ch,
2885 int maxcount)
2886{
2887 register int i;
2888 register int j;
2889 int len = self->length;
2890 PyObject *str;
2891
2892 for (i = j = 0; i < len; ) {
2893 if (self->str[i] == ch) {
2894 if (maxcount-- <= 0)
2895 break;
2896 SPLIT_APPEND(self->str, j, i);
2897 i = j = i + 1;
2898 } else
2899 i++;
2900 }
2901 if (j <= len) {
2902 SPLIT_APPEND(self->str, j, len);
2903 }
2904 return list;
2905
2906 onError:
2907 Py_DECREF(list);
2908 return NULL;
2909}
2910
2911static
2912PyObject *split_substring(PyUnicodeObject *self,
2913 PyObject *list,
2914 PyUnicodeObject *substring,
2915 int maxcount)
2916{
2917 register int i;
2918 register int j;
2919 int len = self->length;
2920 int sublen = substring->length;
2921 PyObject *str;
2922
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002923 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 if (Py_UNICODE_MATCH(self, i, substring)) {
2925 if (maxcount-- <= 0)
2926 break;
2927 SPLIT_APPEND(self->str, j, i);
2928 i = j = i + sublen;
2929 } else
2930 i++;
2931 }
2932 if (j <= len) {
2933 SPLIT_APPEND(self->str, j, len);
2934 }
2935 return list;
2936
2937 onError:
2938 Py_DECREF(list);
2939 return NULL;
2940}
2941
2942#undef SPLIT_APPEND
2943
2944static
2945PyObject *split(PyUnicodeObject *self,
2946 PyUnicodeObject *substring,
2947 int maxcount)
2948{
2949 PyObject *list;
2950
2951 if (maxcount < 0)
2952 maxcount = INT_MAX;
2953
2954 list = PyList_New(0);
2955 if (!list)
2956 return NULL;
2957
2958 if (substring == NULL)
2959 return split_whitespace(self,list,maxcount);
2960
2961 else if (substring->length == 1)
2962 return split_char(self,list,substring->str[0],maxcount);
2963
2964 else if (substring->length == 0) {
2965 Py_DECREF(list);
2966 PyErr_SetString(PyExc_ValueError, "empty separator");
2967 return NULL;
2968 }
2969 else
2970 return split_substring(self,list,substring,maxcount);
2971}
2972
2973static
2974PyObject *strip(PyUnicodeObject *self,
2975 int left,
2976 int right)
2977{
2978 Py_UNICODE *p = self->str;
2979 int start = 0;
2980 int end = self->length;
2981
2982 if (left)
2983 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2984 start++;
2985
2986 if (right)
2987 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2988 end--;
2989
2990 if (start == 0 && end == self->length) {
2991 /* couldn't strip anything off, return original string */
2992 Py_INCREF(self);
2993 return (PyObject*) self;
2994 }
2995
2996 return (PyObject*) PyUnicode_FromUnicode(
2997 self->str + start,
2998 end - start
2999 );
3000}
3001
3002static
3003PyObject *replace(PyUnicodeObject *self,
3004 PyUnicodeObject *str1,
3005 PyUnicodeObject *str2,
3006 int maxcount)
3007{
3008 PyUnicodeObject *u;
3009
3010 if (maxcount < 0)
3011 maxcount = INT_MAX;
3012
3013 if (str1->length == 1 && str2->length == 1) {
3014 int i;
3015
3016 /* replace characters */
3017 if (!findchar(self->str, self->length, str1->str[0])) {
3018 /* nothing to replace, return original string */
3019 Py_INCREF(self);
3020 u = self;
3021 } else {
3022 Py_UNICODE u1 = str1->str[0];
3023 Py_UNICODE u2 = str2->str[0];
3024
3025 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3026 self->str,
3027 self->length
3028 );
3029 if (u)
3030 for (i = 0; i < u->length; i++)
3031 if (u->str[i] == u1) {
3032 if (--maxcount < 0)
3033 break;
3034 u->str[i] = u2;
3035 }
3036 }
3037
3038 } else {
3039 int n, i;
3040 Py_UNICODE *p;
3041
3042 /* replace strings */
3043 n = count(self, 0, self->length, str1);
3044 if (n > maxcount)
3045 n = maxcount;
3046 if (n == 0) {
3047 /* nothing to replace, return original string */
3048 Py_INCREF(self);
3049 u = self;
3050 } else {
3051 u = _PyUnicode_New(
3052 self->length + n * (str2->length - str1->length));
3053 if (u) {
3054 i = 0;
3055 p = u->str;
3056 while (i <= self->length - str1->length)
3057 if (Py_UNICODE_MATCH(self, i, str1)) {
3058 /* replace string segment */
3059 Py_UNICODE_COPY(p, str2->str, str2->length);
3060 p += str2->length;
3061 i += str1->length;
3062 if (--n <= 0) {
3063 /* copy remaining part */
3064 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3065 break;
3066 }
3067 } else
3068 *p++ = self->str[i++];
3069 }
3070 }
3071 }
3072
3073 return (PyObject *) u;
3074}
3075
3076/* --- Unicode Object Methods --------------------------------------------- */
3077
3078static char title__doc__[] =
3079"S.title() -> unicode\n\
3080\n\
3081Return a titlecased version of S, i.e. words start with title case\n\
3082characters, all remaining cased characters have lower case.";
3083
3084static PyObject*
3085unicode_title(PyUnicodeObject *self, PyObject *args)
3086{
3087 if (!PyArg_NoArgs(args))
3088 return NULL;
3089 return fixup(self, fixtitle);
3090}
3091
3092static char capitalize__doc__[] =
3093"S.capitalize() -> unicode\n\
3094\n\
3095Return a capitalized version of S, i.e. make the first character\n\
3096have upper case.";
3097
3098static PyObject*
3099unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3100{
3101 if (!PyArg_NoArgs(args))
3102 return NULL;
3103 return fixup(self, fixcapitalize);
3104}
3105
#if 0
static char capwords__doc__[] =
    "S.capwords() -> unicode\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method implementation of S.capwords() (currently compiled out): split
   at whitespace, capitalize each word via fixup()/fixcapitalize(), and
   join the words again with the default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                                fixcapitalize);

        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Replace the list slot, releasing the word it held before. */
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3146
3147static char center__doc__[] =
3148"S.center(width) -> unicode\n\
3149\n\
3150Return S centered in a Unicode string of length width. Padding is done\n\
3151using spaces.";
3152
3153static PyObject *
3154unicode_center(PyUnicodeObject *self, PyObject *args)
3155{
3156 int marg, left;
3157 int width;
3158
3159 if (!PyArg_ParseTuple(args, "i:center", &width))
3160 return NULL;
3161
3162 if (self->length >= width) {
3163 Py_INCREF(self);
3164 return (PyObject*) self;
3165 }
3166
3167 marg = width - self->length;
3168 left = marg / 2 + (marg & width & 1);
3169
3170 return (PyObject*) pad(self, left, marg - left, ' ');
3171}
3172
Marc-André Lemburge5034372000-08-08 08:04:29 +00003173#if 0
3174
3175/* This code should go into some future Unicode collation support
3176 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003177 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003178
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003179/* speedy UTF-16 code point order comparison */
3180/* gleaned from: */
3181/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3182
/* Adjustment added to a UTF-16 code unit, indexed by its top five bits
   (unit >> 11), by the code point order comparison below.  Only the last
   five entries (indices 27..31) are non-zero. */
static short utf16Fixup[32] =
{
    /*  0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8..15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16..23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24..31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
3190
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191static int
3192unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3193{
3194 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003195
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 Py_UNICODE *s1 = str1->str;
3197 Py_UNICODE *s2 = str2->str;
3198
3199 len1 = str1->length;
3200 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003201
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003203 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003204 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003205
3206 c1 = *s1++;
3207 c2 = *s2++;
3208 if (c1 > (1<<11) * 26)
3209 c1 += utf16Fixup[c1>>11];
3210 if (c2 > (1<<11) * 26)
3211 c2 += utf16Fixup[c2>>11];
3212
3213 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003214 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003215 if (diff)
3216 return (diff < 0) ? -1 : (diff != 0);
3217 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 }
3219
3220 return (len1 < len2) ? -1 : (len1 != len2);
3221}
3222
Marc-André Lemburge5034372000-08-08 08:04:29 +00003223#else
3224
3225static int
3226unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3227{
3228 register int len1, len2;
3229
3230 Py_UNICODE *s1 = str1->str;
3231 Py_UNICODE *s2 = str2->str;
3232
3233 len1 = str1->length;
3234 len2 = str2->length;
3235
3236 while (len1 > 0 && len2 > 0) {
3237 register long diff;
3238
3239 diff = (long)*s1++ - (long)*s2++;
3240 if (diff)
3241 return (diff < 0) ? -1 : (diff != 0);
3242 len1--; len2--;
3243 }
3244
3245 return (len1 < len2) ? -1 : (len1 != len2);
3246}
3247
3248#endif
3249
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250int PyUnicode_Compare(PyObject *left,
3251 PyObject *right)
3252{
3253 PyUnicodeObject *u = NULL, *v = NULL;
3254 int result;
3255
3256 /* Coerce the two arguments */
3257 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3258 if (u == NULL)
3259 goto onError;
3260 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3261 if (v == NULL)
3262 goto onError;
3263
Thomas Wouters7e474022000-07-16 12:04:32 +00003264 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 if (v == u) {
3266 Py_DECREF(u);
3267 Py_DECREF(v);
3268 return 0;
3269 }
3270
3271 result = unicode_compare(u, v);
3272
3273 Py_DECREF(u);
3274 Py_DECREF(v);
3275 return result;
3276
3277onError:
3278 Py_XDECREF(u);
3279 Py_XDECREF(v);
3280 return -1;
3281}
3282
Guido van Rossum403d68b2000-03-13 15:55:09 +00003283int PyUnicode_Contains(PyObject *container,
3284 PyObject *element)
3285{
3286 PyUnicodeObject *u = NULL, *v = NULL;
3287 int result;
3288 register const Py_UNICODE *p, *e;
3289 register Py_UNICODE ch;
3290
3291 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003292 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003293 if (v == NULL) {
3294 PyErr_SetString(PyExc_TypeError,
3295 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003296 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003297 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003298 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3299 if (u == NULL) {
3300 Py_DECREF(v);
3301 goto onError;
3302 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003303
3304 /* Check v in u */
3305 if (PyUnicode_GET_SIZE(v) != 1) {
3306 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003307 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003308 goto onError;
3309 }
3310 ch = *PyUnicode_AS_UNICODE(v);
3311 p = PyUnicode_AS_UNICODE(u);
3312 e = p + PyUnicode_GET_SIZE(u);
3313 result = 0;
3314 while (p < e) {
3315 if (*p++ == ch) {
3316 result = 1;
3317 break;
3318 }
3319 }
3320
3321 Py_DECREF(u);
3322 Py_DECREF(v);
3323 return result;
3324
3325onError:
3326 Py_XDECREF(u);
3327 Py_XDECREF(v);
3328 return -1;
3329}
3330
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331/* Concat to string or Unicode object giving a new Unicode object. */
3332
3333PyObject *PyUnicode_Concat(PyObject *left,
3334 PyObject *right)
3335{
3336 PyUnicodeObject *u = NULL, *v = NULL, *w;
3337
3338 /* Coerce the two arguments */
3339 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3340 if (u == NULL)
3341 goto onError;
3342 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3343 if (v == NULL)
3344 goto onError;
3345
3346 /* Shortcuts */
3347 if (v == unicode_empty) {
3348 Py_DECREF(v);
3349 return (PyObject *)u;
3350 }
3351 if (u == unicode_empty) {
3352 Py_DECREF(u);
3353 return (PyObject *)v;
3354 }
3355
3356 /* Concat the two Unicode strings */
3357 w = _PyUnicode_New(u->length + v->length);
3358 if (w == NULL)
3359 goto onError;
3360 Py_UNICODE_COPY(w->str, u->str, u->length);
3361 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3362
3363 Py_DECREF(u);
3364 Py_DECREF(v);
3365 return (PyObject *)w;
3366
3367onError:
3368 Py_XDECREF(u);
3369 Py_XDECREF(v);
3370 return NULL;
3371}
3372
3373static char count__doc__[] =
3374"S.count(sub[, start[, end]]) -> int\n\
3375\n\
3376Return the number of occurrences of substring sub in Unicode string\n\
3377S[start:end]. Optional arguments start and end are\n\
3378interpreted as in slice notation.";
3379
3380static PyObject *
3381unicode_count(PyUnicodeObject *self, PyObject *args)
3382{
3383 PyUnicodeObject *substring;
3384 int start = 0;
3385 int end = INT_MAX;
3386 PyObject *result;
3387
Guido van Rossumb8872e62000-05-09 14:14:27 +00003388 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3389 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 return NULL;
3391
3392 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3393 (PyObject *)substring);
3394 if (substring == NULL)
3395 return NULL;
3396
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 if (start < 0)
3398 start += self->length;
3399 if (start < 0)
3400 start = 0;
3401 if (end > self->length)
3402 end = self->length;
3403 if (end < 0)
3404 end += self->length;
3405 if (end < 0)
3406 end = 0;
3407
3408 result = PyInt_FromLong((long) count(self, start, end, substring));
3409
3410 Py_DECREF(substring);
3411 return result;
3412}
3413
3414static char encode__doc__[] =
3415"S.encode([encoding[,errors]]) -> string\n\
3416\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003417Return an encoded string version of S. Default encoding is the current\n\
3418default string encoding. errors may be given to set a different error\n\
3419handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3420a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421
3422static PyObject *
3423unicode_encode(PyUnicodeObject *self, PyObject *args)
3424{
3425 char *encoding = NULL;
3426 char *errors = NULL;
3427 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3428 return NULL;
3429 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3430}
3431
3432static char expandtabs__doc__[] =
3433"S.expandtabs([tabsize]) -> unicode\n\
3434\n\
3435Return a copy of S where all tab characters are expanded using spaces.\n\
3436If tabsize is not given, a tab size of 8 characters is assumed.";
3437
3438static PyObject*
3439unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3440{
3441 Py_UNICODE *e;
3442 Py_UNICODE *p;
3443 Py_UNICODE *q;
3444 int i, j;
3445 PyUnicodeObject *u;
3446 int tabsize = 8;
3447
3448 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3449 return NULL;
3450
Thomas Wouters7e474022000-07-16 12:04:32 +00003451 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452 i = j = 0;
3453 e = self->str + self->length;
3454 for (p = self->str; p < e; p++)
3455 if (*p == '\t') {
3456 if (tabsize > 0)
3457 j += tabsize - (j % tabsize);
3458 }
3459 else {
3460 j++;
3461 if (*p == '\n' || *p == '\r') {
3462 i += j;
3463 j = 0;
3464 }
3465 }
3466
3467 /* Second pass: create output string and fill it */
3468 u = _PyUnicode_New(i + j);
3469 if (!u)
3470 return NULL;
3471
3472 j = 0;
3473 q = u->str;
3474
3475 for (p = self->str; p < e; p++)
3476 if (*p == '\t') {
3477 if (tabsize > 0) {
3478 i = tabsize - (j % tabsize);
3479 j += i;
3480 while (i--)
3481 *q++ = ' ';
3482 }
3483 }
3484 else {
3485 j++;
3486 *q++ = *p;
3487 if (*p == '\n' || *p == '\r')
3488 j = 0;
3489 }
3490
3491 return (PyObject*) u;
3492}
3493
3494static char find__doc__[] =
3495"S.find(sub [,start [,end]]) -> int\n\
3496\n\
3497Return the lowest index in S where substring sub is found,\n\
3498such that sub is contained within s[start,end]. Optional\n\
3499arguments start and end are interpreted as in slice notation.\n\
3500\n\
3501Return -1 on failure.";
3502
3503static PyObject *
3504unicode_find(PyUnicodeObject *self, PyObject *args)
3505{
3506 PyUnicodeObject *substring;
3507 int start = 0;
3508 int end = INT_MAX;
3509 PyObject *result;
3510
Guido van Rossumb8872e62000-05-09 14:14:27 +00003511 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3512 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 return NULL;
3514 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3515 (PyObject *)substring);
3516 if (substring == NULL)
3517 return NULL;
3518
3519 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3520
3521 Py_DECREF(substring);
3522 return result;
3523}
3524
3525static PyObject *
3526unicode_getitem(PyUnicodeObject *self, int index)
3527{
3528 if (index < 0 || index >= self->length) {
3529 PyErr_SetString(PyExc_IndexError, "string index out of range");
3530 return NULL;
3531 }
3532
3533 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3534}
3535
3536static long
3537unicode_hash(PyUnicodeObject *self)
3538{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003539 /* Since Unicode objects compare equal to their ASCII string
3540 counterparts, they should use the individual character values
3541 as basis for their hash value. This is needed to assure that
3542 strings and Unicode objects behave in the same way as
3543 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544
Fredrik Lundhdde61642000-07-10 18:27:47 +00003545 register int len;
3546 register Py_UNICODE *p;
3547 register long x;
3548
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549 if (self->hash != -1)
3550 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003551 len = PyUnicode_GET_SIZE(self);
3552 p = PyUnicode_AS_UNICODE(self);
3553 x = *p << 7;
3554 while (--len >= 0)
3555 x = (1000003*x) ^ *p++;
3556 x ^= PyUnicode_GET_SIZE(self);
3557 if (x == -1)
3558 x = -2;
3559 self->hash = x;
3560 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561}
3562
3563static char index__doc__[] =
3564"S.index(sub [,start [,end]]) -> int\n\
3565\n\
3566Like S.find() but raise ValueError when the substring is not found.";
3567
3568static PyObject *
3569unicode_index(PyUnicodeObject *self, PyObject *args)
3570{
3571 int result;
3572 PyUnicodeObject *substring;
3573 int start = 0;
3574 int end = INT_MAX;
3575
Guido van Rossumb8872e62000-05-09 14:14:27 +00003576 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3577 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 return NULL;
3579
3580 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3581 (PyObject *)substring);
3582 if (substring == NULL)
3583 return NULL;
3584
3585 result = findstring(self, substring, start, end, 1);
3586
3587 Py_DECREF(substring);
3588 if (result < 0) {
3589 PyErr_SetString(PyExc_ValueError, "substring not found");
3590 return NULL;
3591 }
3592 return PyInt_FromLong(result);
3593}
3594
3595static char islower__doc__[] =
3596"S.islower() -> int\n\
3597\n\
3598Return 1 if all cased characters in S are lowercase and there is\n\
3599at least one cased character in S, 0 otherwise.";
3600
3601static PyObject*
3602unicode_islower(PyUnicodeObject *self, PyObject *args)
3603{
3604 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3605 register const Py_UNICODE *e;
3606 int cased;
3607
3608 if (!PyArg_NoArgs(args))
3609 return NULL;
3610
3611 /* Shortcut for single character strings */
3612 if (PyUnicode_GET_SIZE(self) == 1)
3613 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3614
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003615 /* Special case for empty strings */
3616 if (PyString_GET_SIZE(self) == 0)
3617 return PyInt_FromLong(0);
3618
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 e = p + PyUnicode_GET_SIZE(self);
3620 cased = 0;
3621 for (; p < e; p++) {
3622 register const Py_UNICODE ch = *p;
3623
3624 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3625 return PyInt_FromLong(0);
3626 else if (!cased && Py_UNICODE_ISLOWER(ch))
3627 cased = 1;
3628 }
3629 return PyInt_FromLong(cased);
3630}
3631
3632static char isupper__doc__[] =
3633"S.isupper() -> int\n\
3634\n\
3635Return 1 if all cased characters in S are uppercase and there is\n\
3636at least one cased character in S, 0 otherwise.";
3637
3638static PyObject*
3639unicode_isupper(PyUnicodeObject *self, PyObject *args)
3640{
3641 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3642 register const Py_UNICODE *e;
3643 int cased;
3644
3645 if (!PyArg_NoArgs(args))
3646 return NULL;
3647
3648 /* Shortcut for single character strings */
3649 if (PyUnicode_GET_SIZE(self) == 1)
3650 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3651
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003652 /* Special case for empty strings */
3653 if (PyString_GET_SIZE(self) == 0)
3654 return PyInt_FromLong(0);
3655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 e = p + PyUnicode_GET_SIZE(self);
3657 cased = 0;
3658 for (; p < e; p++) {
3659 register const Py_UNICODE ch = *p;
3660
3661 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3662 return PyInt_FromLong(0);
3663 else if (!cased && Py_UNICODE_ISUPPER(ch))
3664 cased = 1;
3665 }
3666 return PyInt_FromLong(cased);
3667}
3668
3669static char istitle__doc__[] =
3670"S.istitle() -> int\n\
3671\n\
3672Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3673may only follow uncased characters and lowercase characters only cased\n\
3674ones. Return 0 otherwise.";
3675
3676static PyObject*
3677unicode_istitle(PyUnicodeObject *self, PyObject *args)
3678{
3679 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3680 register const Py_UNICODE *e;
3681 int cased, previous_is_cased;
3682
3683 if (!PyArg_NoArgs(args))
3684 return NULL;
3685
3686 /* Shortcut for single character strings */
3687 if (PyUnicode_GET_SIZE(self) == 1)
3688 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3689 (Py_UNICODE_ISUPPER(*p) != 0));
3690
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003691 /* Special case for empty strings */
3692 if (PyString_GET_SIZE(self) == 0)
3693 return PyInt_FromLong(0);
3694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 e = p + PyUnicode_GET_SIZE(self);
3696 cased = 0;
3697 previous_is_cased = 0;
3698 for (; p < e; p++) {
3699 register const Py_UNICODE ch = *p;
3700
3701 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3702 if (previous_is_cased)
3703 return PyInt_FromLong(0);
3704 previous_is_cased = 1;
3705 cased = 1;
3706 }
3707 else if (Py_UNICODE_ISLOWER(ch)) {
3708 if (!previous_is_cased)
3709 return PyInt_FromLong(0);
3710 previous_is_cased = 1;
3711 cased = 1;
3712 }
3713 else
3714 previous_is_cased = 0;
3715 }
3716 return PyInt_FromLong(cased);
3717}
3718
3719static char isspace__doc__[] =
3720"S.isspace() -> int\n\
3721\n\
3722Return 1 if there are only whitespace characters in S,\n\
37230 otherwise.";
3724
3725static PyObject*
3726unicode_isspace(PyUnicodeObject *self, PyObject *args)
3727{
3728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3729 register const Py_UNICODE *e;
3730
3731 if (!PyArg_NoArgs(args))
3732 return NULL;
3733
3734 /* Shortcut for single character strings */
3735 if (PyUnicode_GET_SIZE(self) == 1 &&
3736 Py_UNICODE_ISSPACE(*p))
3737 return PyInt_FromLong(1);
3738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003739 /* Special case for empty strings */
3740 if (PyString_GET_SIZE(self) == 0)
3741 return PyInt_FromLong(0);
3742
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 e = p + PyUnicode_GET_SIZE(self);
3744 for (; p < e; p++) {
3745 if (!Py_UNICODE_ISSPACE(*p))
3746 return PyInt_FromLong(0);
3747 }
3748 return PyInt_FromLong(1);
3749}
3750
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003751static char isalpha__doc__[] =
3752"S.isalpha() -> int\n\
3753\n\
3754Return 1 if all characters in S are alphabetic\n\
3755and there is at least one character in S, 0 otherwise.";
3756
3757static PyObject*
3758unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3759{
3760 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3761 register const Py_UNICODE *e;
3762
3763 if (!PyArg_NoArgs(args))
3764 return NULL;
3765
3766 /* Shortcut for single character strings */
3767 if (PyUnicode_GET_SIZE(self) == 1 &&
3768 Py_UNICODE_ISALPHA(*p))
3769 return PyInt_FromLong(1);
3770
3771 /* Special case for empty strings */
3772 if (PyString_GET_SIZE(self) == 0)
3773 return PyInt_FromLong(0);
3774
3775 e = p + PyUnicode_GET_SIZE(self);
3776 for (; p < e; p++) {
3777 if (!Py_UNICODE_ISALPHA(*p))
3778 return PyInt_FromLong(0);
3779 }
3780 return PyInt_FromLong(1);
3781}
3782
3783static char isalnum__doc__[] =
3784"S.isalnum() -> int\n\
3785\n\
3786Return 1 if all characters in S are alphanumeric\n\
3787and there is at least one character in S, 0 otherwise.";
3788
3789static PyObject*
3790unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3791{
3792 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3793 register const Py_UNICODE *e;
3794
3795 if (!PyArg_NoArgs(args))
3796 return NULL;
3797
3798 /* Shortcut for single character strings */
3799 if (PyUnicode_GET_SIZE(self) == 1 &&
3800 Py_UNICODE_ISALNUM(*p))
3801 return PyInt_FromLong(1);
3802
3803 /* Special case for empty strings */
3804 if (PyString_GET_SIZE(self) == 0)
3805 return PyInt_FromLong(0);
3806
3807 e = p + PyUnicode_GET_SIZE(self);
3808 for (; p < e; p++) {
3809 if (!Py_UNICODE_ISALNUM(*p))
3810 return PyInt_FromLong(0);
3811 }
3812 return PyInt_FromLong(1);
3813}
3814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815static char isdecimal__doc__[] =
3816"S.isdecimal() -> int\n\
3817\n\
3818Return 1 if there are only decimal characters in S,\n\
38190 otherwise.";
3820
3821static PyObject*
3822unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3823{
3824 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3825 register const Py_UNICODE *e;
3826
3827 if (!PyArg_NoArgs(args))
3828 return NULL;
3829
3830 /* Shortcut for single character strings */
3831 if (PyUnicode_GET_SIZE(self) == 1 &&
3832 Py_UNICODE_ISDECIMAL(*p))
3833 return PyInt_FromLong(1);
3834
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003835 /* Special case for empty strings */
3836 if (PyString_GET_SIZE(self) == 0)
3837 return PyInt_FromLong(0);
3838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 e = p + PyUnicode_GET_SIZE(self);
3840 for (; p < e; p++) {
3841 if (!Py_UNICODE_ISDECIMAL(*p))
3842 return PyInt_FromLong(0);
3843 }
3844 return PyInt_FromLong(1);
3845}
3846
3847static char isdigit__doc__[] =
3848"S.isdigit() -> int\n\
3849\n\
3850Return 1 if there are only digit characters in S,\n\
38510 otherwise.";
3852
3853static PyObject*
3854unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3855{
3856 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3857 register const Py_UNICODE *e;
3858
3859 if (!PyArg_NoArgs(args))
3860 return NULL;
3861
3862 /* Shortcut for single character strings */
3863 if (PyUnicode_GET_SIZE(self) == 1 &&
3864 Py_UNICODE_ISDIGIT(*p))
3865 return PyInt_FromLong(1);
3866
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003867 /* Special case for empty strings */
3868 if (PyString_GET_SIZE(self) == 0)
3869 return PyInt_FromLong(0);
3870
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 e = p + PyUnicode_GET_SIZE(self);
3872 for (; p < e; p++) {
3873 if (!Py_UNICODE_ISDIGIT(*p))
3874 return PyInt_FromLong(0);
3875 }
3876 return PyInt_FromLong(1);
3877}
3878
3879static char isnumeric__doc__[] =
3880"S.isnumeric() -> int\n\
3881\n\
3882Return 1 if there are only numeric characters in S,\n\
38830 otherwise.";
3884
3885static PyObject*
3886unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3887{
3888 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3889 register const Py_UNICODE *e;
3890
3891 if (!PyArg_NoArgs(args))
3892 return NULL;
3893
3894 /* Shortcut for single character strings */
3895 if (PyUnicode_GET_SIZE(self) == 1 &&
3896 Py_UNICODE_ISNUMERIC(*p))
3897 return PyInt_FromLong(1);
3898
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003899 /* Special case for empty strings */
3900 if (PyString_GET_SIZE(self) == 0)
3901 return PyInt_FromLong(0);
3902
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 e = p + PyUnicode_GET_SIZE(self);
3904 for (; p < e; p++) {
3905 if (!Py_UNICODE_ISNUMERIC(*p))
3906 return PyInt_FromLong(0);
3907 }
3908 return PyInt_FromLong(1);
3909}
3910
3911static char join__doc__[] =
3912"S.join(sequence) -> unicode\n\
3913\n\
3914Return a string which is the concatenation of the strings in the\n\
3915sequence. The separator between elements is S.";
3916
3917static PyObject*
3918unicode_join(PyUnicodeObject *self, PyObject *args)
3919{
3920 PyObject *data;
3921 if (!PyArg_ParseTuple(args, "O:join", &data))
3922 return NULL;
3923
3924 return PyUnicode_Join((PyObject *)self, data);
3925}
3926
3927static int
3928unicode_length(PyUnicodeObject *self)
3929{
3930 return self->length;
3931}
3932
3933static char ljust__doc__[] =
3934"S.ljust(width) -> unicode\n\
3935\n\
3936Return S left justified in a Unicode string of length width. Padding is\n\
3937done using spaces.";
3938
3939static PyObject *
3940unicode_ljust(PyUnicodeObject *self, PyObject *args)
3941{
3942 int width;
3943 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3944 return NULL;
3945
3946 if (self->length >= width) {
3947 Py_INCREF(self);
3948 return (PyObject*) self;
3949 }
3950
3951 return (PyObject*) pad(self, 0, width - self->length, ' ');
3952}
3953
3954static char lower__doc__[] =
3955"S.lower() -> unicode\n\
3956\n\
3957Return a copy of the string S converted to lowercase.";
3958
3959static PyObject*
3960unicode_lower(PyUnicodeObject *self, PyObject *args)
3961{
3962 if (!PyArg_NoArgs(args))
3963 return NULL;
3964 return fixup(self, fixlower);
3965}
3966
3967static char lstrip__doc__[] =
3968"S.lstrip() -> unicode\n\
3969\n\
3970Return a copy of the string S with leading whitespace removed.";
3971
3972static PyObject *
3973unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3974{
3975 if (!PyArg_NoArgs(args))
3976 return NULL;
3977 return strip(self, 1, 0);
3978}
3979
3980static PyObject*
3981unicode_repeat(PyUnicodeObject *str, int len)
3982{
3983 PyUnicodeObject *u;
3984 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003985 int nchars;
3986 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987
3988 if (len < 0)
3989 len = 0;
3990
3991 if (len == 1) {
3992 /* no repeat, return original string */
3993 Py_INCREF(str);
3994 return (PyObject*) str;
3995 }
Tim Peters8f422462000-09-09 06:13:41 +00003996
3997 /* ensure # of chars needed doesn't overflow int and # of bytes
3998 * needed doesn't overflow size_t
3999 */
4000 nchars = len * str->length;
4001 if (len && nchars / len != str->length) {
4002 PyErr_SetString(PyExc_OverflowError,
4003 "repeated string is too long");
4004 return NULL;
4005 }
4006 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4007 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4008 PyErr_SetString(PyExc_OverflowError,
4009 "repeated string is too long");
4010 return NULL;
4011 }
4012 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013 if (!u)
4014 return NULL;
4015
4016 p = u->str;
4017
4018 while (len-- > 0) {
4019 Py_UNICODE_COPY(p, str->str, str->length);
4020 p += str->length;
4021 }
4022
4023 return (PyObject*) u;
4024}
4025
4026PyObject *PyUnicode_Replace(PyObject *obj,
4027 PyObject *subobj,
4028 PyObject *replobj,
4029 int maxcount)
4030{
4031 PyObject *self;
4032 PyObject *str1;
4033 PyObject *str2;
4034 PyObject *result;
4035
4036 self = PyUnicode_FromObject(obj);
4037 if (self == NULL)
4038 return NULL;
4039 str1 = PyUnicode_FromObject(subobj);
4040 if (str1 == NULL) {
4041 Py_DECREF(self);
4042 return NULL;
4043 }
4044 str2 = PyUnicode_FromObject(replobj);
4045 if (str2 == NULL) {
4046 Py_DECREF(self);
4047 Py_DECREF(str1);
4048 return NULL;
4049 }
4050 result = replace((PyUnicodeObject *)self,
4051 (PyUnicodeObject *)str1,
4052 (PyUnicodeObject *)str2,
4053 maxcount);
4054 Py_DECREF(self);
4055 Py_DECREF(str1);
4056 Py_DECREF(str2);
4057 return result;
4058}
4059
4060static char replace__doc__[] =
4061"S.replace (old, new[, maxsplit]) -> unicode\n\
4062\n\
4063Return a copy of S with all occurrences of substring\n\
4064old replaced by new. If the optional argument maxsplit is\n\
4065given, only the first maxsplit occurrences are replaced.";
4066
4067static PyObject*
4068unicode_replace(PyUnicodeObject *self, PyObject *args)
4069{
4070 PyUnicodeObject *str1;
4071 PyUnicodeObject *str2;
4072 int maxcount = -1;
4073 PyObject *result;
4074
4075 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4076 return NULL;
4077 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4078 if (str1 == NULL)
4079 return NULL;
4080 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4081 if (str2 == NULL)
4082 return NULL;
4083
4084 result = replace(self, str1, str2, maxcount);
4085
4086 Py_DECREF(str1);
4087 Py_DECREF(str2);
4088 return result;
4089}
4090
4091static
4092PyObject *unicode_repr(PyObject *unicode)
4093{
4094 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4095 PyUnicode_GET_SIZE(unicode),
4096 1);
4097}
4098
4099static char rfind__doc__[] =
4100"S.rfind(sub [,start [,end]]) -> int\n\
4101\n\
4102Return the highest index in S where substring sub is found,\n\
4103such that sub is contained within s[start,end]. Optional\n\
4104arguments start and end are interpreted as in slice notation.\n\
4105\n\
4106Return -1 on failure.";
4107
4108static PyObject *
4109unicode_rfind(PyUnicodeObject *self, PyObject *args)
4110{
4111 PyUnicodeObject *substring;
4112 int start = 0;
4113 int end = INT_MAX;
4114 PyObject *result;
4115
Guido van Rossumb8872e62000-05-09 14:14:27 +00004116 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4117 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 return NULL;
4119 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4120 (PyObject *)substring);
4121 if (substring == NULL)
4122 return NULL;
4123
4124 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4125
4126 Py_DECREF(substring);
4127 return result;
4128}
4129
4130static char rindex__doc__[] =
4131"S.rindex(sub [,start [,end]]) -> int\n\
4132\n\
4133Like S.rfind() but raise ValueError when the substring is not found.";
4134
4135static PyObject *
4136unicode_rindex(PyUnicodeObject *self, PyObject *args)
4137{
4138 int result;
4139 PyUnicodeObject *substring;
4140 int start = 0;
4141 int end = INT_MAX;
4142
Guido van Rossumb8872e62000-05-09 14:14:27 +00004143 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4144 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 return NULL;
4146 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4147 (PyObject *)substring);
4148 if (substring == NULL)
4149 return NULL;
4150
4151 result = findstring(self, substring, start, end, -1);
4152
4153 Py_DECREF(substring);
4154 if (result < 0) {
4155 PyErr_SetString(PyExc_ValueError, "substring not found");
4156 return NULL;
4157 }
4158 return PyInt_FromLong(result);
4159}
4160
4161static char rjust__doc__[] =
4162"S.rjust(width) -> unicode\n\
4163\n\
4164Return S right justified in a Unicode string of length width. Padding is\n\
4165done using spaces.";
4166
4167static PyObject *
4168unicode_rjust(PyUnicodeObject *self, PyObject *args)
4169{
4170 int width;
4171 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4172 return NULL;
4173
4174 if (self->length >= width) {
4175 Py_INCREF(self);
4176 return (PyObject*) self;
4177 }
4178
4179 return (PyObject*) pad(self, width - self->length, 0, ' ');
4180}
4181
4182static char rstrip__doc__[] =
4183"S.rstrip() -> unicode\n\
4184\n\
4185Return a copy of the string S with trailing whitespace removed.";
4186
4187static PyObject *
4188unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4189{
4190 if (!PyArg_NoArgs(args))
4191 return NULL;
4192 return strip(self, 0, 1);
4193}
4194
4195static PyObject*
4196unicode_slice(PyUnicodeObject *self, int start, int end)
4197{
4198 /* standard clamping */
4199 if (start < 0)
4200 start = 0;
4201 if (end < 0)
4202 end = 0;
4203 if (end > self->length)
4204 end = self->length;
4205 if (start == 0 && end == self->length) {
4206 /* full slice, return original string */
4207 Py_INCREF(self);
4208 return (PyObject*) self;
4209 }
4210 if (start > end)
4211 start = end;
4212 /* copy slice */
4213 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4214 end - start);
4215}
4216
4217PyObject *PyUnicode_Split(PyObject *s,
4218 PyObject *sep,
4219 int maxsplit)
4220{
4221 PyObject *result;
4222
4223 s = PyUnicode_FromObject(s);
4224 if (s == NULL)
4225 return NULL;
4226 if (sep != NULL) {
4227 sep = PyUnicode_FromObject(sep);
4228 if (sep == NULL) {
4229 Py_DECREF(s);
4230 return NULL;
4231 }
4232 }
4233
4234 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4235
4236 Py_DECREF(s);
4237 Py_XDECREF(sep);
4238 return result;
4239}
4240
4241static char split__doc__[] =
4242"S.split([sep [,maxsplit]]) -> list of strings\n\
4243\n\
4244Return a list of the words in S, using sep as the\n\
4245delimiter string. If maxsplit is given, at most maxsplit\n\
4246splits are done. If sep is not specified, any whitespace string\n\
4247is a separator.";
4248
4249static PyObject*
4250unicode_split(PyUnicodeObject *self, PyObject *args)
4251{
4252 PyObject *substring = Py_None;
4253 int maxcount = -1;
4254
4255 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4256 return NULL;
4257
4258 if (substring == Py_None)
4259 return split(self, NULL, maxcount);
4260 else if (PyUnicode_Check(substring))
4261 return split(self, (PyUnicodeObject *)substring, maxcount);
4262 else
4263 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4264}
4265
4266static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004267"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268\n\
4269Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004270Line breaks are not included in the resulting list unless keepends\n\
4271is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272
4273static PyObject*
4274unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4275{
Guido van Rossum86662912000-04-11 15:38:46 +00004276 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277
Guido van Rossum86662912000-04-11 15:38:46 +00004278 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 return NULL;
4280
Guido van Rossum86662912000-04-11 15:38:46 +00004281 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282}
4283
4284static
4285PyObject *unicode_str(PyUnicodeObject *self)
4286{
Fred Drakee4315f52000-05-09 19:53:39 +00004287 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288}
4289
4290static char strip__doc__[] =
4291"S.strip() -> unicode\n\
4292\n\
4293Return a copy of S with leading and trailing whitespace removed.";
4294
4295static PyObject *
4296unicode_strip(PyUnicodeObject *self, PyObject *args)
4297{
4298 if (!PyArg_NoArgs(args))
4299 return NULL;
4300 return strip(self, 1, 1);
4301}
4302
4303static char swapcase__doc__[] =
4304"S.swapcase() -> unicode\n\
4305\n\
4306Return a copy of S with uppercase characters converted to lowercase\n\
4307and vice versa.";
4308
4309static PyObject*
4310unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4311{
4312 if (!PyArg_NoArgs(args))
4313 return NULL;
4314 return fixup(self, fixswapcase);
4315}
4316
/* Docstring registered for the "translate" entry of unicode_methods. */
static char translate__doc__[] =
    "S.translate(table) -> unicode\n"
    "\n"
    "Return a copy of the string S, where all characters have been mapped\n"
    "through the given translation table, which must be a mapping of\n"
    "Unicode ordinals to Unicode ordinals or None. Unmapped characters\n"
    "are left untouched. Characters mapped to None are deleted.";
4324
4325static PyObject*
4326unicode_translate(PyUnicodeObject *self, PyObject *args)
4327{
4328 PyObject *table;
4329
4330 if (!PyArg_ParseTuple(args, "O:translate", &table))
4331 return NULL;
4332 return PyUnicode_TranslateCharmap(self->str,
4333 self->length,
4334 table,
4335 "ignore");
4336}
4337
/* Docstring registered for the "upper" entry of unicode_methods. */
static char upper__doc__[] =
    "S.upper() -> unicode\n"
    "\n"
    "Return a copy of S converted to uppercase.";
4342
4343static PyObject*
4344unicode_upper(PyUnicodeObject *self, PyObject *args)
4345{
4346 if (!PyArg_NoArgs(args))
4347 return NULL;
4348 return fixup(self, fixupper);
4349}
4350
#if 0
/* Disabled implementation of S.zfill(width); kept compiled out. */
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width): left-pads the string with '0' up to `width` characters.
   If the string is already at least `width` long, a new reference to self
   is returned.  A leading '+' or '-' of the original text is moved in front
   of the inserted zeros.  Returns NULL on argument or padding failure. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    int width;
    PyUnicodeObject *u;

    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() is defined outside this block; assuming it reports failure by
       returning NULL (TODO confirm).  Without this check the sign test
       below would dereference a NULL pointer. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4386
#if 0
/* Debugging aid (compiled out): returns the current value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4396
/* Docstring registered for the "startswith" entry of unicode_methods. */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S starts with the specified prefix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4403
4404static PyObject *
4405unicode_startswith(PyUnicodeObject *self,
4406 PyObject *args)
4407{
4408 PyUnicodeObject *substring;
4409 int start = 0;
4410 int end = INT_MAX;
4411 PyObject *result;
4412
Guido van Rossumb8872e62000-05-09 14:14:27 +00004413 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4414 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 return NULL;
4416 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4417 (PyObject *)substring);
4418 if (substring == NULL)
4419 return NULL;
4420
4421 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4422
4423 Py_DECREF(substring);
4424 return result;
4425}
4426
4427
/* Docstring registered for the "endswith" entry of unicode_methods. */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S ends with the specified suffix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4434
4435static PyObject *
4436unicode_endswith(PyUnicodeObject *self,
4437 PyObject *args)
4438{
4439 PyUnicodeObject *substring;
4440 int start = 0;
4441 int end = INT_MAX;
4442 PyObject *result;
4443
Guido van Rossumb8872e62000-05-09 14:14:27 +00004444 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4445 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 return NULL;
4447 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4448 (PyObject *)substring);
4449 if (substring == NULL)
4450 return NULL;
4451
4452 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4453
4454 Py_DECREF(substring);
4455 return result;
4456}
4457
4458
4459static PyMethodDef unicode_methods[] = {
4460
4461 /* Order is according to common usage: often used methods should
4462 appear first, since lookup is done sequentially. */
4463
4464 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4465 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4466 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4467 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4468 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4469 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4470 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4471 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4472 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4473 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4474 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4475 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4476 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4477 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4478/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4479 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4480 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4481 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4482 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4483 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4484 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4485 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4486 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4487 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4488 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4489 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4490 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4491 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4492 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4493 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4494 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4495 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4496 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004497 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4498 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499#if 0
4500 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4501 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4502#endif
4503
4504#if 0
4505 /* This one is just used for debugging the implementation. */
4506 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4507#endif
4508
4509 {NULL, NULL}
4510};
4511
4512static PyObject *
4513unicode_getattr(PyUnicodeObject *self, char *name)
4514{
4515 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4516}
4517
4518static PySequenceMethods unicode_as_sequence = {
4519 (inquiry) unicode_length, /* sq_length */
4520 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4521 (intargfunc) unicode_repeat, /* sq_repeat */
4522 (intargfunc) unicode_getitem, /* sq_item */
4523 (intintargfunc) unicode_slice, /* sq_slice */
4524 0, /* sq_ass_item */
4525 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004526 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527};
4528
4529static int
4530unicode_buffer_getreadbuf(PyUnicodeObject *self,
4531 int index,
4532 const void **ptr)
4533{
4534 if (index != 0) {
4535 PyErr_SetString(PyExc_SystemError,
4536 "accessing non-existent unicode segment");
4537 return -1;
4538 }
4539 *ptr = (void *) self->str;
4540 return PyUnicode_GET_DATA_SIZE(self);
4541}
4542
4543static int
4544unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4545 const void **ptr)
4546{
4547 PyErr_SetString(PyExc_TypeError,
4548 "cannot use unicode as modifyable buffer");
4549 return -1;
4550}
4551
4552static int
4553unicode_buffer_getsegcount(PyUnicodeObject *self,
4554 int *lenp)
4555{
4556 if (lenp)
4557 *lenp = PyUnicode_GET_DATA_SIZE(self);
4558 return 1;
4559}
4560
4561static int
4562unicode_buffer_getcharbuf(PyUnicodeObject *self,
4563 int index,
4564 const void **ptr)
4565{
4566 PyObject *str;
4567
4568 if (index != 0) {
4569 PyErr_SetString(PyExc_SystemError,
4570 "accessing non-existent unicode segment");
4571 return -1;
4572 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004573 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 if (str == NULL)
4575 return -1;
4576 *ptr = (void *) PyString_AS_STRING(str);
4577 return PyString_GET_SIZE(str);
4578}
4579
4580/* Helpers for PyUnicode_Format() */
4581
4582static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004583getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584{
4585 int argidx = *p_argidx;
4586 if (argidx < arglen) {
4587 (*p_argidx)++;
4588 if (arglen < 0)
4589 return args;
4590 else
4591 return PyTuple_GetItem(args, argidx);
4592 }
4593 PyErr_SetString(PyExc_TypeError,
4594 "not enough arguments for format string");
4595 return NULL;
4596}
4597
/* Bit flags collected by PyUnicode_Format() while reading a specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4603
4604static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606{
4607 register int i;
4608 int len;
4609 va_list va;
4610 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
4613 /* First, format the string as char array, then expand to Py_UNICODE
4614 array. */
4615 charbuffer = (char *)buffer;
4616 len = vsprintf(charbuffer, format, va);
4617 for (i = len - 1; i >= 0; i--)
4618 buffer[i] = (Py_UNICODE) charbuffer[i];
4619
4620 va_end(va);
4621 return len;
4622}
4623
4624static int
4625formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004626 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 int flags,
4628 int prec,
4629 int type,
4630 PyObject *v)
4631{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004632 /* fmt = '%#.' + `prec` + `type`
4633 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 char fmt[20];
4635 double x;
4636
4637 x = PyFloat_AsDouble(v);
4638 if (x == -1.0 && PyErr_Occurred())
4639 return -1;
4640 if (prec < 0)
4641 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4643 type = 'g';
4644 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004645 /* worst case length calc to ensure no buffer overrun:
4646 fmt = %#.<prec>g
4647 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4648 for any double rep.)
4649 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4650 If prec=0 the effective precision is 1 (the leading digit is
4651 always given), therefore increase by one to 10+prec. */
4652 if (buflen <= (size_t)10 + (size_t)prec) {
4653 PyErr_SetString(PyExc_OverflowError,
4654 "formatted float is too long (precision too long?)");
4655 return -1;
4656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 return usprintf(buf, fmt, x);
4658}
4659
Tim Peters38fd5b62000-09-21 05:43:11 +00004660static PyObject*
4661formatlong(PyObject *val, int flags, int prec, int type)
4662{
4663 char *buf;
4664 int i, len;
4665 PyObject *str; /* temporary string object. */
4666 PyUnicodeObject *result;
4667
4668 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4669 if (!str)
4670 return NULL;
4671 result = _PyUnicode_New(len);
4672 for (i = 0; i < len; i++)
4673 result->str[i] = buf[i];
4674 result->str[len] = 0;
4675 Py_DECREF(str);
4676 return (PyObject*)result;
4677}
4678
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679static int
4680formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004681 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 int flags,
4683 int prec,
4684 int type,
4685 PyObject *v)
4686{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004687 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004688 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4689 + 1 + 1 = 24*/
4690 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691 long x;
4692
4693 x = PyInt_AsLong(v);
4694 if (x == -1 && PyErr_Occurred())
4695 return -1;
4696 if (prec < 0)
4697 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004698 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4699 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4700 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4701 PyErr_SetString(PyExc_OverflowError,
4702 "formatted integer is too long (precision too long?)");
4703 return -1;
4704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4706 return usprintf(buf, fmt, x);
4707}
4708
4709static int
4710formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004711 size_t buflen,
4712 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004714 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004715 if (PyUnicode_Check(v)) {
4716 if (PyUnicode_GET_SIZE(v) != 1)
4717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004721 else if (PyString_Check(v)) {
4722 if (PyString_GET_SIZE(v) != 1)
4723 goto onError;
4724 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726
4727 else {
4728 /* Integer input truncated to a character */
4729 long x;
4730 x = PyInt_AsLong(v);
4731 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004732 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 buf[0] = (char) x;
4734 }
4735 buf[1] = '\0';
4736 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004737
4738 onError:
4739 PyErr_SetString(PyExc_TypeError,
4740 "%c requires int or char");
4741 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742}
4743
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
4753
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754PyObject *PyUnicode_Format(PyObject *format,
4755 PyObject *args)
4756{
4757 Py_UNICODE *fmt, *res;
4758 int fmtcnt, rescnt, reslen, arglen, argidx;
4759 int args_owned = 0;
4760 PyUnicodeObject *result = NULL;
4761 PyObject *dict = NULL;
4762 PyObject *uformat;
4763
4764 if (format == NULL || args == NULL) {
4765 PyErr_BadInternalCall();
4766 return NULL;
4767 }
4768 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004769 if (uformat == NULL)
4770 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 fmt = PyUnicode_AS_UNICODE(uformat);
4772 fmtcnt = PyUnicode_GET_SIZE(uformat);
4773
4774 reslen = rescnt = fmtcnt + 100;
4775 result = _PyUnicode_New(reslen);
4776 if (result == NULL)
4777 goto onError;
4778 res = PyUnicode_AS_UNICODE(result);
4779
4780 if (PyTuple_Check(args)) {
4781 arglen = PyTuple_Size(args);
4782 argidx = 0;
4783 }
4784 else {
4785 arglen = -1;
4786 argidx = -2;
4787 }
4788 if (args->ob_type->tp_as_mapping)
4789 dict = args;
4790
4791 while (--fmtcnt >= 0) {
4792 if (*fmt != '%') {
4793 if (--rescnt < 0) {
4794 rescnt = fmtcnt + 100;
4795 reslen += rescnt;
4796 if (_PyUnicode_Resize(result, reslen) < 0)
4797 return NULL;
4798 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4799 --rescnt;
4800 }
4801 *res++ = *fmt++;
4802 }
4803 else {
4804 /* Got a format specifier */
4805 int flags = 0;
4806 int width = -1;
4807 int prec = -1;
4808 int size = 0;
4809 Py_UNICODE c = '\0';
4810 Py_UNICODE fill;
4811 PyObject *v = NULL;
4812 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004813 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 Py_UNICODE sign;
4815 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004816 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817
4818 fmt++;
4819 if (*fmt == '(') {
4820 Py_UNICODE *keystart;
4821 int keylen;
4822 PyObject *key;
4823 int pcount = 1;
4824
4825 if (dict == NULL) {
4826 PyErr_SetString(PyExc_TypeError,
4827 "format requires a mapping");
4828 goto onError;
4829 }
4830 ++fmt;
4831 --fmtcnt;
4832 keystart = fmt;
4833 /* Skip over balanced parentheses */
4834 while (pcount > 0 && --fmtcnt >= 0) {
4835 if (*fmt == ')')
4836 --pcount;
4837 else if (*fmt == '(')
4838 ++pcount;
4839 fmt++;
4840 }
4841 keylen = fmt - keystart - 1;
4842 if (fmtcnt < 0 || pcount > 0) {
4843 PyErr_SetString(PyExc_ValueError,
4844 "incomplete format key");
4845 goto onError;
4846 }
Fred Drakee4315f52000-05-09 19:53:39 +00004847 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 then looked up since Python uses strings to hold
4849 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004850 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 key = PyUnicode_EncodeUTF8(keystart,
4852 keylen,
4853 NULL);
4854 if (key == NULL)
4855 goto onError;
4856 if (args_owned) {
4857 Py_DECREF(args);
4858 args_owned = 0;
4859 }
4860 args = PyObject_GetItem(dict, key);
4861 Py_DECREF(key);
4862 if (args == NULL) {
4863 goto onError;
4864 }
4865 args_owned = 1;
4866 arglen = -1;
4867 argidx = -2;
4868 }
4869 while (--fmtcnt >= 0) {
4870 switch (c = *fmt++) {
4871 case '-': flags |= F_LJUST; continue;
4872 case '+': flags |= F_SIGN; continue;
4873 case ' ': flags |= F_BLANK; continue;
4874 case '#': flags |= F_ALT; continue;
4875 case '0': flags |= F_ZERO; continue;
4876 }
4877 break;
4878 }
4879 if (c == '*') {
4880 v = getnextarg(args, arglen, &argidx);
4881 if (v == NULL)
4882 goto onError;
4883 if (!PyInt_Check(v)) {
4884 PyErr_SetString(PyExc_TypeError,
4885 "* wants int");
4886 goto onError;
4887 }
4888 width = PyInt_AsLong(v);
4889 if (width < 0) {
4890 flags |= F_LJUST;
4891 width = -width;
4892 }
4893 if (--fmtcnt >= 0)
4894 c = *fmt++;
4895 }
4896 else if (c >= '0' && c <= '9') {
4897 width = c - '0';
4898 while (--fmtcnt >= 0) {
4899 c = *fmt++;
4900 if (c < '0' || c > '9')
4901 break;
4902 if ((width*10) / 10 != width) {
4903 PyErr_SetString(PyExc_ValueError,
4904 "width too big");
4905 goto onError;
4906 }
4907 width = width*10 + (c - '0');
4908 }
4909 }
4910 if (c == '.') {
4911 prec = 0;
4912 if (--fmtcnt >= 0)
4913 c = *fmt++;
4914 if (c == '*') {
4915 v = getnextarg(args, arglen, &argidx);
4916 if (v == NULL)
4917 goto onError;
4918 if (!PyInt_Check(v)) {
4919 PyErr_SetString(PyExc_TypeError,
4920 "* wants int");
4921 goto onError;
4922 }
4923 prec = PyInt_AsLong(v);
4924 if (prec < 0)
4925 prec = 0;
4926 if (--fmtcnt >= 0)
4927 c = *fmt++;
4928 }
4929 else if (c >= '0' && c <= '9') {
4930 prec = c - '0';
4931 while (--fmtcnt >= 0) {
4932 c = Py_CHARMASK(*fmt++);
4933 if (c < '0' || c > '9')
4934 break;
4935 if ((prec*10) / 10 != prec) {
4936 PyErr_SetString(PyExc_ValueError,
4937 "prec too big");
4938 goto onError;
4939 }
4940 prec = prec*10 + (c - '0');
4941 }
4942 }
4943 } /* prec */
4944 if (fmtcnt >= 0) {
4945 if (c == 'h' || c == 'l' || c == 'L') {
4946 size = c;
4947 if (--fmtcnt >= 0)
4948 c = *fmt++;
4949 }
4950 }
4951 if (fmtcnt < 0) {
4952 PyErr_SetString(PyExc_ValueError,
4953 "incomplete format");
4954 goto onError;
4955 }
4956 if (c != '%') {
4957 v = getnextarg(args, arglen, &argidx);
4958 if (v == NULL)
4959 goto onError;
4960 }
4961 sign = 0;
4962 fill = ' ';
4963 switch (c) {
4964
4965 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004966 pbuf = formatbuf;
4967 /* presume that buffer length is at least 1 */
4968 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 len = 1;
4970 break;
4971
4972 case 's':
4973 case 'r':
4974 if (PyUnicode_Check(v) && c == 's') {
4975 temp = v;
4976 Py_INCREF(temp);
4977 }
4978 else {
4979 PyObject *unicode;
4980 if (c == 's')
4981 temp = PyObject_Str(v);
4982 else
4983 temp = PyObject_Repr(v);
4984 if (temp == NULL)
4985 goto onError;
4986 if (!PyString_Check(temp)) {
4987 /* XXX Note: this should never happen, since
4988 PyObject_Repr() and PyObject_Str() assure
4989 this */
4990 Py_DECREF(temp);
4991 PyErr_SetString(PyExc_TypeError,
4992 "%s argument has non-string str()");
4993 goto onError;
4994 }
Fred Drakee4315f52000-05-09 19:53:39 +00004995 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004997 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 "strict");
4999 Py_DECREF(temp);
5000 temp = unicode;
5001 if (temp == NULL)
5002 goto onError;
5003 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005004 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 len = PyUnicode_GET_SIZE(temp);
5006 if (prec >= 0 && len > prec)
5007 len = prec;
5008 break;
5009
5010 case 'i':
5011 case 'd':
5012 case 'u':
5013 case 'o':
5014 case 'x':
5015 case 'X':
5016 if (c == 'i')
5017 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005018 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005019 temp = formatlong(v, flags, prec, c);
5020 if (!temp)
5021 goto onError;
5022 pbuf = PyUnicode_AS_UNICODE(temp);
5023 len = PyUnicode_GET_SIZE(temp);
5024 /* unbounded ints can always produce
5025 a sign character! */
5026 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005028 else {
5029 pbuf = formatbuf;
5030 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5031 flags, prec, c, v);
5032 if (len < 0)
5033 goto onError;
5034 /* only d conversion is signed */
5035 sign = c == 'd';
5036 }
5037 if (flags & F_ZERO)
5038 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 break;
5040
5041 case 'e':
5042 case 'E':
5043 case 'f':
5044 case 'g':
5045 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005046 pbuf = formatbuf;
5047 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5048 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 if (len < 0)
5050 goto onError;
5051 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005052 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 fill = '0';
5054 break;
5055
5056 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005057 pbuf = formatbuf;
5058 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 if (len < 0)
5060 goto onError;
5061 break;
5062
5063 default:
5064 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005065 "unsupported format character '%c' (0x%x) "
5066 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005067 (31<=c && c<=126) ? c : '?',
5068 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069 goto onError;
5070 }
5071 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005072 if (*pbuf == '-' || *pbuf == '+') {
5073 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 len--;
5075 }
5076 else if (flags & F_SIGN)
5077 sign = '+';
5078 else if (flags & F_BLANK)
5079 sign = ' ';
5080 else
5081 sign = 0;
5082 }
5083 if (width < len)
5084 width = len;
5085 if (rescnt < width + (sign != 0)) {
5086 reslen -= rescnt;
5087 rescnt = width + fmtcnt + 100;
5088 reslen += rescnt;
5089 if (_PyUnicode_Resize(result, reslen) < 0)
5090 return NULL;
5091 res = PyUnicode_AS_UNICODE(result)
5092 + reslen - rescnt;
5093 }
5094 if (sign) {
5095 if (fill != ' ')
5096 *res++ = sign;
5097 rescnt--;
5098 if (width > len)
5099 width--;
5100 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005101 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5102 assert(pbuf[0] == '0');
5103 assert(pbuf[1] == c);
5104 if (fill != ' ') {
5105 *res++ = *pbuf++;
5106 *res++ = *pbuf++;
5107 }
5108 rescnt -= 2;
5109 width -= 2;
5110 if (width < 0)
5111 width = 0;
5112 len -= 2;
5113 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 if (width > len && !(flags & F_LJUST)) {
5115 do {
5116 --rescnt;
5117 *res++ = fill;
5118 } while (--width > len);
5119 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005120 if (fill == ' ') {
5121 if (sign)
5122 *res++ = sign;
5123 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5124 assert(pbuf[0] == '0');
5125 assert(pbuf[1] == c);
5126 *res++ = *pbuf++;
5127 *res++ = *pbuf++;
5128 }
5129 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005130 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 res += len;
5132 rescnt -= len;
5133 while (--width >= len) {
5134 --rescnt;
5135 *res++ = ' ';
5136 }
5137 if (dict && (argidx < arglen) && c != '%') {
5138 PyErr_SetString(PyExc_TypeError,
5139 "not all arguments converted");
5140 goto onError;
5141 }
5142 Py_XDECREF(temp);
5143 } /* '%' */
5144 } /* until end */
5145 if (argidx < arglen && !dict) {
5146 PyErr_SetString(PyExc_TypeError,
5147 "not all arguments converted");
5148 goto onError;
5149 }
5150
5151 if (args_owned) {
5152 Py_DECREF(args);
5153 }
5154 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005155 if (_PyUnicode_Resize(result, reslen - rescnt))
5156 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 return (PyObject *)result;
5158
5159 onError:
5160 Py_XDECREF(result);
5161 Py_DECREF(uformat);
5162 if (args_owned) {
5163 Py_DECREF(args);
5164 }
5165 return NULL;
5166}
5167
5168static PyBufferProcs unicode_as_buffer = {
5169 (getreadbufferproc) unicode_buffer_getreadbuf,
5170 (getwritebufferproc) unicode_buffer_getwritebuf,
5171 (getsegcountproc) unicode_buffer_getsegcount,
5172 (getcharbufferproc) unicode_buffer_getcharbuf,
5173};
5174
5175PyTypeObject PyUnicode_Type = {
5176 PyObject_HEAD_INIT(&PyType_Type)
5177 0, /* ob_size */
5178 "unicode", /* tp_name */
5179 sizeof(PyUnicodeObject), /* tp_size */
5180 0, /* tp_itemsize */
5181 /* Slots */
5182 (destructor)_PyUnicode_Free, /* tp_dealloc */
5183 0, /* tp_print */
5184 (getattrfunc)unicode_getattr, /* tp_getattr */
5185 0, /* tp_setattr */
5186 (cmpfunc) unicode_compare, /* tp_compare */
5187 (reprfunc) unicode_repr, /* tp_repr */
5188 0, /* tp_as_number */
5189 &unicode_as_sequence, /* tp_as_sequence */
5190 0, /* tp_as_mapping */
5191 (hashfunc) unicode_hash, /* tp_hash*/
5192 0, /* tp_call*/
5193 (reprfunc) unicode_str, /* tp_str */
5194 (getattrofunc) NULL, /* tp_getattro */
5195 (setattrofunc) NULL, /* tp_setattro */
5196 &unicode_as_buffer, /* tp_as_buffer */
5197 Py_TPFLAGS_DEFAULT, /* tp_flags */
5198};
5199
5200/* Initialize the Unicode implementation */
5201
Thomas Wouters78890102000-07-22 19:25:51 +00005202void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
5204 /* Doublecheck the configuration... */
5205 if (sizeof(Py_UNICODE) != 2)
5206 Py_FatalError("Unicode configuration error: "
5207 "sizeof(Py_UNICODE) != 2 bytes");
5208
Fred Drakee4315f52000-05-09 19:53:39 +00005209 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005210 unicode_freelist = NULL;
5211 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005213 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214}
5215
5216/* Finalize the Unicode implementation */
5217
5218void
Thomas Wouters78890102000-07-22 19:25:51 +00005219_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005221 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005223 Py_XDECREF(unicode_empty);
5224 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005225
5226 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 PyUnicodeObject *v = u;
5228 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005229 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005230 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005231 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005232 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005234 unicode_freelist = NULL;
5235 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236}