blob: 5ee72bd128df8d94638c2113debf50d2acac44e5 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000070#ifdef MS_WIN32
71#include <windows.h>
72#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   An object whose length is below this limit keeps its character
   buffer while it sits on the free list, so a later allocation can
   reuse the buffer instead of going through malloc() again.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 switches the optimization off.

   Note: this feature is experimental. If Unicode objects cause core
   dumps, disable it.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
104
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105/* --- Globals ------------------------------------------------------------
106
107 The globals are initialized by the _PyUnicode_Init() API and should
108 not be used before calling that API.
109
110*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
112/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000113static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000116static PyUnicodeObject *unicode_freelist;
117static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125*/
126
127static char unicode_default_encoding[100];
128
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129/* --- Unicode Object ----------------------------------------------------- */
130
131static
132int _PyUnicode_Resize(register PyUnicodeObject *unicode,
133 int length)
134{
135 void *oldstr;
136
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000137 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000138 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000139 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000140
141 /* Resizing unicode_empty is not allowed. */
142 if (unicode == unicode_empty) {
143 PyErr_SetString(PyExc_SystemError,
144 "can't resize empty unicode object");
145 return -1;
146 }
147
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
153 unicode->str = oldstr;
154 PyErr_NoMemory();
155 return -1;
156 }
157 unicode->str[length] = 0;
158 unicode->length = length;
159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
167
168 return 0;
169}
170
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000171int PyUnicode_Resize(PyObject **unicode,
172 int length)
173{
174 PyUnicodeObject *v;
175
176 if (unicode == NULL) {
177 PyErr_BadInternalCall();
178 return -1;
179 }
180 v = (PyUnicodeObject *)*unicode;
181 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
182 PyErr_BadInternalCall();
183 return -1;
184 }
185 return _PyUnicode_Resize(v, length);
186}
187
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188/* We allocate one more byte to make sure the string is
189 Ux0000 terminated -- XXX is this needed ?
190
191 XXX This allocator could further be enhanced by assuring that the
192 free list never reduces its size below 1.
193
194*/
195
196static
197PyUnicodeObject *_PyUnicode_New(int length)
198{
199 register PyUnicodeObject *unicode;
200
201 /* Optimization for empty strings */
202 if (length == 0 && unicode_empty != NULL) {
203 Py_INCREF(unicode_empty);
204 return unicode_empty;
205 }
206
207 /* Unicode freelist & memory allocation */
208 if (unicode_freelist) {
209 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000210 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 /* Keep-Alive optimization: we only upsize the buffer,
214 never downsize it. */
215 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000217 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 }
220 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000221 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000223 }
224 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 else {
227 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
228 if (unicode == NULL)
229 return NULL;
230 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
231 }
232
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000233 if (!unicode->str) {
234 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000235 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str[length] = 0;
238 unicode->length = length;
239 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000240 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242
243 onError:
244 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247}
248
249static
250void _PyUnicode_Free(register PyUnicodeObject *unicode)
251{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization */
254 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000255 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 unicode->str = NULL;
257 unicode->length = 0;
258 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000259 if (unicode->defenc) {
260 Py_DECREF(unicode->defenc);
261 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 }
263 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 *(PyUnicodeObject **)unicode = unicode_freelist;
265 unicode_freelist = unicode;
266 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000269 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000270 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000271 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273}
274
275PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
276 int size)
277{
278 PyUnicodeObject *unicode;
279
280 unicode = _PyUnicode_New(size);
281 if (!unicode)
282 return NULL;
283
284 /* Copy the Unicode data into the new object */
285 if (u != NULL)
286 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
287
288 return (PyObject *)unicode;
289}
290
291#ifdef HAVE_WCHAR_H
292
293PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
298 if (w == NULL) {
299 PyErr_BadInternalCall();
300 return NULL;
301 }
302
303 unicode = _PyUnicode_New(size);
304 if (!unicode)
305 return NULL;
306
307 /* Copy the wchar_t data into the new object */
308#ifdef HAVE_USABLE_WCHAR_T
309 memcpy(unicode->str, w, size * sizeof(wchar_t));
310#else
311 {
312 register Py_UNICODE *u;
313 register int i;
314 u = PyUnicode_AS_UNICODE(unicode);
315 for (i = size; i >= 0; i--)
316 *u++ = *w++;
317 }
318#endif
319
320 return (PyObject *)unicode;
321}
322
323int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
324 register wchar_t *w,
325 int size)
326{
327 if (unicode == NULL) {
328 PyErr_BadInternalCall();
329 return -1;
330 }
331 if (size > PyUnicode_GET_SIZE(unicode))
332 size = PyUnicode_GET_SIZE(unicode);
333#ifdef HAVE_USABLE_WCHAR_T
334 memcpy(w, unicode->str, size * sizeof(wchar_t));
335#else
336 {
337 register Py_UNICODE *u;
338 register int i;
339 u = PyUnicode_AS_UNICODE(unicode);
340 for (i = size; i >= 0; i--)
341 *w++ = *u++;
342 }
343#endif
344
345 return size;
346}
347
348#endif
349
350PyObject *PyUnicode_FromObject(register PyObject *obj)
351{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000352 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
353}
354
355PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
356 const char *encoding,
357 const char *errors)
358{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 const char *s;
360 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000361 int owned = 0;
362 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363
364 if (obj == NULL) {
365 PyErr_BadInternalCall();
366 return NULL;
367 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368
369 /* Coerce object */
370 if (PyInstance_Check(obj)) {
371 PyObject *func;
372 func = PyObject_GetAttrString(obj, "__str__");
373 if (func == NULL) {
374 PyErr_SetString(PyExc_TypeError,
375 "coercing to Unicode: instance doesn't define __str__");
376 return NULL;
377 }
378 obj = PyEval_CallObject(func, NULL);
379 Py_DECREF(func);
380 if (obj == NULL)
381 return NULL;
382 owned = 1;
383 }
384 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = obj;
387 if (encoding) {
388 PyErr_SetString(PyExc_TypeError,
389 "decoding Unicode is not supported");
390 return NULL;
391 }
392 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 }
394 else if (PyString_Check(obj)) {
395 s = PyString_AS_STRING(obj);
396 len = PyString_GET_SIZE(obj);
397 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000398 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
399 /* Overwrite the error message with something more useful in
400 case of a TypeError. */
401 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000402 PyErr_Format(PyExc_TypeError,
403 "coercing to Unicode: need string or buffer, "
404 "%.80s found",
405 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000406 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000407 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000408
409 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 if (len == 0) {
411 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414 else
415 v = PyUnicode_Decode(s, len, encoding, errors);
416 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000417 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000418 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000420 return v;
421
422 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000423 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000424 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427}
428
429PyObject *PyUnicode_Decode(const char *s,
430 int size,
431 const char *encoding,
432 const char *errors)
433{
434 PyObject *buffer = NULL, *unicode;
435
Fred Drakee4315f52000-05-09 19:53:39 +0000436 if (encoding == NULL)
437 encoding = PyUnicode_GetDefaultEncoding();
438
439 /* Shortcuts for common default encodings */
440 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000442 else if (strcmp(encoding, "latin-1") == 0)
443 return PyUnicode_DecodeLatin1(s, size, errors);
444 else if (strcmp(encoding, "ascii") == 0)
445 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446
447 /* Decode via the codec registry */
448 buffer = PyBuffer_FromMemory((void *)s, size);
449 if (buffer == NULL)
450 goto onError;
451 unicode = PyCodec_Decode(buffer, encoding, errors);
452 if (unicode == NULL)
453 goto onError;
454 if (!PyUnicode_Check(unicode)) {
455 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000456 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457 unicode->ob_type->tp_name);
458 Py_DECREF(unicode);
459 goto onError;
460 }
461 Py_DECREF(buffer);
462 return unicode;
463
464 onError:
465 Py_XDECREF(buffer);
466 return NULL;
467}
468
469PyObject *PyUnicode_Encode(const Py_UNICODE *s,
470 int size,
471 const char *encoding,
472 const char *errors)
473{
474 PyObject *v, *unicode;
475
476 unicode = PyUnicode_FromUnicode(s, size);
477 if (unicode == NULL)
478 return NULL;
479 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
480 Py_DECREF(unicode);
481 return v;
482}
483
484PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
485 const char *encoding,
486 const char *errors)
487{
488 PyObject *v;
489
490 if (!PyUnicode_Check(unicode)) {
491 PyErr_BadArgument();
492 goto onError;
493 }
Fred Drakee4315f52000-05-09 19:53:39 +0000494
495 if (encoding == NULL)
496 encoding = PyUnicode_GetDefaultEncoding();
497
498 /* Shortcuts for common default encodings */
499 if (errors == NULL) {
500 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000502 else if (strcmp(encoding, "latin-1") == 0)
503 return PyUnicode_AsLatin1String(unicode);
504 else if (strcmp(encoding, "ascii") == 0)
505 return PyUnicode_AsASCIIString(unicode);
506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507
508 /* Encode via the codec registry */
509 v = PyCodec_Encode(unicode, encoding, errors);
510 if (v == NULL)
511 goto onError;
512 /* XXX Should we really enforce this ? */
513 if (!PyString_Check(v)) {
514 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000515 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516 v->ob_type->tp_name);
517 Py_DECREF(v);
518 goto onError;
519 }
520 return v;
521
522 onError:
523 return NULL;
524}
525
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000526/* Return a Python string holding the default encoded value of the
527 Unicode object.
528
529 The resulting string is cached in the Unicode object for subsequent
530 usage by this function. The cached version is needed to implement
531 the character buffer interface and will live (at least) as long as
532 the Unicode object itself.
533
534 The refcount of the string is *not* incremented.
535
536 *** Exported for internal use by the interpreter only !!! ***
537
538*/
539
540PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
541 const char *errors)
542{
543 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
544
545 if (v)
546 return v;
547 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
548 if (v && errors == NULL)
549 ((PyUnicodeObject *)unicode)->defenc = v;
550 return v;
551}
552
Guido van Rossumd57fd912000-03-10 22:53:23 +0000553Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
554{
555 if (!PyUnicode_Check(unicode)) {
556 PyErr_BadArgument();
557 goto onError;
558 }
559 return PyUnicode_AS_UNICODE(unicode);
560
561 onError:
562 return NULL;
563}
564
565int PyUnicode_GetSize(PyObject *unicode)
566{
567 if (!PyUnicode_Check(unicode)) {
568 PyErr_BadArgument();
569 goto onError;
570 }
571 return PyUnicode_GET_SIZE(unicode);
572
573 onError:
574 return -1;
575}
576
Thomas Wouters78890102000-07-22 19:25:51 +0000577const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000578{
579 return unicode_default_encoding;
580}
581
582int PyUnicode_SetDefaultEncoding(const char *encoding)
583{
584 PyObject *v;
585
586 /* Make sure the encoding is valid. As side effect, this also
587 loads the encoding into the codec registry cache. */
588 v = _PyCodec_Lookup(encoding);
589 if (v == NULL)
590 goto onError;
591 Py_DECREF(v);
592 strncpy(unicode_default_encoding,
593 encoding,
594 sizeof(unicode_default_encoding));
595 return 0;
596
597 onError:
598 return -1;
599}
600
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601/* --- UTF-8 Codec -------------------------------------------------------- */
602
/* Maps the first byte of a UTF-8 sequence to the total length of that
   sequence in bytes. A zero entry marks a byte that is not a legal
   prefix. See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFD: four, five and six byte sequences; 0xFE and 0xFF
       are not legal prefixes */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
624
625static
626int utf8_decoding_error(const char **source,
627 Py_UNICODE **dest,
628 const char *errors,
629 const char *details)
630{
631 if ((errors == NULL) ||
632 (strcmp(errors,"strict") == 0)) {
633 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000634 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 details);
636 return -1;
637 }
638 else if (strcmp(errors,"ignore") == 0) {
639 (*source)++;
640 return 0;
641 }
642 else if (strcmp(errors,"replace") == 0) {
643 (*source)++;
644 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
645 (*dest)++;
646 return 0;
647 }
648 else {
649 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 errors);
652 return -1;
653 }
654}
655
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656PyObject *PyUnicode_DecodeUTF8(const char *s,
657 int size,
658 const char *errors)
659{
660 int n;
661 const char *e;
662 PyUnicodeObject *unicode;
663 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665
666 /* Note: size will always be longer than the resulting Unicode
667 character count */
668 unicode = _PyUnicode_New(size);
669 if (!unicode)
670 return NULL;
671 if (size == 0)
672 return (PyObject *)unicode;
673
674 /* Unpack UTF-8 encoded data */
675 p = unicode->str;
676 e = s + size;
677
678 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000679 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000680
681 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000682 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683 s++;
684 continue;
685 }
686
687 n = utf8_code_length[ch];
688
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000689 if (s + n > e) {
690 errmsg = "unexpected end of data";
691 goto utf8Error;
692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693
694 switch (n) {
695
696 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 errmsg = "unexpected code byte";
698 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699 break;
700
701 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 errmsg = "internal error";
703 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000704 break;
705
706 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000707 if ((s[1] & 0xc0) != 0x80) {
708 errmsg = "invalid data";
709 goto utf8Error;
710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000712 if (ch < 0x80) {
713 errmsg = "illegal encoding";
714 goto utf8Error;
715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000716 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000717 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 break;
719
720 case 3:
721 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000722 (s[2] & 0xc0) != 0x80) {
723 errmsg = "invalid data";
724 goto utf8Error;
725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000727 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
728 errmsg = "illegal encoding";
729 goto utf8Error;
730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000732 *p++ = (Py_UNICODE)ch;
733 break;
734
735 case 4:
736 if ((s[1] & 0xc0) != 0x80 ||
737 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000738 (s[3] & 0xc0) != 0x80) {
739 errmsg = "invalid data";
740 goto utf8Error;
741 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000742 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
743 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
744 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000745 if ((ch < 0x10000) || /* minimum value allowed for 4
746 byte encoding */
747 (ch > 0x10ffff)) { /* maximum value allowed for
748 UTF-16 */
749 errmsg = "illegal encoding";
750 goto utf8Error;
751 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000752 /* compute and append the two surrogates: */
753
754 /* translate from 10000..10FFFF to 0..FFFF */
755 ch -= 0x10000;
756
757 /* high surrogate = top 10 bits added to D800 */
758 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
759
760 /* low surrogate = bottom 10 bits added to DC00 */
761 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 break;
763
764 default:
765 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000766 errmsg = "unsupported Unicode code range";
767 goto utf8Error;
768 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000769 }
770 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000771 continue;
772
773 utf8Error:
774 if (utf8_decoding_error(&s, &p, errors, errmsg))
775 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 }
777
778 /* Adjust length */
779 if (_PyUnicode_Resize(unicode, p - unicode->str))
780 goto onError;
781
782 return (PyObject *)unicode;
783
784onError:
785 Py_DECREF(unicode);
786 return NULL;
787}
788
/* Disabled: no longer needed now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling policy named by errors to a UTF-8 encoding
   problem described by details.

   NULL / "strict": raise UnicodeError and return -1.
   "ignore":        return 0 without emitting anything.
   "replace":       emit '?' to *dest and return 0.
   anything else:   raise ValueError and return -1. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        (*dest)[0] = '?';
        *dest += 1;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822
823PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
824 int size,
825 const char *errors)
826{
827 PyObject *v;
828 char *p;
829 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000830 Py_UCS4 ch2;
831 unsigned int cbAllocated = 3 * size;
832 unsigned int cbWritten = 0;
833 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000835 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 if (v == NULL)
837 return NULL;
838 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000839 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840
841 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000842 while (i < size) {
843 Py_UCS4 ch = s[i++];
844 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000846 cbWritten++;
847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 else if (ch < 0x0800) {
849 *p++ = 0xc0 | (ch >> 6);
850 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000851 cbWritten += 2;
852 }
853 else {
854 /* Check for high surrogate */
855 if (0xD800 <= ch && ch <= 0xDBFF) {
856 if (i != size) {
857 ch2 = s[i];
858 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
859
860 if (cbWritten >= (cbAllocated - 4)) {
861 /* Provide enough room for some more
862 surrogates */
863 cbAllocated += 4*10;
864 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000865 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000866 }
867
868 /* combine the two values */
869 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
870
871 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000872 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000873 i++;
874 cbWritten += 4;
875 }
876 }
877 }
878 else {
879 *p++ = (char)(0xe0 | (ch >> 12));
880 cbWritten += 3;
881 }
882 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
883 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 }
885 }
886 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000887 if (_PyString_Resize(&v, p - q))
888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 return v;
890
891 onError:
892 Py_DECREF(v);
893 return NULL;
894}
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
897{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 if (!PyUnicode_Check(unicode)) {
899 PyErr_BadArgument();
900 return NULL;
901 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000902 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
903 PyUnicode_GET_SIZE(unicode),
904 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000905}
906
907/* --- UTF-16 Codec ------------------------------------------------------- */
908
909static
910int utf16_decoding_error(const Py_UNICODE **source,
911 Py_UNICODE **dest,
912 const char *errors,
913 const char *details)
914{
915 if ((errors == NULL) ||
916 (strcmp(errors,"strict") == 0)) {
917 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000918 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000919 details);
920 return -1;
921 }
922 else if (strcmp(errors,"ignore") == 0) {
923 return 0;
924 }
925 else if (strcmp(errors,"replace") == 0) {
926 if (dest) {
927 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
928 (*dest)++;
929 }
930 return 0;
931 }
932 else {
933 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000934 "UTF-16 decoding error; "
935 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 errors);
937 return -1;
938 }
939}
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941PyObject *PyUnicode_DecodeUTF16(const char *s,
942 int size,
943 const char *errors,
944 int *byteorder)
945{
946 PyUnicodeObject *unicode;
947 Py_UNICODE *p;
948 const Py_UNICODE *q, *e;
949 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000950 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951
952 /* size should be an even number */
953 if (size % sizeof(Py_UNICODE) != 0) {
954 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
955 return NULL;
956 /* The remaining input chars are ignored if we fall through
957 here... */
958 }
959
960 /* Note: size will always be longer than the resulting Unicode
961 character count */
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
967
968 /* Unpack UTF-16 encoded data */
969 p = unicode->str;
970 q = (Py_UNICODE *)s;
971 e = q + (size / sizeof(Py_UNICODE));
972
973 if (byteorder)
974 bo = *byteorder;
975
976 while (q < e) {
977 register Py_UNICODE ch = *q++;
978
979 /* Check for BOM marks (U+FEFF) in the input and adjust
980 current byte order setting accordingly. Swap input
981 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
982 !) */
983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
984 if (ch == 0xFEFF) {
985 bo = -1;
986 continue;
987 } else if (ch == 0xFFFE) {
988 bo = 1;
989 continue;
990 }
991 if (bo == 1)
992 ch = (ch >> 8) | (ch << 8);
993#else
994 if (ch == 0xFEFF) {
995 bo = 1;
996 continue;
997 } else if (ch == 0xFFFE) {
998 bo = -1;
999 continue;
1000 }
1001 if (bo == -1)
1002 ch = (ch >> 8) | (ch << 8);
1003#endif
1004 if (ch < 0xD800 || ch > 0xDFFF) {
1005 *p++ = ch;
1006 continue;
1007 }
1008
1009 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001010 if (q >= e) {
1011 errmsg = "unexpected end of data";
1012 goto utf16Error;
1013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014 if (0xDC00 <= *q && *q <= 0xDFFF) {
1015 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001016 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017 /* This is valid data (a UTF-16 surrogate pair), but
1018 we are not able to store this information since our
1019 Py_UNICODE type only has 16 bits... this might
1020 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001021 errmsg = "code pairs are not supported";
1022 goto utf16Error;
1023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 else
1025 continue;
1026 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001027 errmsg = "illegal encoding";
1028 /* Fall through to report the error */
1029
1030 utf16Error:
1031 if (utf16_decoding_error(&q, &p, errors, errmsg))
1032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 }
1034
1035 if (byteorder)
1036 *byteorder = bo;
1037
1038 /* Adjust length */
1039 if (_PyUnicode_Resize(unicode, p - unicode->str))
1040 goto onError;
1041
1042 return (PyObject *)unicode;
1043
1044onError:
1045 Py_DECREF(unicode);
1046 return NULL;
1047}
1048
1049#undef UTF16_ERROR
1050
1051PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1052 int size,
1053 const char *errors,
1054 int byteorder)
1055{
1056 PyObject *v;
1057 Py_UNICODE *p;
1058 char *q;
1059
1060 /* We don't create UTF-16 pairs... */
1061 v = PyString_FromStringAndSize(NULL,
1062 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1063 if (v == NULL)
1064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065
1066 q = PyString_AS_STRING(v);
1067 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 if (byteorder == 0)
1069 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001070 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001071 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (byteorder == 0 ||
1073#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1074 byteorder == -1
1075#else
1076 byteorder == 1
1077#endif
1078 )
1079 memcpy(p, s, size * sizeof(Py_UNICODE));
1080 else
1081 while (size-- > 0) {
1082 Py_UNICODE ch = *s++;
1083 *p++ = (ch >> 8) | (ch << 8);
1084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 return v;
1086}
1087
1088PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1089{
1090 if (!PyUnicode_Check(unicode)) {
1091 PyErr_BadArgument();
1092 return NULL;
1093 }
1094 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1095 PyUnicode_GET_SIZE(unicode),
1096 NULL,
1097 0);
1098}
1099
1100/* --- Unicode Escape Codec ----------------------------------------------- */
1101
1102static
1103int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001104 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 const char *errors,
1106 const char *details)
1107{
1108 if ((errors == NULL) ||
1109 (strcmp(errors,"strict") == 0)) {
1110 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001111 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 details);
1113 return -1;
1114 }
1115 else if (strcmp(errors,"ignore") == 0) {
1116 return 0;
1117 }
1118 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001119 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120 return 0;
1121 }
1122 else {
1123 PyErr_Format(PyExc_ValueError,
1124 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001125 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126 errors);
1127 return -1;
1128 }
1129}
1130
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001131static _Py_UCNHashAPI *pucnHash = NULL;
1132
/* Compare the first `count` characters of `s1` and `s2`, ignoring case.

   Returns 0 when `count` is 0 or all compared characters match after
   tolower(); otherwise returns the difference of the first mismatching
   pair of lowered characters (non-zero).

   The comparison does not stop at a NUL byte, so both buffers must have
   at least `count` readable characters (or differ before that).

   Characters are converted to unsigned char before being handed to
   tolower(): passing a negative plain-char value to the <ctype.h>
   functions is undefined behaviour. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    if (count == 0)
        return 0;

    do {
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
    } while (--count && c1 == c2);

    return c1 - c2;
}
1152
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1154 int size,
1155 const char *errors)
1156{
1157 PyUnicodeObject *v;
1158 Py_UNICODE *p = NULL, *buf = NULL;
1159 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001160 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161
1162 /* Escaped strings will always be longer than the resulting
1163 Unicode string, so we start with size here and then reduce the
1164 length after conversion to the true value. */
1165 v = _PyUnicode_New(size);
1166 if (v == NULL)
1167 goto onError;
1168 if (size == 0)
1169 return (PyObject *)v;
1170 p = buf = PyUnicode_AS_UNICODE(v);
1171 end = s + size;
1172 while (s < end) {
1173 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001174 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 int i;
1176
1177 /* Non-escape characters are interpreted as Unicode ordinals */
1178 if (*s != '\\') {
1179 *p++ = (unsigned char)*s++;
1180 continue;
1181 }
1182
1183 /* \ - Escapes */
1184 s++;
1185 switch (*s++) {
1186
1187 /* \x escapes */
1188 case '\n': break;
1189 case '\\': *p++ = '\\'; break;
1190 case '\'': *p++ = '\''; break;
1191 case '\"': *p++ = '\"'; break;
1192 case 'b': *p++ = '\b'; break;
1193 case 'f': *p++ = '\014'; break; /* FF */
1194 case 't': *p++ = '\t'; break;
1195 case 'n': *p++ = '\n'; break;
1196 case 'r': *p++ = '\r'; break;
1197 case 'v': *p++ = '\013'; break; /* VT */
1198 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1199
1200 /* \OOO (octal) escapes */
1201 case '0': case '1': case '2': case '3':
1202 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001203 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001205 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001207 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001209 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 break;
1211
Fredrik Lundhdf846752000-09-03 11:29:49 +00001212 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001214 for (x = 0, i = 0; i < 2; i++) {
1215 c = (unsigned char)s[i];
1216 if (!isxdigit(c)) {
1217 if (unicodeescape_decoding_error(&s, &x, errors,
1218 "truncated \\xXX"))
1219 goto onError;
1220 i++;
1221 break;
1222 }
1223 x = (x<<4) & ~0xF;
1224 if (c >= '0' && c <= '9')
1225 x += c - '0';
1226 else if (c >= 'a' && c <= 'f')
1227 x += 10 + c - 'a';
1228 else
1229 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001231 s += i;
1232 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 break;
1234
1235 /* \uXXXX with 4 hex digits */
1236 case 'u':
1237 for (x = 0, i = 0; i < 4; i++) {
1238 c = (unsigned char)s[i];
1239 if (!isxdigit(c)) {
1240 if (unicodeescape_decoding_error(&s, &x, errors,
1241 "truncated \\uXXXX"))
1242 goto onError;
1243 i++;
1244 break;
1245 }
1246 x = (x<<4) & ~0xF;
1247 if (c >= '0' && c <= '9')
1248 x += c - '0';
1249 else if (c >= 'a' && c <= 'f')
1250 x += 10 + c - 'a';
1251 else
1252 x += 10 + c - 'A';
1253 }
1254 s += i;
1255 *p++ = x;
1256 break;
1257
Fredrik Lundhdf846752000-09-03 11:29:49 +00001258 /* \UXXXXXXXX with 8 hex digits */
1259 case 'U':
1260 for (chr = 0, i = 0; i < 8; i++) {
1261 c = (unsigned char)s[i];
1262 if (!isxdigit(c)) {
1263 if (unicodeescape_decoding_error(&s, &x, errors,
1264 "truncated \\uXXXX"))
1265 goto onError;
1266 i++;
1267 break;
1268 }
1269 chr = (chr<<4) & ~0xF;
1270 if (c >= '0' && c <= '9')
1271 chr += c - '0';
1272 else if (c >= 'a' && c <= 'f')
1273 chr += 10 + c - 'a';
1274 else
1275 chr += 10 + c - 'A';
1276 }
1277 s += i;
1278 goto store;
1279
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001280 case 'N':
1281 /* Ok, we need to deal with Unicode Character Names now,
1282 * make sure we've imported the hash table data...
1283 */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001284 if (pucnHash == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001285 PyObject *mod = 0, *v = 0;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001286 mod = PyImport_ImportModule("ucnhash");
1287 if (mod == NULL)
1288 goto onError;
1289 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1290 Py_DECREF(mod);
1291 if (v == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001292 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001293 pucnHash = PyCObject_AsVoidPtr(v);
1294 Py_DECREF(v);
1295 if (pucnHash == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001296 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001297 }
1298
Fredrik Lundhdf846752000-09-03 11:29:49 +00001299 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001300 const char *start = s + 1;
1301 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001302 unsigned long j;
1303
1304 /* look for either the closing brace, or we
1305 * exceed the maximum length of the unicode character names
1306 */
1307 while (*endBrace != '}' &&
1308 (unsigned int)(endBrace - start) <=
1309 pucnHash->cchMax &&
1310 endBrace < end)
1311 {
1312 endBrace++;
1313 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001314 if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001315 j = pucnHash->hash(start, endBrace - start);
1316 if (j > pucnHash->cKeys ||
1317 mystrnicmp(
1318 start,
1319 ((_Py_UnicodeCharacterName *)
1320 (pucnHash->getValue(j)))->pszUCN,
1321 (int)(endBrace - start)) != 0)
1322 {
1323 if (unicodeescape_decoding_error(
1324 &s, &x, errors,
1325 "Invalid Unicode Character Name"))
1326 {
1327 goto onError;
1328 }
1329 goto ucnFallthrough;
1330 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001331 chr = ((_Py_UnicodeCharacterName *)
1332 (pucnHash->getValue(j)))->value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001333 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001334 goto store;
1335 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001336 if (unicodeescape_decoding_error(
1337 &s, &x, errors,
1338 "Unicode name missing closing brace"))
1339 goto onError;
1340 goto ucnFallthrough;
1341 }
1342 break;
1343 }
1344 if (unicodeescape_decoding_error(
1345 &s, &x, errors,
1346 "Missing opening brace for Unicode Character Name escape"))
1347 goto onError;
1348ucnFallthrough:
1349 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001350 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 *p++ = '\\';
1352 *p++ = (unsigned char)s[-1];
1353 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001354store:
1355 /* when we get here, chr is a 32-bit unicode character */
1356 if (chr <= 0xffff)
1357 /* UCS-2 character */
1358 *p++ = (Py_UNICODE) chr;
1359 else if (chr <= 0x10ffff) {
1360 /* UCS-4 character. store as two surrogate characters */
1361 chr -= 0x10000L;
1362 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1363 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1364 } else {
1365 if (unicodeescape_decoding_error(
1366 &s, &x, errors,
1367 "Illegal Unicode character")
1368 )
1369 goto onError;
1370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 }
1372 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001373 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return (PyObject *)v;
1376
1377 onError:
1378 Py_XDECREF(v);
1379 return NULL;
1380}
1381
1382/* Return a Unicode-Escape string version of the Unicode object.
1383
1384 If quotes is true, the string is enclosed in u"" or u'' quotes as
1385 appropriate.
1386
1387*/
1388
Barry Warsaw51ac5802000-03-20 16:36:48 +00001389static const Py_UNICODE *findchar(const Py_UNICODE *s,
1390 int size,
1391 Py_UNICODE ch);
1392
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393static
1394PyObject *unicodeescape_string(const Py_UNICODE *s,
1395 int size,
1396 int quotes)
1397{
1398 PyObject *repr;
1399 char *p;
1400 char *q;
1401
1402 static const char *hexdigit = "0123456789ABCDEF";
1403
1404 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1405 if (repr == NULL)
1406 return NULL;
1407
1408 p = q = PyString_AS_STRING(repr);
1409
1410 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411 *p++ = 'u';
1412 *p++ = (findchar(s, size, '\'') &&
1413 !findchar(s, size, '"')) ? '"' : '\'';
1414 }
1415 while (size-- > 0) {
1416 Py_UNICODE ch = *s++;
1417 /* Escape quotes */
1418 if (quotes && (ch == q[1] || ch == '\\')) {
1419 *p++ = '\\';
1420 *p++ = (char) ch;
1421 }
1422 /* Map 16-bit characters to '\uxxxx' */
1423 else if (ch >= 256) {
1424 *p++ = '\\';
1425 *p++ = 'u';
1426 *p++ = hexdigit[(ch >> 12) & 0xf];
1427 *p++ = hexdigit[(ch >> 8) & 0xf];
1428 *p++ = hexdigit[(ch >> 4) & 0xf];
1429 *p++ = hexdigit[ch & 15];
1430 }
1431 /* Map non-printable US ASCII to '\ooo' */
1432 else if (ch < ' ' || ch >= 128) {
1433 *p++ = '\\';
1434 *p++ = hexdigit[(ch >> 6) & 7];
1435 *p++ = hexdigit[(ch >> 3) & 7];
1436 *p++ = hexdigit[ch & 7];
1437 }
1438 /* Copy everything else as-is */
1439 else
1440 *p++ = (char) ch;
1441 }
1442 if (quotes)
1443 *p++ = q[1];
1444
1445 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001446 if (_PyString_Resize(&repr, p - q))
1447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
1449 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001450
1451 onError:
1452 Py_DECREF(repr);
1453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454}
1455
1456PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1457 int size)
1458{
1459 return unicodeescape_string(s, size, 0);
1460}
1461
1462PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1463{
1464 if (!PyUnicode_Check(unicode)) {
1465 PyErr_BadArgument();
1466 return NULL;
1467 }
1468 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1469 PyUnicode_GET_SIZE(unicode));
1470}
1471
1472/* --- Raw Unicode Escape Codec ------------------------------------------- */
1473
1474PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1475 int size,
1476 const char *errors)
1477{
1478 PyUnicodeObject *v;
1479 Py_UNICODE *p, *buf;
1480 const char *end;
1481 const char *bs;
1482
1483 /* Escaped strings will always be longer than the resulting
1484 Unicode string, so we start with size here and then reduce the
1485 length after conversion to the true value. */
1486 v = _PyUnicode_New(size);
1487 if (v == NULL)
1488 goto onError;
1489 if (size == 0)
1490 return (PyObject *)v;
1491 p = buf = PyUnicode_AS_UNICODE(v);
1492 end = s + size;
1493 while (s < end) {
1494 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001495 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 int i;
1497
1498 /* Non-escape characters are interpreted as Unicode ordinals */
1499 if (*s != '\\') {
1500 *p++ = (unsigned char)*s++;
1501 continue;
1502 }
1503
1504 /* \u-escapes are only interpreted iff the number of leading
1505 backslashes if odd */
1506 bs = s;
1507 for (;s < end;) {
1508 if (*s != '\\')
1509 break;
1510 *p++ = (unsigned char)*s++;
1511 }
1512 if (((s - bs) & 1) == 0 ||
1513 s >= end ||
1514 *s != 'u') {
1515 continue;
1516 }
1517 p--;
1518 s++;
1519
1520 /* \uXXXX with 4 hex digits */
1521 for (x = 0, i = 0; i < 4; i++) {
1522 c = (unsigned char)s[i];
1523 if (!isxdigit(c)) {
1524 if (unicodeescape_decoding_error(&s, &x, errors,
1525 "truncated \\uXXXX"))
1526 goto onError;
1527 i++;
1528 break;
1529 }
1530 x = (x<<4) & ~0xF;
1531 if (c >= '0' && c <= '9')
1532 x += c - '0';
1533 else if (c >= 'a' && c <= 'f')
1534 x += 10 + c - 'a';
1535 else
1536 x += 10 + c - 'A';
1537 }
1538 s += i;
1539 *p++ = x;
1540 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001541 if (_PyUnicode_Resize(v, (int)(p - buf)))
1542 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543 return (PyObject *)v;
1544
1545 onError:
1546 Py_XDECREF(v);
1547 return NULL;
1548}
1549
1550PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1551 int size)
1552{
1553 PyObject *repr;
1554 char *p;
1555 char *q;
1556
1557 static const char *hexdigit = "0123456789ABCDEF";
1558
1559 repr = PyString_FromStringAndSize(NULL, 6 * size);
1560 if (repr == NULL)
1561 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001562 if (size == 0)
1563 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564
1565 p = q = PyString_AS_STRING(repr);
1566 while (size-- > 0) {
1567 Py_UNICODE ch = *s++;
1568 /* Map 16-bit characters to '\uxxxx' */
1569 if (ch >= 256) {
1570 *p++ = '\\';
1571 *p++ = 'u';
1572 *p++ = hexdigit[(ch >> 12) & 0xf];
1573 *p++ = hexdigit[(ch >> 8) & 0xf];
1574 *p++ = hexdigit[(ch >> 4) & 0xf];
1575 *p++ = hexdigit[ch & 15];
1576 }
1577 /* Copy everything else as-is */
1578 else
1579 *p++ = (char) ch;
1580 }
1581 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001582 if (_PyString_Resize(&repr, p - q))
1583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584
1585 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001586
1587 onError:
1588 Py_DECREF(repr);
1589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590}
1591
1592PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1593{
1594 if (!PyUnicode_Check(unicode)) {
1595 PyErr_BadArgument();
1596 return NULL;
1597 }
1598 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1599 PyUnicode_GET_SIZE(unicode));
1600}
1601
1602/* --- Latin-1 Codec ------------------------------------------------------ */
1603
1604PyObject *PyUnicode_DecodeLatin1(const char *s,
1605 int size,
1606 const char *errors)
1607{
1608 PyUnicodeObject *v;
1609 Py_UNICODE *p;
1610
1611 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1612 v = _PyUnicode_New(size);
1613 if (v == NULL)
1614 goto onError;
1615 if (size == 0)
1616 return (PyObject *)v;
1617 p = PyUnicode_AS_UNICODE(v);
1618 while (size-- > 0)
1619 *p++ = (unsigned char)*s++;
1620 return (PyObject *)v;
1621
1622 onError:
1623 Py_XDECREF(v);
1624 return NULL;
1625}
1626
1627static
1628int latin1_encoding_error(const Py_UNICODE **source,
1629 char **dest,
1630 const char *errors,
1631 const char *details)
1632{
1633 if ((errors == NULL) ||
1634 (strcmp(errors,"strict") == 0)) {
1635 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001636 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637 details);
1638 return -1;
1639 }
1640 else if (strcmp(errors,"ignore") == 0) {
1641 return 0;
1642 }
1643 else if (strcmp(errors,"replace") == 0) {
1644 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001645 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 return 0;
1647 }
1648 else {
1649 PyErr_Format(PyExc_ValueError,
1650 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001651 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 errors);
1653 return -1;
1654 }
1655}
1656
1657PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1658 int size,
1659 const char *errors)
1660{
1661 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001662 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001663
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664 repr = PyString_FromStringAndSize(NULL, size);
1665 if (repr == NULL)
1666 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001667 if (size == 0)
1668 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669
1670 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001671 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672 while (size-- > 0) {
1673 Py_UNICODE ch = *p++;
1674 if (ch >= 256) {
1675 if (latin1_encoding_error(&p, &s, errors,
1676 "ordinal not in range(256)"))
1677 goto onError;
1678 }
1679 else
1680 *s++ = (char)ch;
1681 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001682 /* Resize if error handling skipped some characters */
1683 if (s - start < PyString_GET_SIZE(repr))
1684 if (_PyString_Resize(&repr, s - start))
1685 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return repr;
1687
1688 onError:
1689 Py_DECREF(repr);
1690 return NULL;
1691}
1692
1693PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1694{
1695 if (!PyUnicode_Check(unicode)) {
1696 PyErr_BadArgument();
1697 return NULL;
1698 }
1699 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1700 PyUnicode_GET_SIZE(unicode),
1701 NULL);
1702}
1703
1704/* --- 7-bit ASCII Codec -------------------------------------------------- */
1705
1706static
1707int ascii_decoding_error(const char **source,
1708 Py_UNICODE **dest,
1709 const char *errors,
1710 const char *details)
1711{
1712 if ((errors == NULL) ||
1713 (strcmp(errors,"strict") == 0)) {
1714 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001715 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 details);
1717 return -1;
1718 }
1719 else if (strcmp(errors,"ignore") == 0) {
1720 return 0;
1721 }
1722 else if (strcmp(errors,"replace") == 0) {
1723 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1724 (*dest)++;
1725 return 0;
1726 }
1727 else {
1728 PyErr_Format(PyExc_ValueError,
1729 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001730 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 errors);
1732 return -1;
1733 }
1734}
1735
1736PyObject *PyUnicode_DecodeASCII(const char *s,
1737 int size,
1738 const char *errors)
1739{
1740 PyUnicodeObject *v;
1741 Py_UNICODE *p;
1742
1743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744 v = _PyUnicode_New(size);
1745 if (v == NULL)
1746 goto onError;
1747 if (size == 0)
1748 return (PyObject *)v;
1749 p = PyUnicode_AS_UNICODE(v);
1750 while (size-- > 0) {
1751 register unsigned char c;
1752
1753 c = (unsigned char)*s++;
1754 if (c < 128)
1755 *p++ = c;
1756 else if (ascii_decoding_error(&s, &p, errors,
1757 "ordinal not in range(128)"))
1758 goto onError;
1759 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001760 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1761 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 return (PyObject *)v;
1764
1765 onError:
1766 Py_XDECREF(v);
1767 return NULL;
1768}
1769
1770static
1771int ascii_encoding_error(const Py_UNICODE **source,
1772 char **dest,
1773 const char *errors,
1774 const char *details)
1775{
1776 if ((errors == NULL) ||
1777 (strcmp(errors,"strict") == 0)) {
1778 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001779 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 details);
1781 return -1;
1782 }
1783 else if (strcmp(errors,"ignore") == 0) {
1784 return 0;
1785 }
1786 else if (strcmp(errors,"replace") == 0) {
1787 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001788 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 return 0;
1790 }
1791 else {
1792 PyErr_Format(PyExc_ValueError,
1793 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001794 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 errors);
1796 return -1;
1797 }
1798}
1799
1800PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1801 int size,
1802 const char *errors)
1803{
1804 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001805 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 repr = PyString_FromStringAndSize(NULL, size);
1808 if (repr == NULL)
1809 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001810 if (size == 0)
1811 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812
1813 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001814 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 while (size-- > 0) {
1816 Py_UNICODE ch = *p++;
1817 if (ch >= 128) {
1818 if (ascii_encoding_error(&p, &s, errors,
1819 "ordinal not in range(128)"))
1820 goto onError;
1821 }
1822 else
1823 *s++ = (char)ch;
1824 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001825 /* Resize if error handling skipped some characters */
1826 if (s - start < PyString_GET_SIZE(repr))
1827 if (_PyString_Resize(&repr, s - start))
1828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 return repr;
1830
1831 onError:
1832 Py_DECREF(repr);
1833 return NULL;
1834}
1835
1836PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1837{
1838 if (!PyUnicode_Check(unicode)) {
1839 PyErr_BadArgument();
1840 return NULL;
1841 }
1842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1843 PyUnicode_GET_SIZE(unicode),
1844 NULL);
1845}
1846
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001847#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001848
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001849/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001851PyObject *PyUnicode_DecodeMBCS(const char *s,
1852 int size,
1853 const char *errors)
1854{
1855 PyUnicodeObject *v;
1856 Py_UNICODE *p;
1857
1858 /* First get the size of the result */
1859 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001860 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1862
1863 v = _PyUnicode_New(usize);
1864 if (v == NULL)
1865 return NULL;
1866 if (usize == 0)
1867 return (PyObject *)v;
1868 p = PyUnicode_AS_UNICODE(v);
1869 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1870 Py_DECREF(v);
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872 }
1873
1874 return (PyObject *)v;
1875}
1876
1877PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1878 int size,
1879 const char *errors)
1880{
1881 PyObject *repr;
1882 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001883 DWORD mbcssize;
1884
1885 /* If there are no characters, bail now! */
1886 if (size==0)
1887 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888
1889 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001890 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001891 if (mbcssize==0)
1892 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1893
1894 repr = PyString_FromStringAndSize(NULL, mbcssize);
1895 if (repr == NULL)
1896 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001897 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001898 return repr;
1899
1900 /* Do the conversion */
1901 s = PyString_AS_STRING(repr);
1902 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1903 Py_DECREF(repr);
1904 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1905 }
1906 return repr;
1907}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001908
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001909#endif /* MS_WIN32 */
1910
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911/* --- Character Mapping Codec -------------------------------------------- */
1912
1913static
1914int charmap_decoding_error(const char **source,
1915 Py_UNICODE **dest,
1916 const char *errors,
1917 const char *details)
1918{
1919 if ((errors == NULL) ||
1920 (strcmp(errors,"strict") == 0)) {
1921 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001922 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 details);
1924 return -1;
1925 }
1926 else if (strcmp(errors,"ignore") == 0) {
1927 return 0;
1928 }
1929 else if (strcmp(errors,"replace") == 0) {
1930 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1931 (*dest)++;
1932 return 0;
1933 }
1934 else {
1935 PyErr_Format(PyExc_ValueError,
1936 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001937 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 errors);
1939 return -1;
1940 }
1941}
1942
1943PyObject *PyUnicode_DecodeCharmap(const char *s,
1944 int size,
1945 PyObject *mapping,
1946 const char *errors)
1947{
1948 PyUnicodeObject *v;
1949 Py_UNICODE *p;
1950
1951 /* Default to Latin-1 */
1952 if (mapping == NULL)
1953 return PyUnicode_DecodeLatin1(s, size, errors);
1954
1955 v = _PyUnicode_New(size);
1956 if (v == NULL)
1957 goto onError;
1958 if (size == 0)
1959 return (PyObject *)v;
1960 p = PyUnicode_AS_UNICODE(v);
1961 while (size-- > 0) {
1962 unsigned char ch = *s++;
1963 PyObject *w, *x;
1964
1965 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1966 w = PyInt_FromLong((long)ch);
1967 if (w == NULL)
1968 goto onError;
1969 x = PyObject_GetItem(mapping, w);
1970 Py_DECREF(w);
1971 if (x == NULL) {
1972 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1973 /* No mapping found: default to Latin-1 mapping */
1974 PyErr_Clear();
1975 *p++ = (Py_UNICODE)ch;
1976 continue;
1977 }
1978 goto onError;
1979 }
1980
1981 /* Apply mapping */
1982 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001983 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 if (value < 0 || value > 65535) {
1985 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001986 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 Py_DECREF(x);
1988 goto onError;
1989 }
1990 *p++ = (Py_UNICODE)value;
1991 }
1992 else if (x == Py_None) {
1993 /* undefined mapping */
1994 if (charmap_decoding_error(&s, &p, errors,
1995 "character maps to <undefined>")) {
1996 Py_DECREF(x);
1997 goto onError;
1998 }
1999 }
2000 else if (PyUnicode_Check(x)) {
2001 if (PyUnicode_GET_SIZE(x) != 1) {
2002 /* 1-n mapping */
2003 PyErr_SetString(PyExc_NotImplementedError,
2004 "1-n mappings are currently not implemented");
2005 Py_DECREF(x);
2006 goto onError;
2007 }
2008 *p++ = *PyUnicode_AS_UNICODE(x);
2009 }
2010 else {
2011 /* wrong return value */
2012 PyErr_SetString(PyExc_TypeError,
2013 "character mapping must return integer, None or unicode");
2014 Py_DECREF(x);
2015 goto onError;
2016 }
2017 Py_DECREF(x);
2018 }
2019 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2020 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2021 goto onError;
2022 return (PyObject *)v;
2023
2024 onError:
2025 Py_XDECREF(v);
2026 return NULL;
2027}
2028
2029static
2030int charmap_encoding_error(const Py_UNICODE **source,
2031 char **dest,
2032 const char *errors,
2033 const char *details)
2034{
2035 if ((errors == NULL) ||
2036 (strcmp(errors,"strict") == 0)) {
2037 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002038 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 details);
2040 return -1;
2041 }
2042 else if (strcmp(errors,"ignore") == 0) {
2043 return 0;
2044 }
2045 else if (strcmp(errors,"replace") == 0) {
2046 **dest = '?';
2047 (*dest)++;
2048 return 0;
2049 }
2050 else {
2051 PyErr_Format(PyExc_ValueError,
2052 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002053 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 errors);
2055 return -1;
2056 }
2057}
2058
2059PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2060 int size,
2061 PyObject *mapping,
2062 const char *errors)
2063{
2064 PyObject *v;
2065 char *s;
2066
2067 /* Default to Latin-1 */
2068 if (mapping == NULL)
2069 return PyUnicode_EncodeLatin1(p, size, errors);
2070
2071 v = PyString_FromStringAndSize(NULL, size);
2072 if (v == NULL)
2073 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002074 if (size == 0)
2075 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 s = PyString_AS_STRING(v);
2077 while (size-- > 0) {
2078 Py_UNICODE ch = *p++;
2079 PyObject *w, *x;
2080
2081 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2082 w = PyInt_FromLong((long)ch);
2083 if (w == NULL)
2084 goto onError;
2085 x = PyObject_GetItem(mapping, w);
2086 Py_DECREF(w);
2087 if (x == NULL) {
2088 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2089 /* No mapping found: default to Latin-1 mapping if possible */
2090 PyErr_Clear();
2091 if (ch < 256) {
2092 *s++ = (char)ch;
2093 continue;
2094 }
2095 else if (!charmap_encoding_error(&p, &s, errors,
2096 "missing character mapping"))
2097 continue;
2098 }
2099 goto onError;
2100 }
2101
2102 /* Apply mapping */
2103 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002104 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 if (value < 0 || value > 255) {
2106 PyErr_SetString(PyExc_TypeError,
2107 "character mapping must be in range(256)");
2108 Py_DECREF(x);
2109 goto onError;
2110 }
2111 *s++ = (char)value;
2112 }
2113 else if (x == Py_None) {
2114 /* undefined mapping */
2115 if (charmap_encoding_error(&p, &s, errors,
2116 "character maps to <undefined>")) {
2117 Py_DECREF(x);
2118 goto onError;
2119 }
2120 }
2121 else if (PyString_Check(x)) {
2122 if (PyString_GET_SIZE(x) != 1) {
2123 /* 1-n mapping */
2124 PyErr_SetString(PyExc_NotImplementedError,
2125 "1-n mappings are currently not implemented");
2126 Py_DECREF(x);
2127 goto onError;
2128 }
2129 *s++ = *PyString_AS_STRING(x);
2130 }
2131 else {
2132 /* wrong return value */
2133 PyErr_SetString(PyExc_TypeError,
2134 "character mapping must return integer, None or unicode");
2135 Py_DECREF(x);
2136 goto onError;
2137 }
2138 Py_DECREF(x);
2139 }
2140 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2141 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2142 goto onError;
2143 return v;
2144
2145 onError:
2146 Py_DECREF(v);
2147 return NULL;
2148}
2149
2150PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2151 PyObject *mapping)
2152{
2153 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2154 PyErr_BadArgument();
2155 return NULL;
2156 }
2157 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2158 PyUnicode_GET_SIZE(unicode),
2159 mapping,
2160 NULL);
2161}
2162
2163static
2164int translate_error(const Py_UNICODE **source,
2165 Py_UNICODE **dest,
2166 const char *errors,
2167 const char *details)
2168{
2169 if ((errors == NULL) ||
2170 (strcmp(errors,"strict") == 0)) {
2171 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002172 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 details);
2174 return -1;
2175 }
2176 else if (strcmp(errors,"ignore") == 0) {
2177 return 0;
2178 }
2179 else if (strcmp(errors,"replace") == 0) {
2180 **dest = '?';
2181 (*dest)++;
2182 return 0;
2183 }
2184 else {
2185 PyErr_Format(PyExc_ValueError,
2186 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002187 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 errors);
2189 return -1;
2190 }
2191}
2192
2193PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2194 int size,
2195 PyObject *mapping,
2196 const char *errors)
2197{
2198 PyUnicodeObject *v;
2199 Py_UNICODE *p;
2200
2201 if (mapping == NULL) {
2202 PyErr_BadArgument();
2203 return NULL;
2204 }
2205
2206 /* Output will never be longer than input */
2207 v = _PyUnicode_New(size);
2208 if (v == NULL)
2209 goto onError;
2210 if (size == 0)
2211 goto done;
2212 p = PyUnicode_AS_UNICODE(v);
2213 while (size-- > 0) {
2214 Py_UNICODE ch = *s++;
2215 PyObject *w, *x;
2216
2217 /* Get mapping */
2218 w = PyInt_FromLong(ch);
2219 if (w == NULL)
2220 goto onError;
2221 x = PyObject_GetItem(mapping, w);
2222 Py_DECREF(w);
2223 if (x == NULL) {
2224 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2225 /* No mapping found: default to 1-1 mapping */
2226 PyErr_Clear();
2227 *p++ = ch;
2228 continue;
2229 }
2230 goto onError;
2231 }
2232
2233 /* Apply mapping */
2234 if (PyInt_Check(x))
2235 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2236 else if (x == Py_None) {
2237 /* undefined mapping */
2238 if (translate_error(&s, &p, errors,
2239 "character maps to <undefined>")) {
2240 Py_DECREF(x);
2241 goto onError;
2242 }
2243 }
2244 else if (PyUnicode_Check(x)) {
2245 if (PyUnicode_GET_SIZE(x) != 1) {
2246 /* 1-n mapping */
2247 PyErr_SetString(PyExc_NotImplementedError,
2248 "1-n mappings are currently not implemented");
2249 Py_DECREF(x);
2250 goto onError;
2251 }
2252 *p++ = *PyUnicode_AS_UNICODE(x);
2253 }
2254 else {
2255 /* wrong return value */
2256 PyErr_SetString(PyExc_TypeError,
2257 "translate mapping must return integer, None or unicode");
2258 Py_DECREF(x);
2259 goto onError;
2260 }
2261 Py_DECREF(x);
2262 }
2263 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002264 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2265 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266
2267 done:
2268 return (PyObject *)v;
2269
2270 onError:
2271 Py_XDECREF(v);
2272 return NULL;
2273}
2274
2275PyObject *PyUnicode_Translate(PyObject *str,
2276 PyObject *mapping,
2277 const char *errors)
2278{
2279 PyObject *result;
2280
2281 str = PyUnicode_FromObject(str);
2282 if (str == NULL)
2283 goto onError;
2284 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2285 PyUnicode_GET_SIZE(str),
2286 mapping,
2287 errors);
2288 Py_DECREF(str);
2289 return result;
2290
2291 onError:
2292 Py_XDECREF(str);
2293 return NULL;
2294}
2295
Guido van Rossum9e896b32000-04-05 20:11:21 +00002296/* --- Decimal Encoder ---------------------------------------------------- */
2297
2298int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2299 int length,
2300 char *output,
2301 const char *errors)
2302{
2303 Py_UNICODE *p, *end;
2304
2305 if (output == NULL) {
2306 PyErr_BadArgument();
2307 return -1;
2308 }
2309
2310 p = s;
2311 end = s + length;
2312 while (p < end) {
2313 register Py_UNICODE ch = *p++;
2314 int decimal;
2315
2316 if (Py_UNICODE_ISSPACE(ch)) {
2317 *output++ = ' ';
2318 continue;
2319 }
2320 decimal = Py_UNICODE_TODECIMAL(ch);
2321 if (decimal >= 0) {
2322 *output++ = '0' + decimal;
2323 continue;
2324 }
Guido van Rossumba477042000-04-06 18:18:10 +00002325 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002326 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002327 continue;
2328 }
2329 /* All other characters are considered invalid */
2330 if (errors == NULL || strcmp(errors, "strict") == 0) {
2331 PyErr_SetString(PyExc_ValueError,
2332 "invalid decimal Unicode string");
2333 goto onError;
2334 }
2335 else if (strcmp(errors, "ignore") == 0)
2336 continue;
2337 else if (strcmp(errors, "replace") == 0) {
2338 *output++ = '?';
2339 continue;
2340 }
2341 }
2342 /* 0-terminate the output string */
2343 *output++ = '\0';
2344 return 0;
2345
2346 onError:
2347 return -1;
2348}
2349
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350/* --- Helpers ------------------------------------------------------------ */
2351
2352static
2353int count(PyUnicodeObject *self,
2354 int start,
2355 int end,
2356 PyUnicodeObject *substring)
2357{
2358 int count = 0;
2359
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002360 if (substring->length == 0)
2361 return (end - start + 1);
2362
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363 end -= substring->length;
2364
2365 while (start <= end)
2366 if (Py_UNICODE_MATCH(self, start, substring)) {
2367 count++;
2368 start += substring->length;
2369 } else
2370 start++;
2371
2372 return count;
2373}
2374
2375int PyUnicode_Count(PyObject *str,
2376 PyObject *substr,
2377 int start,
2378 int end)
2379{
2380 int result;
2381
2382 str = PyUnicode_FromObject(str);
2383 if (str == NULL)
2384 return -1;
2385 substr = PyUnicode_FromObject(substr);
2386 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002387 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388 return -1;
2389 }
2390
2391 result = count((PyUnicodeObject *)str,
2392 start, end,
2393 (PyUnicodeObject *)substr);
2394
2395 Py_DECREF(str);
2396 Py_DECREF(substr);
2397 return result;
2398}
2399
2400static
2401int findstring(PyUnicodeObject *self,
2402 PyUnicodeObject *substring,
2403 int start,
2404 int end,
2405 int direction)
2406{
2407 if (start < 0)
2408 start += self->length;
2409 if (start < 0)
2410 start = 0;
2411
2412 if (substring->length == 0)
2413 return start;
2414
2415 if (end > self->length)
2416 end = self->length;
2417 if (end < 0)
2418 end += self->length;
2419 if (end < 0)
2420 end = 0;
2421
2422 end -= substring->length;
2423
2424 if (direction < 0) {
2425 for (; end >= start; end--)
2426 if (Py_UNICODE_MATCH(self, end, substring))
2427 return end;
2428 } else {
2429 for (; start <= end; start++)
2430 if (Py_UNICODE_MATCH(self, start, substring))
2431 return start;
2432 }
2433
2434 return -1;
2435}
2436
2437int PyUnicode_Find(PyObject *str,
2438 PyObject *substr,
2439 int start,
2440 int end,
2441 int direction)
2442{
2443 int result;
2444
2445 str = PyUnicode_FromObject(str);
2446 if (str == NULL)
2447 return -1;
2448 substr = PyUnicode_FromObject(substr);
2449 if (substr == NULL) {
2450 Py_DECREF(substr);
2451 return -1;
2452 }
2453
2454 result = findstring((PyUnicodeObject *)str,
2455 (PyUnicodeObject *)substr,
2456 start, end, direction);
2457 Py_DECREF(str);
2458 Py_DECREF(substr);
2459 return result;
2460}
2461
2462static
2463int tailmatch(PyUnicodeObject *self,
2464 PyUnicodeObject *substring,
2465 int start,
2466 int end,
2467 int direction)
2468{
2469 if (start < 0)
2470 start += self->length;
2471 if (start < 0)
2472 start = 0;
2473
2474 if (substring->length == 0)
2475 return 1;
2476
2477 if (end > self->length)
2478 end = self->length;
2479 if (end < 0)
2480 end += self->length;
2481 if (end < 0)
2482 end = 0;
2483
2484 end -= substring->length;
2485 if (end < start)
2486 return 0;
2487
2488 if (direction > 0) {
2489 if (Py_UNICODE_MATCH(self, end, substring))
2490 return 1;
2491 } else {
2492 if (Py_UNICODE_MATCH(self, start, substring))
2493 return 1;
2494 }
2495
2496 return 0;
2497}
2498
2499int PyUnicode_Tailmatch(PyObject *str,
2500 PyObject *substr,
2501 int start,
2502 int end,
2503 int direction)
2504{
2505 int result;
2506
2507 str = PyUnicode_FromObject(str);
2508 if (str == NULL)
2509 return -1;
2510 substr = PyUnicode_FromObject(substr);
2511 if (substr == NULL) {
2512 Py_DECREF(substr);
2513 return -1;
2514 }
2515
2516 result = tailmatch((PyUnicodeObject *)str,
2517 (PyUnicodeObject *)substr,
2518 start, end, direction);
2519 Py_DECREF(str);
2520 Py_DECREF(substr);
2521 return result;
2522}
2523
2524static
2525const Py_UNICODE *findchar(const Py_UNICODE *s,
2526 int size,
2527 Py_UNICODE ch)
2528{
2529 /* like wcschr, but doesn't stop at NULL characters */
2530
2531 while (size-- > 0) {
2532 if (*s == ch)
2533 return s;
2534 s++;
2535 }
2536
2537 return NULL;
2538}
2539
2540/* Apply fixfct filter to the Unicode object self and return a
2541 reference to the modified object */
2542
2543static
2544PyObject *fixup(PyUnicodeObject *self,
2545 int (*fixfct)(PyUnicodeObject *s))
2546{
2547
2548 PyUnicodeObject *u;
2549
2550 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2551 self->length);
2552 if (u == NULL)
2553 return NULL;
2554 if (!fixfct(u)) {
2555 /* fixfct should return TRUE if it modified the buffer. If
2556 FALSE, return a reference to the original buffer instead
2557 (to save space, not time) */
2558 Py_INCREF(self);
2559 Py_DECREF(u);
2560 return (PyObject*) self;
2561 }
2562 return (PyObject*) u;
2563}
2564
2565static
2566int fixupper(PyUnicodeObject *self)
2567{
2568 int len = self->length;
2569 Py_UNICODE *s = self->str;
2570 int status = 0;
2571
2572 while (len-- > 0) {
2573 register Py_UNICODE ch;
2574
2575 ch = Py_UNICODE_TOUPPER(*s);
2576 if (ch != *s) {
2577 status = 1;
2578 *s = ch;
2579 }
2580 s++;
2581 }
2582
2583 return status;
2584}
2585
2586static
2587int fixlower(PyUnicodeObject *self)
2588{
2589 int len = self->length;
2590 Py_UNICODE *s = self->str;
2591 int status = 0;
2592
2593 while (len-- > 0) {
2594 register Py_UNICODE ch;
2595
2596 ch = Py_UNICODE_TOLOWER(*s);
2597 if (ch != *s) {
2598 status = 1;
2599 *s = ch;
2600 }
2601 s++;
2602 }
2603
2604 return status;
2605}
2606
2607static
2608int fixswapcase(PyUnicodeObject *self)
2609{
2610 int len = self->length;
2611 Py_UNICODE *s = self->str;
2612 int status = 0;
2613
2614 while (len-- > 0) {
2615 if (Py_UNICODE_ISUPPER(*s)) {
2616 *s = Py_UNICODE_TOLOWER(*s);
2617 status = 1;
2618 } else if (Py_UNICODE_ISLOWER(*s)) {
2619 *s = Py_UNICODE_TOUPPER(*s);
2620 status = 1;
2621 }
2622 s++;
2623 }
2624
2625 return status;
2626}
2627
2628static
2629int fixcapitalize(PyUnicodeObject *self)
2630{
2631 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2632 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2633 return 1;
2634 }
2635 return 0;
2636}
2637
2638static
2639int fixtitle(PyUnicodeObject *self)
2640{
2641 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2642 register Py_UNICODE *e;
2643 int previous_is_cased;
2644
2645 /* Shortcut for single character strings */
2646 if (PyUnicode_GET_SIZE(self) == 1) {
2647 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2648 if (*p != ch) {
2649 *p = ch;
2650 return 1;
2651 }
2652 else
2653 return 0;
2654 }
2655
2656 e = p + PyUnicode_GET_SIZE(self);
2657 previous_is_cased = 0;
2658 for (; p < e; p++) {
2659 register const Py_UNICODE ch = *p;
2660
2661 if (previous_is_cased)
2662 *p = Py_UNICODE_TOLOWER(ch);
2663 else
2664 *p = Py_UNICODE_TOTITLE(ch);
2665
2666 if (Py_UNICODE_ISLOWER(ch) ||
2667 Py_UNICODE_ISUPPER(ch) ||
2668 Py_UNICODE_ISTITLE(ch))
2669 previous_is_cased = 1;
2670 else
2671 previous_is_cased = 0;
2672 }
2673 return 1;
2674}
2675
2676PyObject *PyUnicode_Join(PyObject *separator,
2677 PyObject *seq)
2678{
2679 Py_UNICODE *sep;
2680 int seplen;
2681 PyUnicodeObject *res = NULL;
2682 int reslen = 0;
2683 Py_UNICODE *p;
2684 int seqlen = 0;
2685 int sz = 100;
2686 int i;
2687
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002688 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 if (seqlen < 0 && PyErr_Occurred())
2690 return NULL;
2691
2692 if (separator == NULL) {
2693 Py_UNICODE blank = ' ';
2694 sep = &blank;
2695 seplen = 1;
2696 }
2697 else {
2698 separator = PyUnicode_FromObject(separator);
2699 if (separator == NULL)
2700 return NULL;
2701 sep = PyUnicode_AS_UNICODE(separator);
2702 seplen = PyUnicode_GET_SIZE(separator);
2703 }
2704
2705 res = _PyUnicode_New(sz);
2706 if (res == NULL)
2707 goto onError;
2708 p = PyUnicode_AS_UNICODE(res);
2709 reslen = 0;
2710
2711 for (i = 0; i < seqlen; i++) {
2712 int itemlen;
2713 PyObject *item;
2714
2715 item = PySequence_GetItem(seq, i);
2716 if (item == NULL)
2717 goto onError;
2718 if (!PyUnicode_Check(item)) {
2719 PyObject *v;
2720 v = PyUnicode_FromObject(item);
2721 Py_DECREF(item);
2722 item = v;
2723 if (item == NULL)
2724 goto onError;
2725 }
2726 itemlen = PyUnicode_GET_SIZE(item);
2727 while (reslen + itemlen + seplen >= sz) {
2728 if (_PyUnicode_Resize(res, sz*2))
2729 goto onError;
2730 sz *= 2;
2731 p = PyUnicode_AS_UNICODE(res) + reslen;
2732 }
2733 if (i > 0) {
2734 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2735 p += seplen;
2736 reslen += seplen;
2737 }
2738 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2739 p += itemlen;
2740 reslen += itemlen;
2741 Py_DECREF(item);
2742 }
2743 if (_PyUnicode_Resize(res, reslen))
2744 goto onError;
2745
2746 Py_XDECREF(separator);
2747 return (PyObject *)res;
2748
2749 onError:
2750 Py_XDECREF(separator);
2751 Py_DECREF(res);
2752 return NULL;
2753}
2754
2755static
2756PyUnicodeObject *pad(PyUnicodeObject *self,
2757 int left,
2758 int right,
2759 Py_UNICODE fill)
2760{
2761 PyUnicodeObject *u;
2762
2763 if (left < 0)
2764 left = 0;
2765 if (right < 0)
2766 right = 0;
2767
2768 if (left == 0 && right == 0) {
2769 Py_INCREF(self);
2770 return self;
2771 }
2772
2773 u = _PyUnicode_New(left + self->length + right);
2774 if (u) {
2775 if (left)
2776 Py_UNICODE_FILL(u->str, fill, left);
2777 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2778 if (right)
2779 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2780 }
2781
2782 return u;
2783}
2784
/* Append the slice data[left:right] as a new Unicode object to list.

   The expanding function must provide a PyObject *str scratch
   variable, a PyObject *list and an onError label; control jumps to
   onError if the slice cannot be created or appended.  The body is
   wrapped in do { } while (0) so the macro behaves like a single
   statement, and the arguments are parenthesized so that expressions
   can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2795
2796static
2797PyObject *split_whitespace(PyUnicodeObject *self,
2798 PyObject *list,
2799 int maxcount)
2800{
2801 register int i;
2802 register int j;
2803 int len = self->length;
2804 PyObject *str;
2805
2806 for (i = j = 0; i < len; ) {
2807 /* find a token */
2808 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2809 i++;
2810 j = i;
2811 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2812 i++;
2813 if (j < i) {
2814 if (maxcount-- <= 0)
2815 break;
2816 SPLIT_APPEND(self->str, j, i);
2817 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2818 i++;
2819 j = i;
2820 }
2821 }
2822 if (j < len) {
2823 SPLIT_APPEND(self->str, j, len);
2824 }
2825 return list;
2826
2827 onError:
2828 Py_DECREF(list);
2829 return NULL;
2830}
2831
2832PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002833 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834{
2835 register int i;
2836 register int j;
2837 int len;
2838 PyObject *list;
2839 PyObject *str;
2840 Py_UNICODE *data;
2841
2842 string = PyUnicode_FromObject(string);
2843 if (string == NULL)
2844 return NULL;
2845 data = PyUnicode_AS_UNICODE(string);
2846 len = PyUnicode_GET_SIZE(string);
2847
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 list = PyList_New(0);
2849 if (!list)
2850 goto onError;
2851
2852 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002853 int eol;
2854
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 /* Find a line and append it */
2856 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2857 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858
2859 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002860 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 if (i < len) {
2862 if (data[i] == '\r' && i + 1 < len &&
2863 data[i+1] == '\n')
2864 i += 2;
2865 else
2866 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002867 if (keepends)
2868 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 }
Guido van Rossum86662912000-04-11 15:38:46 +00002870 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 j = i;
2872 }
2873 if (j < len) {
2874 SPLIT_APPEND(data, j, len);
2875 }
2876
2877 Py_DECREF(string);
2878 return list;
2879
2880 onError:
2881 Py_DECREF(list);
2882 Py_DECREF(string);
2883 return NULL;
2884}
2885
2886static
2887PyObject *split_char(PyUnicodeObject *self,
2888 PyObject *list,
2889 Py_UNICODE ch,
2890 int maxcount)
2891{
2892 register int i;
2893 register int j;
2894 int len = self->length;
2895 PyObject *str;
2896
2897 for (i = j = 0; i < len; ) {
2898 if (self->str[i] == ch) {
2899 if (maxcount-- <= 0)
2900 break;
2901 SPLIT_APPEND(self->str, j, i);
2902 i = j = i + 1;
2903 } else
2904 i++;
2905 }
2906 if (j <= len) {
2907 SPLIT_APPEND(self->str, j, len);
2908 }
2909 return list;
2910
2911 onError:
2912 Py_DECREF(list);
2913 return NULL;
2914}
2915
2916static
2917PyObject *split_substring(PyUnicodeObject *self,
2918 PyObject *list,
2919 PyUnicodeObject *substring,
2920 int maxcount)
2921{
2922 register int i;
2923 register int j;
2924 int len = self->length;
2925 int sublen = substring->length;
2926 PyObject *str;
2927
2928 for (i = j = 0; i < len - sublen; ) {
2929 if (Py_UNICODE_MATCH(self, i, substring)) {
2930 if (maxcount-- <= 0)
2931 break;
2932 SPLIT_APPEND(self->str, j, i);
2933 i = j = i + sublen;
2934 } else
2935 i++;
2936 }
2937 if (j <= len) {
2938 SPLIT_APPEND(self->str, j, len);
2939 }
2940 return list;
2941
2942 onError:
2943 Py_DECREF(list);
2944 return NULL;
2945}
2946
2947#undef SPLIT_APPEND
2948
2949static
2950PyObject *split(PyUnicodeObject *self,
2951 PyUnicodeObject *substring,
2952 int maxcount)
2953{
2954 PyObject *list;
2955
2956 if (maxcount < 0)
2957 maxcount = INT_MAX;
2958
2959 list = PyList_New(0);
2960 if (!list)
2961 return NULL;
2962
2963 if (substring == NULL)
2964 return split_whitespace(self,list,maxcount);
2965
2966 else if (substring->length == 1)
2967 return split_char(self,list,substring->str[0],maxcount);
2968
2969 else if (substring->length == 0) {
2970 Py_DECREF(list);
2971 PyErr_SetString(PyExc_ValueError, "empty separator");
2972 return NULL;
2973 }
2974 else
2975 return split_substring(self,list,substring,maxcount);
2976}
2977
2978static
2979PyObject *strip(PyUnicodeObject *self,
2980 int left,
2981 int right)
2982{
2983 Py_UNICODE *p = self->str;
2984 int start = 0;
2985 int end = self->length;
2986
2987 if (left)
2988 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2989 start++;
2990
2991 if (right)
2992 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2993 end--;
2994
2995 if (start == 0 && end == self->length) {
2996 /* couldn't strip anything off, return original string */
2997 Py_INCREF(self);
2998 return (PyObject*) self;
2999 }
3000
3001 return (PyObject*) PyUnicode_FromUnicode(
3002 self->str + start,
3003 end - start
3004 );
3005}
3006
3007static
3008PyObject *replace(PyUnicodeObject *self,
3009 PyUnicodeObject *str1,
3010 PyUnicodeObject *str2,
3011 int maxcount)
3012{
3013 PyUnicodeObject *u;
3014
3015 if (maxcount < 0)
3016 maxcount = INT_MAX;
3017
3018 if (str1->length == 1 && str2->length == 1) {
3019 int i;
3020
3021 /* replace characters */
3022 if (!findchar(self->str, self->length, str1->str[0])) {
3023 /* nothing to replace, return original string */
3024 Py_INCREF(self);
3025 u = self;
3026 } else {
3027 Py_UNICODE u1 = str1->str[0];
3028 Py_UNICODE u2 = str2->str[0];
3029
3030 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3031 self->str,
3032 self->length
3033 );
3034 if (u)
3035 for (i = 0; i < u->length; i++)
3036 if (u->str[i] == u1) {
3037 if (--maxcount < 0)
3038 break;
3039 u->str[i] = u2;
3040 }
3041 }
3042
3043 } else {
3044 int n, i;
3045 Py_UNICODE *p;
3046
3047 /* replace strings */
3048 n = count(self, 0, self->length, str1);
3049 if (n > maxcount)
3050 n = maxcount;
3051 if (n == 0) {
3052 /* nothing to replace, return original string */
3053 Py_INCREF(self);
3054 u = self;
3055 } else {
3056 u = _PyUnicode_New(
3057 self->length + n * (str2->length - str1->length));
3058 if (u) {
3059 i = 0;
3060 p = u->str;
3061 while (i <= self->length - str1->length)
3062 if (Py_UNICODE_MATCH(self, i, str1)) {
3063 /* replace string segment */
3064 Py_UNICODE_COPY(p, str2->str, str2->length);
3065 p += str2->length;
3066 i += str1->length;
3067 if (--n <= 0) {
3068 /* copy remaining part */
3069 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3070 break;
3071 }
3072 } else
3073 *p++ = self->str[i++];
3074 }
3075 }
3076 }
3077
3078 return (PyObject *) u;
3079}
3080
3081/* --- Unicode Object Methods --------------------------------------------- */
3082
3083static char title__doc__[] =
3084"S.title() -> unicode\n\
3085\n\
3086Return a titlecased version of S, i.e. words start with title case\n\
3087characters, all remaining cased characters have lower case.";
3088
3089static PyObject*
3090unicode_title(PyUnicodeObject *self, PyObject *args)
3091{
3092 if (!PyArg_NoArgs(args))
3093 return NULL;
3094 return fixup(self, fixtitle);
3095}
3096
3097static char capitalize__doc__[] =
3098"S.capitalize() -> unicode\n\
3099\n\
3100Return a capitalized version of S, i.e. make the first character\n\
3101have upper case.";
3102
3103static PyObject*
3104unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3105{
3106 if (!PyArg_NoArgs(args))
3107 return NULL;
3108 return fixup(self, fixcapitalize);
3109}
3110
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split S at
   whitespace, capitalize every word and join the words again with a
   single blank.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int n;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, n),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, n));
        PyList_SET_ITEM(words, n, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3151
3152static char center__doc__[] =
3153"S.center(width) -> unicode\n\
3154\n\
3155Return S centered in a Unicode string of length width. Padding is done\n\
3156using spaces.";
3157
3158static PyObject *
3159unicode_center(PyUnicodeObject *self, PyObject *args)
3160{
3161 int marg, left;
3162 int width;
3163
3164 if (!PyArg_ParseTuple(args, "i:center", &width))
3165 return NULL;
3166
3167 if (self->length >= width) {
3168 Py_INCREF(self);
3169 return (PyObject*) self;
3170 }
3171
3172 marg = width - self->length;
3173 left = marg / 2 + (marg & width & 1);
3174
3175 return (PyObject*) pad(self, left, marg - left, ' ');
3176}
3177
Marc-André Lemburge5034372000-08-08 08:04:29 +00003178#if 0
3179
3180/* This code should go into some future Unicode collation support
3181 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003182 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003183
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003184/* speedy UTF-16 code point order comparison */
3185/* gleaned from: */
3186/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3187
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003188static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003189{
3190 0, 0, 0, 0, 0, 0, 0, 0,
3191 0, 0, 0, 0, 0, 0, 0, 0,
3192 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003193 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003194};
3195
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196static int
3197unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3198{
3199 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003200
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 Py_UNICODE *s1 = str1->str;
3202 Py_UNICODE *s2 = str2->str;
3203
3204 len1 = str1->length;
3205 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003206
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003208 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003209 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003210
3211 c1 = *s1++;
3212 c2 = *s2++;
3213 if (c1 > (1<<11) * 26)
3214 c1 += utf16Fixup[c1>>11];
3215 if (c2 > (1<<11) * 26)
3216 c2 += utf16Fixup[c2>>11];
3217
3218 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003219 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003220 if (diff)
3221 return (diff < 0) ? -1 : (diff != 0);
3222 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 }
3224
3225 return (len1 < len2) ? -1 : (len1 != len2);
3226}
3227
Marc-André Lemburge5034372000-08-08 08:04:29 +00003228#else
3229
3230static int
3231unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3232{
3233 register int len1, len2;
3234
3235 Py_UNICODE *s1 = str1->str;
3236 Py_UNICODE *s2 = str2->str;
3237
3238 len1 = str1->length;
3239 len2 = str2->length;
3240
3241 while (len1 > 0 && len2 > 0) {
3242 register long diff;
3243
3244 diff = (long)*s1++ - (long)*s2++;
3245 if (diff)
3246 return (diff < 0) ? -1 : (diff != 0);
3247 len1--; len2--;
3248 }
3249
3250 return (len1 < len2) ? -1 : (len1 != len2);
3251}
3252
3253#endif
3254
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255int PyUnicode_Compare(PyObject *left,
3256 PyObject *right)
3257{
3258 PyUnicodeObject *u = NULL, *v = NULL;
3259 int result;
3260
3261 /* Coerce the two arguments */
3262 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3263 if (u == NULL)
3264 goto onError;
3265 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3266 if (v == NULL)
3267 goto onError;
3268
Thomas Wouters7e474022000-07-16 12:04:32 +00003269 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 if (v == u) {
3271 Py_DECREF(u);
3272 Py_DECREF(v);
3273 return 0;
3274 }
3275
3276 result = unicode_compare(u, v);
3277
3278 Py_DECREF(u);
3279 Py_DECREF(v);
3280 return result;
3281
3282onError:
3283 Py_XDECREF(u);
3284 Py_XDECREF(v);
3285 return -1;
3286}
3287
Guido van Rossum403d68b2000-03-13 15:55:09 +00003288int PyUnicode_Contains(PyObject *container,
3289 PyObject *element)
3290{
3291 PyUnicodeObject *u = NULL, *v = NULL;
3292 int result;
3293 register const Py_UNICODE *p, *e;
3294 register Py_UNICODE ch;
3295
3296 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003297 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003298 if (v == NULL) {
3299 PyErr_SetString(PyExc_TypeError,
3300 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003301 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003302 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003303 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3304 if (u == NULL) {
3305 Py_DECREF(v);
3306 goto onError;
3307 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003308
3309 /* Check v in u */
3310 if (PyUnicode_GET_SIZE(v) != 1) {
3311 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003312 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003313 goto onError;
3314 }
3315 ch = *PyUnicode_AS_UNICODE(v);
3316 p = PyUnicode_AS_UNICODE(u);
3317 e = p + PyUnicode_GET_SIZE(u);
3318 result = 0;
3319 while (p < e) {
3320 if (*p++ == ch) {
3321 result = 1;
3322 break;
3323 }
3324 }
3325
3326 Py_DECREF(u);
3327 Py_DECREF(v);
3328 return result;
3329
3330onError:
3331 Py_XDECREF(u);
3332 Py_XDECREF(v);
3333 return -1;
3334}
3335
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336/* Concat to string or Unicode object giving a new Unicode object. */
3337
3338PyObject *PyUnicode_Concat(PyObject *left,
3339 PyObject *right)
3340{
3341 PyUnicodeObject *u = NULL, *v = NULL, *w;
3342
3343 /* Coerce the two arguments */
3344 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3345 if (u == NULL)
3346 goto onError;
3347 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3348 if (v == NULL)
3349 goto onError;
3350
3351 /* Shortcuts */
3352 if (v == unicode_empty) {
3353 Py_DECREF(v);
3354 return (PyObject *)u;
3355 }
3356 if (u == unicode_empty) {
3357 Py_DECREF(u);
3358 return (PyObject *)v;
3359 }
3360
3361 /* Concat the two Unicode strings */
3362 w = _PyUnicode_New(u->length + v->length);
3363 if (w == NULL)
3364 goto onError;
3365 Py_UNICODE_COPY(w->str, u->str, u->length);
3366 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3367
3368 Py_DECREF(u);
3369 Py_DECREF(v);
3370 return (PyObject *)w;
3371
3372onError:
3373 Py_XDECREF(u);
3374 Py_XDECREF(v);
3375 return NULL;
3376}
3377
3378static char count__doc__[] =
3379"S.count(sub[, start[, end]]) -> int\n\
3380\n\
3381Return the number of occurrences of substring sub in Unicode string\n\
3382S[start:end]. Optional arguments start and end are\n\
3383interpreted as in slice notation.";
3384
3385static PyObject *
3386unicode_count(PyUnicodeObject *self, PyObject *args)
3387{
3388 PyUnicodeObject *substring;
3389 int start = 0;
3390 int end = INT_MAX;
3391 PyObject *result;
3392
Guido van Rossumb8872e62000-05-09 14:14:27 +00003393 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3394 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 return NULL;
3396
3397 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3398 (PyObject *)substring);
3399 if (substring == NULL)
3400 return NULL;
3401
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402 if (start < 0)
3403 start += self->length;
3404 if (start < 0)
3405 start = 0;
3406 if (end > self->length)
3407 end = self->length;
3408 if (end < 0)
3409 end += self->length;
3410 if (end < 0)
3411 end = 0;
3412
3413 result = PyInt_FromLong((long) count(self, start, end, substring));
3414
3415 Py_DECREF(substring);
3416 return result;
3417}
3418
3419static char encode__doc__[] =
3420"S.encode([encoding[,errors]]) -> string\n\
3421\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003422Return an encoded string version of S. Default encoding is the current\n\
3423default string encoding. errors may be given to set a different error\n\
3424handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3425a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426
3427static PyObject *
3428unicode_encode(PyUnicodeObject *self, PyObject *args)
3429{
3430 char *encoding = NULL;
3431 char *errors = NULL;
3432 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3433 return NULL;
3434 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3435}
3436
3437static char expandtabs__doc__[] =
3438"S.expandtabs([tabsize]) -> unicode\n\
3439\n\
3440Return a copy of S where all tab characters are expanded using spaces.\n\
3441If tabsize is not given, a tab size of 8 characters is assumed.";
3442
3443static PyObject*
3444unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3445{
3446 Py_UNICODE *e;
3447 Py_UNICODE *p;
3448 Py_UNICODE *q;
3449 int i, j;
3450 PyUnicodeObject *u;
3451 int tabsize = 8;
3452
3453 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3454 return NULL;
3455
Thomas Wouters7e474022000-07-16 12:04:32 +00003456 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 i = j = 0;
3458 e = self->str + self->length;
3459 for (p = self->str; p < e; p++)
3460 if (*p == '\t') {
3461 if (tabsize > 0)
3462 j += tabsize - (j % tabsize);
3463 }
3464 else {
3465 j++;
3466 if (*p == '\n' || *p == '\r') {
3467 i += j;
3468 j = 0;
3469 }
3470 }
3471
3472 /* Second pass: create output string and fill it */
3473 u = _PyUnicode_New(i + j);
3474 if (!u)
3475 return NULL;
3476
3477 j = 0;
3478 q = u->str;
3479
3480 for (p = self->str; p < e; p++)
3481 if (*p == '\t') {
3482 if (tabsize > 0) {
3483 i = tabsize - (j % tabsize);
3484 j += i;
3485 while (i--)
3486 *q++ = ' ';
3487 }
3488 }
3489 else {
3490 j++;
3491 *q++ = *p;
3492 if (*p == '\n' || *p == '\r')
3493 j = 0;
3494 }
3495
3496 return (PyObject*) u;
3497}
3498
3499static char find__doc__[] =
3500"S.find(sub [,start [,end]]) -> int\n\
3501\n\
3502Return the lowest index in S where substring sub is found,\n\
3503such that sub is contained within s[start,end]. Optional\n\
3504arguments start and end are interpreted as in slice notation.\n\
3505\n\
3506Return -1 on failure.";
3507
3508static PyObject *
3509unicode_find(PyUnicodeObject *self, PyObject *args)
3510{
3511 PyUnicodeObject *substring;
3512 int start = 0;
3513 int end = INT_MAX;
3514 PyObject *result;
3515
Guido van Rossumb8872e62000-05-09 14:14:27 +00003516 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3517 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 return NULL;
3519 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3520 (PyObject *)substring);
3521 if (substring == NULL)
3522 return NULL;
3523
3524 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3525
3526 Py_DECREF(substring);
3527 return result;
3528}
3529
3530static PyObject *
3531unicode_getitem(PyUnicodeObject *self, int index)
3532{
3533 if (index < 0 || index >= self->length) {
3534 PyErr_SetString(PyExc_IndexError, "string index out of range");
3535 return NULL;
3536 }
3537
3538 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3539}
3540
3541static long
3542unicode_hash(PyUnicodeObject *self)
3543{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003544 /* Since Unicode objects compare equal to their ASCII string
3545 counterparts, they should use the individual character values
3546 as basis for their hash value. This is needed to assure that
3547 strings and Unicode objects behave in the same way as
3548 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549
Fredrik Lundhdde61642000-07-10 18:27:47 +00003550 register int len;
3551 register Py_UNICODE *p;
3552 register long x;
3553
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 if (self->hash != -1)
3555 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003556 len = PyUnicode_GET_SIZE(self);
3557 p = PyUnicode_AS_UNICODE(self);
3558 x = *p << 7;
3559 while (--len >= 0)
3560 x = (1000003*x) ^ *p++;
3561 x ^= PyUnicode_GET_SIZE(self);
3562 if (x == -1)
3563 x = -2;
3564 self->hash = x;
3565 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566}
3567
3568static char index__doc__[] =
3569"S.index(sub [,start [,end]]) -> int\n\
3570\n\
3571Like S.find() but raise ValueError when the substring is not found.";
3572
3573static PyObject *
3574unicode_index(PyUnicodeObject *self, PyObject *args)
3575{
3576 int result;
3577 PyUnicodeObject *substring;
3578 int start = 0;
3579 int end = INT_MAX;
3580
Guido van Rossumb8872e62000-05-09 14:14:27 +00003581 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3582 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 return NULL;
3584
3585 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3586 (PyObject *)substring);
3587 if (substring == NULL)
3588 return NULL;
3589
3590 result = findstring(self, substring, start, end, 1);
3591
3592 Py_DECREF(substring);
3593 if (result < 0) {
3594 PyErr_SetString(PyExc_ValueError, "substring not found");
3595 return NULL;
3596 }
3597 return PyInt_FromLong(result);
3598}
3599
3600static char islower__doc__[] =
3601"S.islower() -> int\n\
3602\n\
3603Return 1 if all cased characters in S are lowercase and there is\n\
3604at least one cased character in S, 0 otherwise.";
3605
3606static PyObject*
3607unicode_islower(PyUnicodeObject *self, PyObject *args)
3608{
3609 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3610 register const Py_UNICODE *e;
3611 int cased;
3612
3613 if (!PyArg_NoArgs(args))
3614 return NULL;
3615
3616 /* Shortcut for single character strings */
3617 if (PyUnicode_GET_SIZE(self) == 1)
3618 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3619
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003620 /* Special case for empty strings */
3621 if (PyString_GET_SIZE(self) == 0)
3622 return PyInt_FromLong(0);
3623
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 e = p + PyUnicode_GET_SIZE(self);
3625 cased = 0;
3626 for (; p < e; p++) {
3627 register const Py_UNICODE ch = *p;
3628
3629 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3630 return PyInt_FromLong(0);
3631 else if (!cased && Py_UNICODE_ISLOWER(ch))
3632 cased = 1;
3633 }
3634 return PyInt_FromLong(cased);
3635}
3636
3637static char isupper__doc__[] =
3638"S.isupper() -> int\n\
3639\n\
3640Return 1 if all cased characters in S are uppercase and there is\n\
3641at least one cased character in S, 0 otherwise.";
3642
3643static PyObject*
3644unicode_isupper(PyUnicodeObject *self, PyObject *args)
3645{
3646 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3647 register const Py_UNICODE *e;
3648 int cased;
3649
3650 if (!PyArg_NoArgs(args))
3651 return NULL;
3652
3653 /* Shortcut for single character strings */
3654 if (PyUnicode_GET_SIZE(self) == 1)
3655 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3656
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003657 /* Special case for empty strings */
3658 if (PyString_GET_SIZE(self) == 0)
3659 return PyInt_FromLong(0);
3660
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 e = p + PyUnicode_GET_SIZE(self);
3662 cased = 0;
3663 for (; p < e; p++) {
3664 register const Py_UNICODE ch = *p;
3665
3666 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3667 return PyInt_FromLong(0);
3668 else if (!cased && Py_UNICODE_ISUPPER(ch))
3669 cased = 1;
3670 }
3671 return PyInt_FromLong(cased);
3672}
3673
3674static char istitle__doc__[] =
3675"S.istitle() -> int\n\
3676\n\
3677Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3678may only follow uncased characters and lowercase characters only cased\n\
3679ones. Return 0 otherwise.";
3680
3681static PyObject*
3682unicode_istitle(PyUnicodeObject *self, PyObject *args)
3683{
3684 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3685 register const Py_UNICODE *e;
3686 int cased, previous_is_cased;
3687
3688 if (!PyArg_NoArgs(args))
3689 return NULL;
3690
3691 /* Shortcut for single character strings */
3692 if (PyUnicode_GET_SIZE(self) == 1)
3693 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3694 (Py_UNICODE_ISUPPER(*p) != 0));
3695
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003696 /* Special case for empty strings */
3697 if (PyString_GET_SIZE(self) == 0)
3698 return PyInt_FromLong(0);
3699
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 e = p + PyUnicode_GET_SIZE(self);
3701 cased = 0;
3702 previous_is_cased = 0;
3703 for (; p < e; p++) {
3704 register const Py_UNICODE ch = *p;
3705
3706 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3707 if (previous_is_cased)
3708 return PyInt_FromLong(0);
3709 previous_is_cased = 1;
3710 cased = 1;
3711 }
3712 else if (Py_UNICODE_ISLOWER(ch)) {
3713 if (!previous_is_cased)
3714 return PyInt_FromLong(0);
3715 previous_is_cased = 1;
3716 cased = 1;
3717 }
3718 else
3719 previous_is_cased = 0;
3720 }
3721 return PyInt_FromLong(cased);
3722}
3723
3724static char isspace__doc__[] =
3725"S.isspace() -> int\n\
3726\n\
3727Return 1 if there are only whitespace characters in S,\n\
37280 otherwise.";
3729
3730static PyObject*
3731unicode_isspace(PyUnicodeObject *self, PyObject *args)
3732{
3733 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3734 register const Py_UNICODE *e;
3735
3736 if (!PyArg_NoArgs(args))
3737 return NULL;
3738
3739 /* Shortcut for single character strings */
3740 if (PyUnicode_GET_SIZE(self) == 1 &&
3741 Py_UNICODE_ISSPACE(*p))
3742 return PyInt_FromLong(1);
3743
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003744 /* Special case for empty strings */
3745 if (PyString_GET_SIZE(self) == 0)
3746 return PyInt_FromLong(0);
3747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748 e = p + PyUnicode_GET_SIZE(self);
3749 for (; p < e; p++) {
3750 if (!Py_UNICODE_ISSPACE(*p))
3751 return PyInt_FromLong(0);
3752 }
3753 return PyInt_FromLong(1);
3754}
3755
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003756static char isalpha__doc__[] =
3757"S.isalpha() -> int\n\
3758\n\
3759Return 1 if all characters in S are alphabetic\n\
3760and there is at least one character in S, 0 otherwise.";
3761
3762static PyObject*
3763unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3764{
3765 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3766 register const Py_UNICODE *e;
3767
3768 if (!PyArg_NoArgs(args))
3769 return NULL;
3770
3771 /* Shortcut for single character strings */
3772 if (PyUnicode_GET_SIZE(self) == 1 &&
3773 Py_UNICODE_ISALPHA(*p))
3774 return PyInt_FromLong(1);
3775
3776 /* Special case for empty strings */
3777 if (PyString_GET_SIZE(self) == 0)
3778 return PyInt_FromLong(0);
3779
3780 e = p + PyUnicode_GET_SIZE(self);
3781 for (; p < e; p++) {
3782 if (!Py_UNICODE_ISALPHA(*p))
3783 return PyInt_FromLong(0);
3784 }
3785 return PyInt_FromLong(1);
3786}
3787
3788static char isalnum__doc__[] =
3789"S.isalnum() -> int\n\
3790\n\
3791Return 1 if all characters in S are alphanumeric\n\
3792and there is at least one character in S, 0 otherwise.";
3793
3794static PyObject*
3795unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3796{
3797 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3798 register const Py_UNICODE *e;
3799
3800 if (!PyArg_NoArgs(args))
3801 return NULL;
3802
3803 /* Shortcut for single character strings */
3804 if (PyUnicode_GET_SIZE(self) == 1 &&
3805 Py_UNICODE_ISALNUM(*p))
3806 return PyInt_FromLong(1);
3807
3808 /* Special case for empty strings */
3809 if (PyString_GET_SIZE(self) == 0)
3810 return PyInt_FromLong(0);
3811
3812 e = p + PyUnicode_GET_SIZE(self);
3813 for (; p < e; p++) {
3814 if (!Py_UNICODE_ISALNUM(*p))
3815 return PyInt_FromLong(0);
3816 }
3817 return PyInt_FromLong(1);
3818}
3819
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820static char isdecimal__doc__[] =
3821"S.isdecimal() -> int\n\
3822\n\
3823Return 1 if there are only decimal characters in S,\n\
38240 otherwise.";
3825
3826static PyObject*
3827unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3828{
3829 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3830 register const Py_UNICODE *e;
3831
3832 if (!PyArg_NoArgs(args))
3833 return NULL;
3834
3835 /* Shortcut for single character strings */
3836 if (PyUnicode_GET_SIZE(self) == 1 &&
3837 Py_UNICODE_ISDECIMAL(*p))
3838 return PyInt_FromLong(1);
3839
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003840 /* Special case for empty strings */
3841 if (PyString_GET_SIZE(self) == 0)
3842 return PyInt_FromLong(0);
3843
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 e = p + PyUnicode_GET_SIZE(self);
3845 for (; p < e; p++) {
3846 if (!Py_UNICODE_ISDECIMAL(*p))
3847 return PyInt_FromLong(0);
3848 }
3849 return PyInt_FromLong(1);
3850}
3851
3852static char isdigit__doc__[] =
3853"S.isdigit() -> int\n\
3854\n\
3855Return 1 if there are only digit characters in S,\n\
38560 otherwise.";
3857
3858static PyObject*
3859unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3860{
3861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3862 register const Py_UNICODE *e;
3863
3864 if (!PyArg_NoArgs(args))
3865 return NULL;
3866
3867 /* Shortcut for single character strings */
3868 if (PyUnicode_GET_SIZE(self) == 1 &&
3869 Py_UNICODE_ISDIGIT(*p))
3870 return PyInt_FromLong(1);
3871
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003872 /* Special case for empty strings */
3873 if (PyString_GET_SIZE(self) == 0)
3874 return PyInt_FromLong(0);
3875
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 e = p + PyUnicode_GET_SIZE(self);
3877 for (; p < e; p++) {
3878 if (!Py_UNICODE_ISDIGIT(*p))
3879 return PyInt_FromLong(0);
3880 }
3881 return PyInt_FromLong(1);
3882}
3883
3884static char isnumeric__doc__[] =
3885"S.isnumeric() -> int\n\
3886\n\
3887Return 1 if there are only numeric characters in S,\n\
38880 otherwise.";
3889
3890static PyObject*
3891unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3892{
3893 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3894 register const Py_UNICODE *e;
3895
3896 if (!PyArg_NoArgs(args))
3897 return NULL;
3898
3899 /* Shortcut for single character strings */
3900 if (PyUnicode_GET_SIZE(self) == 1 &&
3901 Py_UNICODE_ISNUMERIC(*p))
3902 return PyInt_FromLong(1);
3903
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003904 /* Special case for empty strings */
3905 if (PyString_GET_SIZE(self) == 0)
3906 return PyInt_FromLong(0);
3907
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 e = p + PyUnicode_GET_SIZE(self);
3909 for (; p < e; p++) {
3910 if (!Py_UNICODE_ISNUMERIC(*p))
3911 return PyInt_FromLong(0);
3912 }
3913 return PyInt_FromLong(1);
3914}
3915
3916static char join__doc__[] =
3917"S.join(sequence) -> unicode\n\
3918\n\
3919Return a string which is the concatenation of the strings in the\n\
3920sequence. The separator between elements is S.";
3921
3922static PyObject*
3923unicode_join(PyUnicodeObject *self, PyObject *args)
3924{
3925 PyObject *data;
3926 if (!PyArg_ParseTuple(args, "O:join", &data))
3927 return NULL;
3928
3929 return PyUnicode_Join((PyObject *)self, data);
3930}
3931
3932static int
3933unicode_length(PyUnicodeObject *self)
3934{
3935 return self->length;
3936}
3937
3938static char ljust__doc__[] =
3939"S.ljust(width) -> unicode\n\
3940\n\
3941Return S left justified in a Unicode string of length width. Padding is\n\
3942done using spaces.";
3943
3944static PyObject *
3945unicode_ljust(PyUnicodeObject *self, PyObject *args)
3946{
3947 int width;
3948 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3949 return NULL;
3950
3951 if (self->length >= width) {
3952 Py_INCREF(self);
3953 return (PyObject*) self;
3954 }
3955
3956 return (PyObject*) pad(self, 0, width - self->length, ' ');
3957}
3958
3959static char lower__doc__[] =
3960"S.lower() -> unicode\n\
3961\n\
3962Return a copy of the string S converted to lowercase.";
3963
3964static PyObject*
3965unicode_lower(PyUnicodeObject *self, PyObject *args)
3966{
3967 if (!PyArg_NoArgs(args))
3968 return NULL;
3969 return fixup(self, fixlower);
3970}
3971
3972static char lstrip__doc__[] =
3973"S.lstrip() -> unicode\n\
3974\n\
3975Return a copy of the string S with leading whitespace removed.";
3976
3977static PyObject *
3978unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3979{
3980 if (!PyArg_NoArgs(args))
3981 return NULL;
3982 return strip(self, 1, 0);
3983}
3984
3985static PyObject*
3986unicode_repeat(PyUnicodeObject *str, int len)
3987{
3988 PyUnicodeObject *u;
3989 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003990 int nchars;
3991 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992
3993 if (len < 0)
3994 len = 0;
3995
3996 if (len == 1) {
3997 /* no repeat, return original string */
3998 Py_INCREF(str);
3999 return (PyObject*) str;
4000 }
Tim Peters8f422462000-09-09 06:13:41 +00004001
4002 /* ensure # of chars needed doesn't overflow int and # of bytes
4003 * needed doesn't overflow size_t
4004 */
4005 nchars = len * str->length;
4006 if (len && nchars / len != str->length) {
4007 PyErr_SetString(PyExc_OverflowError,
4008 "repeated string is too long");
4009 return NULL;
4010 }
4011 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4012 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4013 PyErr_SetString(PyExc_OverflowError,
4014 "repeated string is too long");
4015 return NULL;
4016 }
4017 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 if (!u)
4019 return NULL;
4020
4021 p = u->str;
4022
4023 while (len-- > 0) {
4024 Py_UNICODE_COPY(p, str->str, str->length);
4025 p += str->length;
4026 }
4027
4028 return (PyObject*) u;
4029}
4030
4031PyObject *PyUnicode_Replace(PyObject *obj,
4032 PyObject *subobj,
4033 PyObject *replobj,
4034 int maxcount)
4035{
4036 PyObject *self;
4037 PyObject *str1;
4038 PyObject *str2;
4039 PyObject *result;
4040
4041 self = PyUnicode_FromObject(obj);
4042 if (self == NULL)
4043 return NULL;
4044 str1 = PyUnicode_FromObject(subobj);
4045 if (str1 == NULL) {
4046 Py_DECREF(self);
4047 return NULL;
4048 }
4049 str2 = PyUnicode_FromObject(replobj);
4050 if (str2 == NULL) {
4051 Py_DECREF(self);
4052 Py_DECREF(str1);
4053 return NULL;
4054 }
4055 result = replace((PyUnicodeObject *)self,
4056 (PyUnicodeObject *)str1,
4057 (PyUnicodeObject *)str2,
4058 maxcount);
4059 Py_DECREF(self);
4060 Py_DECREF(str1);
4061 Py_DECREF(str2);
4062 return result;
4063}
4064
4065static char replace__doc__[] =
4066"S.replace (old, new[, maxsplit]) -> unicode\n\
4067\n\
4068Return a copy of S with all occurrences of substring\n\
4069old replaced by new. If the optional argument maxsplit is\n\
4070given, only the first maxsplit occurrences are replaced.";
4071
4072static PyObject*
4073unicode_replace(PyUnicodeObject *self, PyObject *args)
4074{
4075 PyUnicodeObject *str1;
4076 PyUnicodeObject *str2;
4077 int maxcount = -1;
4078 PyObject *result;
4079
4080 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4081 return NULL;
4082 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4083 if (str1 == NULL)
4084 return NULL;
4085 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4086 if (str2 == NULL)
4087 return NULL;
4088
4089 result = replace(self, str1, str2, maxcount);
4090
4091 Py_DECREF(str1);
4092 Py_DECREF(str2);
4093 return result;
4094}
4095
4096static
4097PyObject *unicode_repr(PyObject *unicode)
4098{
4099 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4100 PyUnicode_GET_SIZE(unicode),
4101 1);
4102}
4103
4104static char rfind__doc__[] =
4105"S.rfind(sub [,start [,end]]) -> int\n\
4106\n\
4107Return the highest index in S where substring sub is found,\n\
4108such that sub is contained within s[start,end]. Optional\n\
4109arguments start and end are interpreted as in slice notation.\n\
4110\n\
4111Return -1 on failure.";
4112
4113static PyObject *
4114unicode_rfind(PyUnicodeObject *self, PyObject *args)
4115{
4116 PyUnicodeObject *substring;
4117 int start = 0;
4118 int end = INT_MAX;
4119 PyObject *result;
4120
Guido van Rossumb8872e62000-05-09 14:14:27 +00004121 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4122 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 return NULL;
4124 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4125 (PyObject *)substring);
4126 if (substring == NULL)
4127 return NULL;
4128
4129 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4130
4131 Py_DECREF(substring);
4132 return result;
4133}
4134
4135static char rindex__doc__[] =
4136"S.rindex(sub [,start [,end]]) -> int\n\
4137\n\
4138Like S.rfind() but raise ValueError when the substring is not found.";
4139
4140static PyObject *
4141unicode_rindex(PyUnicodeObject *self, PyObject *args)
4142{
4143 int result;
4144 PyUnicodeObject *substring;
4145 int start = 0;
4146 int end = INT_MAX;
4147
Guido van Rossumb8872e62000-05-09 14:14:27 +00004148 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4149 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 return NULL;
4151 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4152 (PyObject *)substring);
4153 if (substring == NULL)
4154 return NULL;
4155
4156 result = findstring(self, substring, start, end, -1);
4157
4158 Py_DECREF(substring);
4159 if (result < 0) {
4160 PyErr_SetString(PyExc_ValueError, "substring not found");
4161 return NULL;
4162 }
4163 return PyInt_FromLong(result);
4164}
4165
4166static char rjust__doc__[] =
4167"S.rjust(width) -> unicode\n\
4168\n\
4169Return S right justified in a Unicode string of length width. Padding is\n\
4170done using spaces.";
4171
4172static PyObject *
4173unicode_rjust(PyUnicodeObject *self, PyObject *args)
4174{
4175 int width;
4176 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4177 return NULL;
4178
4179 if (self->length >= width) {
4180 Py_INCREF(self);
4181 return (PyObject*) self;
4182 }
4183
4184 return (PyObject*) pad(self, width - self->length, 0, ' ');
4185}
4186
4187static char rstrip__doc__[] =
4188"S.rstrip() -> unicode\n\
4189\n\
4190Return a copy of the string S with trailing whitespace removed.";
4191
4192static PyObject *
4193unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4194{
4195 if (!PyArg_NoArgs(args))
4196 return NULL;
4197 return strip(self, 0, 1);
4198}
4199
4200static PyObject*
4201unicode_slice(PyUnicodeObject *self, int start, int end)
4202{
4203 /* standard clamping */
4204 if (start < 0)
4205 start = 0;
4206 if (end < 0)
4207 end = 0;
4208 if (end > self->length)
4209 end = self->length;
4210 if (start == 0 && end == self->length) {
4211 /* full slice, return original string */
4212 Py_INCREF(self);
4213 return (PyObject*) self;
4214 }
4215 if (start > end)
4216 start = end;
4217 /* copy slice */
4218 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4219 end - start);
4220}
4221
4222PyObject *PyUnicode_Split(PyObject *s,
4223 PyObject *sep,
4224 int maxsplit)
4225{
4226 PyObject *result;
4227
4228 s = PyUnicode_FromObject(s);
4229 if (s == NULL)
4230 return NULL;
4231 if (sep != NULL) {
4232 sep = PyUnicode_FromObject(sep);
4233 if (sep == NULL) {
4234 Py_DECREF(s);
4235 return NULL;
4236 }
4237 }
4238
4239 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4240
4241 Py_DECREF(s);
4242 Py_XDECREF(sep);
4243 return result;
4244}
4245
4246static char split__doc__[] =
4247"S.split([sep [,maxsplit]]) -> list of strings\n\
4248\n\
4249Return a list of the words in S, using sep as the\n\
4250delimiter string. If maxsplit is given, at most maxsplit\n\
4251splits are done. If sep is not specified, any whitespace string\n\
4252is a separator.";
4253
4254static PyObject*
4255unicode_split(PyUnicodeObject *self, PyObject *args)
4256{
4257 PyObject *substring = Py_None;
4258 int maxcount = -1;
4259
4260 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4261 return NULL;
4262
4263 if (substring == Py_None)
4264 return split(self, NULL, maxcount);
4265 else if (PyUnicode_Check(substring))
4266 return split(self, (PyUnicodeObject *)substring, maxcount);
4267 else
4268 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4269}
4270
4271static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004272"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273\n\
4274Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004275Line breaks are not included in the resulting list unless keepends\n\
4276is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277
4278static PyObject*
4279unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4280{
Guido van Rossum86662912000-04-11 15:38:46 +00004281 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282
Guido van Rossum86662912000-04-11 15:38:46 +00004283 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 return NULL;
4285
Guido van Rossum86662912000-04-11 15:38:46 +00004286 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287}
4288
4289static
4290PyObject *unicode_str(PyUnicodeObject *self)
4291{
Fred Drakee4315f52000-05-09 19:53:39 +00004292 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293}
4294
4295static char strip__doc__[] =
4296"S.strip() -> unicode\n\
4297\n\
4298Return a copy of S with leading and trailing whitespace removed.";
4299
4300static PyObject *
4301unicode_strip(PyUnicodeObject *self, PyObject *args)
4302{
4303 if (!PyArg_NoArgs(args))
4304 return NULL;
4305 return strip(self, 1, 1);
4306}
4307
4308static char swapcase__doc__[] =
4309"S.swapcase() -> unicode\n\
4310\n\
4311Return a copy of S with uppercase characters converted to lowercase\n\
4312and vice versa.";
4313
4314static PyObject*
4315unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4316{
4317 if (!PyArg_NoArgs(args))
4318 return NULL;
4319 return fixup(self, fixswapcase);
4320}
4321
4322static char translate__doc__[] =
4323"S.translate(table) -> unicode\n\
4324\n\
4325Return a copy of the string S, where all characters have been mapped\n\
4326through the given translation table, which must be a mapping of\n\
4327Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4328are left untouched. Characters mapped to None are deleted.";
4329
4330static PyObject*
4331unicode_translate(PyUnicodeObject *self, PyObject *args)
4332{
4333 PyObject *table;
4334
4335 if (!PyArg_ParseTuple(args, "O:translate", &table))
4336 return NULL;
4337 return PyUnicode_TranslateCharmap(self->str,
4338 self->length,
4339 table,
4340 "ignore");
4341}
4342
4343static char upper__doc__[] =
4344"S.upper() -> unicode\n\
4345\n\
4346Return a copy of S converted to uppercase.";
4347
4348static PyObject*
4349unicode_upper(PyUnicodeObject *self, PyObject *args)
4350{
4351 if (!PyArg_NoArgs(args))
4352 return NULL;
4353 return fixup(self, fixupper);
4354}
4355
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method wrapper for S.zfill(width) (currently compiled out).

   Returns self (with a new reference) when it is already at least
   `width` characters long.  Otherwise pads on the left with '0' through
   pad() and, if the original string began with '+' or '-', moves that
   sign to the front of the padded result.  Returns NULL on argument
   errors or when pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    int width;
    PyUnicodeObject *u;

    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* Do not touch u->str if pad() could not build the result
       (assumes pad() reports failure by returning NULL). */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4391
#if 0
/* Debugging aid (currently compiled out): return the value of
   unicode_freelist_size as a Python int.  Accepts no arguments. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4401
4402static char startswith__doc__[] =
4403"S.startswith(prefix[, start[, end]]) -> int\n\
4404\n\
4405Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4406optional start, test S beginning at that position. With optional end, stop\n\
4407comparing S at that position.";
4408
4409static PyObject *
4410unicode_startswith(PyUnicodeObject *self,
4411 PyObject *args)
4412{
4413 PyUnicodeObject *substring;
4414 int start = 0;
4415 int end = INT_MAX;
4416 PyObject *result;
4417
Guido van Rossumb8872e62000-05-09 14:14:27 +00004418 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4419 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 return NULL;
4421 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4422 (PyObject *)substring);
4423 if (substring == NULL)
4424 return NULL;
4425
4426 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4427
4428 Py_DECREF(substring);
4429 return result;
4430}
4431
4432
4433static char endswith__doc__[] =
4434"S.endswith(suffix[, start[, end]]) -> int\n\
4435\n\
4436Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4437optional start, test S beginning at that position. With optional end, stop\n\
4438comparing S at that position.";
4439
4440static PyObject *
4441unicode_endswith(PyUnicodeObject *self,
4442 PyObject *args)
4443{
4444 PyUnicodeObject *substring;
4445 int start = 0;
4446 int end = INT_MAX;
4447 PyObject *result;
4448
Guido van Rossumb8872e62000-05-09 14:14:27 +00004449 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4450 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 return NULL;
4452 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4453 (PyObject *)substring);
4454 if (substring == NULL)
4455 return NULL;
4456
4457 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4458
4459 Py_DECREF(substring);
4460 return result;
4461}
4462
4463
/* Method table for unicode objects, searched by unicode_getattr().

   Each entry is {name, C function, flag, doc string}.  For the wrappers
   visible in this file, entries flagged 1 parse their arguments with
   PyArg_ParseTuple() and entries flagged 0 use PyArg_NoArgs().  The
   table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled together with the compiled-out unicode_zfill() above. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4516
4517static PyObject *
4518unicode_getattr(PyUnicodeObject *self, char *name)
4519{
4520 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4521}
4522
/* Sequence protocol slots for unicode objects (installed as
   tp_as_sequence in PyUnicode_Type).  The item and slice assignment
   slots are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4533
4534static int
4535unicode_buffer_getreadbuf(PyUnicodeObject *self,
4536 int index,
4537 const void **ptr)
4538{
4539 if (index != 0) {
4540 PyErr_SetString(PyExc_SystemError,
4541 "accessing non-existent unicode segment");
4542 return -1;
4543 }
4544 *ptr = (void *) self->str;
4545 return PyUnicode_GET_DATA_SIZE(self);
4546}
4547
4548static int
4549unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4550 const void **ptr)
4551{
4552 PyErr_SetString(PyExc_TypeError,
4553 "cannot use unicode as modifyable buffer");
4554 return -1;
4555}
4556
4557static int
4558unicode_buffer_getsegcount(PyUnicodeObject *self,
4559 int *lenp)
4560{
4561 if (lenp)
4562 *lenp = PyUnicode_GET_DATA_SIZE(self);
4563 return 1;
4564}
4565
4566static int
4567unicode_buffer_getcharbuf(PyUnicodeObject *self,
4568 int index,
4569 const void **ptr)
4570{
4571 PyObject *str;
4572
4573 if (index != 0) {
4574 PyErr_SetString(PyExc_SystemError,
4575 "accessing non-existent unicode segment");
4576 return -1;
4577 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004578 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 if (str == NULL)
4580 return -1;
4581 *ptr = (void *) PyString_AS_STRING(str);
4582 return PyString_GET_SIZE(str);
4583}
4584
4585/* Helpers for PyUnicode_Format() */
4586
4587static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004588getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589{
4590 int argidx = *p_argidx;
4591 if (argidx < arglen) {
4592 (*p_argidx)++;
4593 if (arglen < 0)
4594 return args;
4595 else
4596 return PyTuple_GetItem(args, argidx);
4597 }
4598 PyErr_SetString(PyExc_TypeError,
4599 "not enough arguments for format string");
4600 return NULL;
4601}
4602
/* Flag bits collected by PyUnicode_Format() while it scans the flag
   characters of a conversion specifier. */
#define F_LJUST (1<<0)  /* '-' flag, also set by a negative '*' width */
#define F_SIGN (1<<1)   /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT (1<<3)    /* '#' flag */
#define F_ZERO (1<<4)   /* '0' flag */
4608
4609static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611{
4612 register int i;
4613 int len;
4614 va_list va;
4615 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617
4618 /* First, format the string as char array, then expand to Py_UNICODE
4619 array. */
4620 charbuffer = (char *)buffer;
4621 len = vsprintf(charbuffer, format, va);
4622 for (i = len - 1; i >= 0; i--)
4623 buffer[i] = (Py_UNICODE) charbuffer[i];
4624
4625 va_end(va);
4626 return len;
4627}
4628
4629static int
4630formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004631 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 int flags,
4633 int prec,
4634 int type,
4635 PyObject *v)
4636{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004637 /* fmt = '%#.' + `prec` + `type`
4638 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639 char fmt[20];
4640 double x;
4641
4642 x = PyFloat_AsDouble(v);
4643 if (x == -1.0 && PyErr_Occurred())
4644 return -1;
4645 if (prec < 0)
4646 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4648 type = 'g';
4649 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004650 /* worst case length calc to ensure no buffer overrun:
4651 fmt = %#.<prec>g
4652 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4653 for any double rep.)
4654 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4655 If prec=0 the effective precision is 1 (the leading digit is
4656 always given), therefore increase by one to 10+prec. */
4657 if (buflen <= (size_t)10 + (size_t)prec) {
4658 PyErr_SetString(PyExc_OverflowError,
4659 "formatted float is too long (precision too long?)");
4660 return -1;
4661 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 return usprintf(buf, fmt, x);
4663}
4664
Tim Peters38fd5b62000-09-21 05:43:11 +00004665static PyObject*
4666formatlong(PyObject *val, int flags, int prec, int type)
4667{
4668 char *buf;
4669 int i, len;
4670 PyObject *str; /* temporary string object. */
4671 PyUnicodeObject *result;
4672
4673 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4674 if (!str)
4675 return NULL;
4676 result = _PyUnicode_New(len);
4677 for (i = 0; i < len; i++)
4678 result->str[i] = buf[i];
4679 result->str[len] = 0;
4680 Py_DECREF(str);
4681 return (PyObject*)result;
4682}
4683
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684static int
4685formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004686 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 int flags,
4688 int prec,
4689 int type,
4690 PyObject *v)
4691{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004692 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004693 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4694 + 1 + 1 = 24*/
4695 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 long x;
4697
4698 x = PyInt_AsLong(v);
4699 if (x == -1 && PyErr_Occurred())
4700 return -1;
4701 if (prec < 0)
4702 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004703 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4704 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4705 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4706 PyErr_SetString(PyExc_OverflowError,
4707 "formatted integer is too long (precision too long?)");
4708 return -1;
4709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4711 return usprintf(buf, fmt, x);
4712}
4713
4714static int
4715formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004716 size_t buflen,
4717 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004719 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004720 if (PyUnicode_Check(v)) {
4721 if (PyUnicode_GET_SIZE(v) != 1)
4722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004726 else if (PyString_Check(v)) {
4727 if (PyString_GET_SIZE(v) != 1)
4728 goto onError;
4729 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731
4732 else {
4733 /* Integer input truncated to a character */
4734 long x;
4735 x = PyInt_AsLong(v);
4736 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004737 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 buf[0] = (char) x;
4739 }
4740 buf[1] = '\0';
4741 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004742
4743 onError:
4744 PyErr_SetString(PyExc_TypeError,
4745 "%c requires int or char");
4746 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747}
4748
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the buffer in
   which the floats, ints, & chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
4758
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759PyObject *PyUnicode_Format(PyObject *format,
4760 PyObject *args)
4761{
4762 Py_UNICODE *fmt, *res;
4763 int fmtcnt, rescnt, reslen, arglen, argidx;
4764 int args_owned = 0;
4765 PyUnicodeObject *result = NULL;
4766 PyObject *dict = NULL;
4767 PyObject *uformat;
4768
4769 if (format == NULL || args == NULL) {
4770 PyErr_BadInternalCall();
4771 return NULL;
4772 }
4773 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004774 if (uformat == NULL)
4775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 fmt = PyUnicode_AS_UNICODE(uformat);
4777 fmtcnt = PyUnicode_GET_SIZE(uformat);
4778
4779 reslen = rescnt = fmtcnt + 100;
4780 result = _PyUnicode_New(reslen);
4781 if (result == NULL)
4782 goto onError;
4783 res = PyUnicode_AS_UNICODE(result);
4784
4785 if (PyTuple_Check(args)) {
4786 arglen = PyTuple_Size(args);
4787 argidx = 0;
4788 }
4789 else {
4790 arglen = -1;
4791 argidx = -2;
4792 }
4793 if (args->ob_type->tp_as_mapping)
4794 dict = args;
4795
4796 while (--fmtcnt >= 0) {
4797 if (*fmt != '%') {
4798 if (--rescnt < 0) {
4799 rescnt = fmtcnt + 100;
4800 reslen += rescnt;
4801 if (_PyUnicode_Resize(result, reslen) < 0)
4802 return NULL;
4803 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4804 --rescnt;
4805 }
4806 *res++ = *fmt++;
4807 }
4808 else {
4809 /* Got a format specifier */
4810 int flags = 0;
4811 int width = -1;
4812 int prec = -1;
4813 int size = 0;
4814 Py_UNICODE c = '\0';
4815 Py_UNICODE fill;
4816 PyObject *v = NULL;
4817 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004818 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 Py_UNICODE sign;
4820 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004821 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822
4823 fmt++;
4824 if (*fmt == '(') {
4825 Py_UNICODE *keystart;
4826 int keylen;
4827 PyObject *key;
4828 int pcount = 1;
4829
4830 if (dict == NULL) {
4831 PyErr_SetString(PyExc_TypeError,
4832 "format requires a mapping");
4833 goto onError;
4834 }
4835 ++fmt;
4836 --fmtcnt;
4837 keystart = fmt;
4838 /* Skip over balanced parentheses */
4839 while (pcount > 0 && --fmtcnt >= 0) {
4840 if (*fmt == ')')
4841 --pcount;
4842 else if (*fmt == '(')
4843 ++pcount;
4844 fmt++;
4845 }
4846 keylen = fmt - keystart - 1;
4847 if (fmtcnt < 0 || pcount > 0) {
4848 PyErr_SetString(PyExc_ValueError,
4849 "incomplete format key");
4850 goto onError;
4851 }
Fred Drakee4315f52000-05-09 19:53:39 +00004852 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 then looked up since Python uses strings to hold
4854 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004855 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 key = PyUnicode_EncodeUTF8(keystart,
4857 keylen,
4858 NULL);
4859 if (key == NULL)
4860 goto onError;
4861 if (args_owned) {
4862 Py_DECREF(args);
4863 args_owned = 0;
4864 }
4865 args = PyObject_GetItem(dict, key);
4866 Py_DECREF(key);
4867 if (args == NULL) {
4868 goto onError;
4869 }
4870 args_owned = 1;
4871 arglen = -1;
4872 argidx = -2;
4873 }
4874 while (--fmtcnt >= 0) {
4875 switch (c = *fmt++) {
4876 case '-': flags |= F_LJUST; continue;
4877 case '+': flags |= F_SIGN; continue;
4878 case ' ': flags |= F_BLANK; continue;
4879 case '#': flags |= F_ALT; continue;
4880 case '0': flags |= F_ZERO; continue;
4881 }
4882 break;
4883 }
4884 if (c == '*') {
4885 v = getnextarg(args, arglen, &argidx);
4886 if (v == NULL)
4887 goto onError;
4888 if (!PyInt_Check(v)) {
4889 PyErr_SetString(PyExc_TypeError,
4890 "* wants int");
4891 goto onError;
4892 }
4893 width = PyInt_AsLong(v);
4894 if (width < 0) {
4895 flags |= F_LJUST;
4896 width = -width;
4897 }
4898 if (--fmtcnt >= 0)
4899 c = *fmt++;
4900 }
4901 else if (c >= '0' && c <= '9') {
4902 width = c - '0';
4903 while (--fmtcnt >= 0) {
4904 c = *fmt++;
4905 if (c < '0' || c > '9')
4906 break;
4907 if ((width*10) / 10 != width) {
4908 PyErr_SetString(PyExc_ValueError,
4909 "width too big");
4910 goto onError;
4911 }
4912 width = width*10 + (c - '0');
4913 }
4914 }
4915 if (c == '.') {
4916 prec = 0;
4917 if (--fmtcnt >= 0)
4918 c = *fmt++;
4919 if (c == '*') {
4920 v = getnextarg(args, arglen, &argidx);
4921 if (v == NULL)
4922 goto onError;
4923 if (!PyInt_Check(v)) {
4924 PyErr_SetString(PyExc_TypeError,
4925 "* wants int");
4926 goto onError;
4927 }
4928 prec = PyInt_AsLong(v);
4929 if (prec < 0)
4930 prec = 0;
4931 if (--fmtcnt >= 0)
4932 c = *fmt++;
4933 }
4934 else if (c >= '0' && c <= '9') {
4935 prec = c - '0';
4936 while (--fmtcnt >= 0) {
4937 c = Py_CHARMASK(*fmt++);
4938 if (c < '0' || c > '9')
4939 break;
4940 if ((prec*10) / 10 != prec) {
4941 PyErr_SetString(PyExc_ValueError,
4942 "prec too big");
4943 goto onError;
4944 }
4945 prec = prec*10 + (c - '0');
4946 }
4947 }
4948 } /* prec */
4949 if (fmtcnt >= 0) {
4950 if (c == 'h' || c == 'l' || c == 'L') {
4951 size = c;
4952 if (--fmtcnt >= 0)
4953 c = *fmt++;
4954 }
4955 }
4956 if (fmtcnt < 0) {
4957 PyErr_SetString(PyExc_ValueError,
4958 "incomplete format");
4959 goto onError;
4960 }
4961 if (c != '%') {
4962 v = getnextarg(args, arglen, &argidx);
4963 if (v == NULL)
4964 goto onError;
4965 }
4966 sign = 0;
4967 fill = ' ';
4968 switch (c) {
4969
4970 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004971 pbuf = formatbuf;
4972 /* presume that buffer length is at least 1 */
4973 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 len = 1;
4975 break;
4976
4977 case 's':
4978 case 'r':
4979 if (PyUnicode_Check(v) && c == 's') {
4980 temp = v;
4981 Py_INCREF(temp);
4982 }
4983 else {
4984 PyObject *unicode;
4985 if (c == 's')
4986 temp = PyObject_Str(v);
4987 else
4988 temp = PyObject_Repr(v);
4989 if (temp == NULL)
4990 goto onError;
4991 if (!PyString_Check(temp)) {
4992 /* XXX Note: this should never happen, since
4993 PyObject_Repr() and PyObject_Str() assure
4994 this */
4995 Py_DECREF(temp);
4996 PyErr_SetString(PyExc_TypeError,
4997 "%s argument has non-string str()");
4998 goto onError;
4999 }
Fred Drakee4315f52000-05-09 19:53:39 +00005000 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005002 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 "strict");
5004 Py_DECREF(temp);
5005 temp = unicode;
5006 if (temp == NULL)
5007 goto onError;
5008 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005009 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 len = PyUnicode_GET_SIZE(temp);
5011 if (prec >= 0 && len > prec)
5012 len = prec;
5013 break;
5014
5015 case 'i':
5016 case 'd':
5017 case 'u':
5018 case 'o':
5019 case 'x':
5020 case 'X':
5021 if (c == 'i')
5022 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005023 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005024 temp = formatlong(v, flags, prec, c);
5025 if (!temp)
5026 goto onError;
5027 pbuf = PyUnicode_AS_UNICODE(temp);
5028 len = PyUnicode_GET_SIZE(temp);
5029 /* unbounded ints can always produce
5030 a sign character! */
5031 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005033 else {
5034 pbuf = formatbuf;
5035 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5036 flags, prec, c, v);
5037 if (len < 0)
5038 goto onError;
5039 /* only d conversion is signed */
5040 sign = c == 'd';
5041 }
5042 if (flags & F_ZERO)
5043 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 break;
5045
5046 case 'e':
5047 case 'E':
5048 case 'f':
5049 case 'g':
5050 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005051 pbuf = formatbuf;
5052 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5053 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 if (len < 0)
5055 goto onError;
5056 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005057 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 fill = '0';
5059 break;
5060
5061 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005062 pbuf = formatbuf;
5063 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 if (len < 0)
5065 goto onError;
5066 break;
5067
5068 default:
5069 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005070 "unsupported format character '%c' (0x%x) "
5071 "at index %i",
5072 c, c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 goto onError;
5074 }
5075 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005076 if (*pbuf == '-' || *pbuf == '+') {
5077 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 len--;
5079 }
5080 else if (flags & F_SIGN)
5081 sign = '+';
5082 else if (flags & F_BLANK)
5083 sign = ' ';
5084 else
5085 sign = 0;
5086 }
5087 if (width < len)
5088 width = len;
5089 if (rescnt < width + (sign != 0)) {
5090 reslen -= rescnt;
5091 rescnt = width + fmtcnt + 100;
5092 reslen += rescnt;
5093 if (_PyUnicode_Resize(result, reslen) < 0)
5094 return NULL;
5095 res = PyUnicode_AS_UNICODE(result)
5096 + reslen - rescnt;
5097 }
5098 if (sign) {
5099 if (fill != ' ')
5100 *res++ = sign;
5101 rescnt--;
5102 if (width > len)
5103 width--;
5104 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005105 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5106 assert(pbuf[0] == '0');
5107 assert(pbuf[1] == c);
5108 if (fill != ' ') {
5109 *res++ = *pbuf++;
5110 *res++ = *pbuf++;
5111 }
5112 rescnt -= 2;
5113 width -= 2;
5114 if (width < 0)
5115 width = 0;
5116 len -= 2;
5117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 if (width > len && !(flags & F_LJUST)) {
5119 do {
5120 --rescnt;
5121 *res++ = fill;
5122 } while (--width > len);
5123 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005124 if (fill == ' ') {
5125 if (sign)
5126 *res++ = sign;
5127 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5128 assert(pbuf[0] == '0');
5129 assert(pbuf[1] == c);
5130 *res++ = *pbuf++;
5131 *res++ = *pbuf++;
5132 }
5133 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005134 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 res += len;
5136 rescnt -= len;
5137 while (--width >= len) {
5138 --rescnt;
5139 *res++ = ' ';
5140 }
5141 if (dict && (argidx < arglen) && c != '%') {
5142 PyErr_SetString(PyExc_TypeError,
5143 "not all arguments converted");
5144 goto onError;
5145 }
5146 Py_XDECREF(temp);
5147 } /* '%' */
5148 } /* until end */
5149 if (argidx < arglen && !dict) {
5150 PyErr_SetString(PyExc_TypeError,
5151 "not all arguments converted");
5152 goto onError;
5153 }
5154
5155 if (args_owned) {
5156 Py_DECREF(args);
5157 }
5158 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005159 if (_PyUnicode_Resize(result, reslen - rescnt))
5160 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 return (PyObject *)result;
5162
5163 onError:
5164 Py_XDECREF(result);
5165 Py_DECREF(uformat);
5166 if (args_owned) {
5167 Py_DECREF(args);
5168 }
5169 return NULL;
5170}
5171
/* Buffer interface slots for unicode objects (installed as tp_as_buffer
   in PyUnicode_Type).  The read buffer exposes the raw Py_UNICODE
   storage, the char buffer a default-encoded string; the write-buffer
   slot always fails. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5178
/* Type object for unicode strings.  Wires the slot functions defined in
   this file into the type; slots given as 0 or NULL are not provided. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, /* ob_size */
    "unicode", /* tp_name */
    sizeof(PyUnicodeObject), /* tp_size */
    0, /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)unicode_getattr, /* tp_getattr */
    0, /* tp_setattr */
    (cmpfunc) unicode_compare, /* tp_compare */
    (reprfunc) unicode_repr, /* tp_repr */
    0, /* tp_as_number */
    &unicode_as_sequence, /* tp_as_sequence */
    0, /* tp_as_mapping */
    (hashfunc) unicode_hash, /* tp_hash*/
    0, /* tp_call*/
    (reprfunc) unicode_str, /* tp_str */
    (getattrofunc) NULL, /* tp_getattro */
    (setattrofunc) NULL, /* tp_setattro */
    &unicode_as_buffer, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
};
5203
5204/* Initialize the Unicode implementation */
5205
Thomas Wouters78890102000-07-22 19:25:51 +00005206void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207{
5208 /* Doublecheck the configuration... */
5209 if (sizeof(Py_UNICODE) != 2)
5210 Py_FatalError("Unicode configuration error: "
5211 "sizeof(Py_UNICODE) != 2 bytes");
5212
Fred Drakee4315f52000-05-09 19:53:39 +00005213 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005214 unicode_freelist = NULL;
5215 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005217 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218}
5219
5220/* Finalize the Unicode implementation */
5221
5222void
Thomas Wouters78890102000-07-22 19:25:51 +00005223_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005225 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005227 Py_XDECREF(unicode_empty);
5228 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005229
5230 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 PyUnicodeObject *v = u;
5232 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005233 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005234 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005235 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005236 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005238 unicode_freelist = NULL;
5239 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240}