blob: dcef11bc8c459c55565c3370aa378c051652610c [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000069#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71#if defined(HAVE_LIMITS_H)
72#include <limits.h>
73#else
74#define INT_MAX 2147483647
75#endif
76
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000077#ifdef MS_WIN32
78#include <windows.h>
79#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000080
Guido van Rossumd57fd912000-03-10 22:53:23 +000081/* Limit for the Unicode object free list */
82
83#define MAX_UNICODE_FREELIST_SIZE 1024
84
85/* Limit for the Unicode object free list stay alive optimization.
86
87 The implementation will keep allocated Unicode memory intact for
88 all objects on the free list having a size less than this
89 limit. This reduces malloc() overhead for small Unicode objects.
90
Barry Warsaw51ac5802000-03-20 16:36:48 +000091 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000092 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000093 malloc()-overhead) bytes of unused garbage.
94
95 Setting the limit to 0 effectively turns the feature off.
96
Guido van Rossumfd4b9572000-04-10 13:51:10 +000097 Note: This is an experimental feature ! If you get core dumps when
98 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
100*/
101
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000102#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103
104/* Endianness switches; defaults to little endian */
105
106#ifdef WORDS_BIGENDIAN
107# define BYTEORDER_IS_BIG_ENDIAN
108#else
109# define BYTEORDER_IS_LITTLE_ENDIAN
110#endif
111
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000112/* --- Globals ------------------------------------------------------------
113
114 The globals are initialized by the _PyUnicode_Init() API and should
115 not be used before calling that API.
116
117*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
119/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000120static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000121
122/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000123static PyUnicodeObject *unicode_freelist;
124static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125
Fred Drakee4315f52000-05-09 19:53:39 +0000126/* Default encoding to use and assume when NULL is passed as encoding
127 parameter; it is initialized by _PyUnicode_Init().
128
129 Always use the PyUnicode_SetDefaultEncoding() and
130 PyUnicode_GetDefaultEncoding() APIs to access this global.
131
132*/
133
134static char unicode_default_encoding[100];
135
Guido van Rossumd57fd912000-03-10 22:53:23 +0000136/* --- Unicode Object ----------------------------------------------------- */
137
138static
139int _PyUnicode_Resize(register PyUnicodeObject *unicode,
140 int length)
141{
142 void *oldstr;
143
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000144 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000146 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147
148 /* Resizing unicode_empty is not allowed. */
149 if (unicode == unicode_empty) {
150 PyErr_SetString(PyExc_SystemError,
151 "can't resize empty unicode object");
152 return -1;
153 }
154
155 /* We allocate one more byte to make sure the string is
156 Ux0000 terminated -- XXX is this needed ? */
157 oldstr = unicode->str;
158 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
159 if (!unicode->str) {
160 unicode->str = oldstr;
161 PyErr_NoMemory();
162 return -1;
163 }
164 unicode->str[length] = 0;
165 unicode->length = length;
166
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000167 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 /* Reset the object caches */
169 if (unicode->utf8str) {
170 Py_DECREF(unicode->utf8str);
171 unicode->utf8str = NULL;
172 }
173 unicode->hash = -1;
174
175 return 0;
176}
177
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178int PyUnicode_Resize(PyObject **unicode,
179 int length)
180{
181 PyUnicodeObject *v;
182
183 if (unicode == NULL) {
184 PyErr_BadInternalCall();
185 return -1;
186 }
187 v = (PyUnicodeObject *)*unicode;
188 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
189 PyErr_BadInternalCall();
190 return -1;
191 }
192 return _PyUnicode_Resize(v, length);
193}
194
/* We allocate one extra Py_UNICODE element (not just one byte) so that
   the buffer is always terminated by U+0000 -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
       free list never shrinks below one entry.

*/
202
203static
204PyUnicodeObject *_PyUnicode_New(int length)
205{
206 register PyUnicodeObject *unicode;
207
208 /* Optimization for empty strings */
209 if (length == 0 && unicode_empty != NULL) {
210 Py_INCREF(unicode_empty);
211 return unicode_empty;
212 }
213
214 /* Unicode freelist & memory allocation */
215 if (unicode_freelist) {
216 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000217 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 /* Keep-Alive optimization: we only upsize the buffer,
221 never downsize it. */
222 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000224 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 }
227 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000228 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000230 }
231 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
234 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
235 if (unicode == NULL)
236 return NULL;
237 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
238 }
239
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000240 if (!unicode->str) {
241 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 unicode->str[length] = 0;
245 unicode->length = length;
246 unicode->hash = -1;
247 unicode->utf8str = NULL;
248 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000249
250 onError:
251 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254}
255
256static
257void _PyUnicode_Free(register PyUnicodeObject *unicode)
258{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 /* Keep-Alive optimization */
261 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000262 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = NULL;
264 unicode->length = 0;
265 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 if (unicode->utf8str) {
267 Py_DECREF(unicode->utf8str);
268 unicode->utf8str = NULL;
269 }
270 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 *(PyUnicodeObject **)unicode = unicode_freelist;
272 unicode_freelist = unicode;
273 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000276 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000277 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000278 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280}
281
282PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
283 int size)
284{
285 PyUnicodeObject *unicode;
286
287 unicode = _PyUnicode_New(size);
288 if (!unicode)
289 return NULL;
290
291 /* Copy the Unicode data into the new object */
292 if (u != NULL)
293 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
294
295 return (PyObject *)unicode;
296}
297
298#ifdef HAVE_WCHAR_H
299
300PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
301 int size)
302{
303 PyUnicodeObject *unicode;
304
305 if (w == NULL) {
306 PyErr_BadInternalCall();
307 return NULL;
308 }
309
310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the wchar_t data into the new object */
315#ifdef HAVE_USABLE_WCHAR_T
316 memcpy(unicode->str, w, size * sizeof(wchar_t));
317#else
318 {
319 register Py_UNICODE *u;
320 register int i;
321 u = PyUnicode_AS_UNICODE(unicode);
322 for (i = size; i >= 0; i--)
323 *u++ = *w++;
324 }
325#endif
326
327 return (PyObject *)unicode;
328}
329
330int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
331 register wchar_t *w,
332 int size)
333{
334 if (unicode == NULL) {
335 PyErr_BadInternalCall();
336 return -1;
337 }
338 if (size > PyUnicode_GET_SIZE(unicode))
339 size = PyUnicode_GET_SIZE(unicode);
340#ifdef HAVE_USABLE_WCHAR_T
341 memcpy(w, unicode->str, size * sizeof(wchar_t));
342#else
343 {
344 register Py_UNICODE *u;
345 register int i;
346 u = PyUnicode_AS_UNICODE(unicode);
347 for (i = size; i >= 0; i--)
348 *w++ = *u++;
349 }
350#endif
351
352 return size;
353}
354
355#endif
356
357PyObject *PyUnicode_FromObject(register PyObject *obj)
358{
359 const char *s;
360 int len;
361
362 if (obj == NULL) {
363 PyErr_BadInternalCall();
364 return NULL;
365 }
366 else if (PyUnicode_Check(obj)) {
367 Py_INCREF(obj);
368 return obj;
369 }
370 else if (PyString_Check(obj)) {
371 s = PyString_AS_STRING(obj);
372 len = PyString_GET_SIZE(obj);
373 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000374 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
375 /* Overwrite the error message with something more useful in
376 case of a TypeError. */
377 if (PyErr_ExceptionMatches(PyExc_TypeError))
378 PyErr_SetString(PyExc_TypeError,
379 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382 if (len == 0) {
383 Py_INCREF(unicode_empty);
384 return (PyObject *)unicode_empty;
385 }
Fred Drakee4315f52000-05-09 19:53:39 +0000386 return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389PyObject *PyUnicode_Decode(const char *s,
390 int size,
391 const char *encoding,
392 const char *errors)
393{
394 PyObject *buffer = NULL, *unicode;
395
Fred Drakee4315f52000-05-09 19:53:39 +0000396 if (encoding == NULL)
397 encoding = PyUnicode_GetDefaultEncoding();
398
399 /* Shortcuts for common default encodings */
400 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000402 else if (strcmp(encoding, "latin-1") == 0)
403 return PyUnicode_DecodeLatin1(s, size, errors);
404 else if (strcmp(encoding, "ascii") == 0)
405 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406
407 /* Decode via the codec registry */
408 buffer = PyBuffer_FromMemory((void *)s, size);
409 if (buffer == NULL)
410 goto onError;
411 unicode = PyCodec_Decode(buffer, encoding, errors);
412 if (unicode == NULL)
413 goto onError;
414 if (!PyUnicode_Check(unicode)) {
415 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000416 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000417 unicode->ob_type->tp_name);
418 Py_DECREF(unicode);
419 goto onError;
420 }
421 Py_DECREF(buffer);
422 return unicode;
423
424 onError:
425 Py_XDECREF(buffer);
426 return NULL;
427}
428
429PyObject *PyUnicode_Encode(const Py_UNICODE *s,
430 int size,
431 const char *encoding,
432 const char *errors)
433{
434 PyObject *v, *unicode;
435
436 unicode = PyUnicode_FromUnicode(s, size);
437 if (unicode == NULL)
438 return NULL;
439 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
440 Py_DECREF(unicode);
441 return v;
442}
443
444PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
445 const char *encoding,
446 const char *errors)
447{
448 PyObject *v;
449
450 if (!PyUnicode_Check(unicode)) {
451 PyErr_BadArgument();
452 goto onError;
453 }
Fred Drakee4315f52000-05-09 19:53:39 +0000454
455 if (encoding == NULL)
456 encoding = PyUnicode_GetDefaultEncoding();
457
458 /* Shortcuts for common default encodings */
459 if (errors == NULL) {
460 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000462 else if (strcmp(encoding, "latin-1") == 0)
463 return PyUnicode_AsLatin1String(unicode);
464 else if (strcmp(encoding, "ascii") == 0)
465 return PyUnicode_AsASCIIString(unicode);
466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467
468 /* Encode via the codec registry */
469 v = PyCodec_Encode(unicode, encoding, errors);
470 if (v == NULL)
471 goto onError;
472 /* XXX Should we really enforce this ? */
473 if (!PyString_Check(v)) {
474 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000475 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 v->ob_type->tp_name);
477 Py_DECREF(v);
478 goto onError;
479 }
480 return v;
481
482 onError:
483 return NULL;
484}
485
486Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
487{
488 if (!PyUnicode_Check(unicode)) {
489 PyErr_BadArgument();
490 goto onError;
491 }
492 return PyUnicode_AS_UNICODE(unicode);
493
494 onError:
495 return NULL;
496}
497
498int PyUnicode_GetSize(PyObject *unicode)
499{
500 if (!PyUnicode_Check(unicode)) {
501 PyErr_BadArgument();
502 goto onError;
503 }
504 return PyUnicode_GET_SIZE(unicode);
505
506 onError:
507 return -1;
508}
509
Fred Drakee4315f52000-05-09 19:53:39 +0000510const char *PyUnicode_GetDefaultEncoding()
511{
512 return unicode_default_encoding;
513}
514
515int PyUnicode_SetDefaultEncoding(const char *encoding)
516{
517 PyObject *v;
518
519 /* Make sure the encoding is valid. As side effect, this also
520 loads the encoding into the codec registry cache. */
521 v = _PyCodec_Lookup(encoding);
522 if (v == NULL)
523 goto onError;
524 Py_DECREF(v);
525 strncpy(unicode_default_encoding,
526 encoding,
527 sizeof(unicode_default_encoding));
528 return 0;
529
530 onError:
531 return -1;
532}
533
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534/* --- UTF-8 Codec -------------------------------------------------------- */
535
/* Map a UTF-8 lead byte to the total length of the sequence it
   starts.  Zero marks a byte that cannot start a sequence
   (continuation bytes 0x80-0xBF and the bytes 0xFE/0xFF).  See
   RFC 2279 for details. */

static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 0x00-0x0F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 0x10-0x1F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 0x20-0x2F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 0x30-0x3F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 0x40-0x4F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 0x50-0x5F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 0x60-0x6F */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   /* 0x70-0x7F */
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   /* 0x80-0x8F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   /* 0x90-0x9F */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   /* 0xA0-0xAF */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   /* 0xB0-0xBF */
    /* 0xC0-0xDF: two-byte sequences */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,   /* 0xC0-0xCF */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,   /* 0xD0-0xDF */
    /* 0xE0-0xEF: three-byte sequences */
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,   /* 0xE0-0xEF */
    /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4,4,4,4,4,4,4,4, 5,5,5,5,6,6,0,0    /* 0xF0-0xFF */
};
557
558static
559int utf8_decoding_error(const char **source,
560 Py_UNICODE **dest,
561 const char *errors,
562 const char *details)
563{
564 if ((errors == NULL) ||
565 (strcmp(errors,"strict") == 0)) {
566 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000567 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000568 details);
569 return -1;
570 }
571 else if (strcmp(errors,"ignore") == 0) {
572 (*source)++;
573 return 0;
574 }
575 else if (strcmp(errors,"replace") == 0) {
576 (*source)++;
577 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
578 (*dest)++;
579 return 0;
580 }
581 else {
582 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000583 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 errors);
585 return -1;
586 }
587}
588
589#define UTF8_ERROR(details) do { \
590 if (utf8_decoding_error(&s, &p, errors, details)) \
591 goto onError; \
592 continue; \
593} while (0)
594
595PyObject *PyUnicode_DecodeUTF8(const char *s,
596 int size,
597 const char *errors)
598{
599 int n;
600 const char *e;
601 PyUnicodeObject *unicode;
602 Py_UNICODE *p;
603
604 /* Note: size will always be longer than the resulting Unicode
605 character count */
606 unicode = _PyUnicode_New(size);
607 if (!unicode)
608 return NULL;
609 if (size == 0)
610 return (PyObject *)unicode;
611
612 /* Unpack UTF-8 encoded data */
613 p = unicode->str;
614 e = s + size;
615
616 while (s < e) {
617 register Py_UNICODE ch = (unsigned char)*s;
618
619 if (ch < 0x80) {
620 *p++ = ch;
621 s++;
622 continue;
623 }
624
625 n = utf8_code_length[ch];
626
627 if (s + n > e)
628 UTF8_ERROR("unexpected end of data");
629
630 switch (n) {
631
632 case 0:
633 UTF8_ERROR("unexpected code byte");
634 break;
635
636 case 1:
637 UTF8_ERROR("internal error");
638 break;
639
640 case 2:
641 if ((s[1] & 0xc0) != 0x80)
642 UTF8_ERROR("invalid data");
643 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
644 if (ch < 0x80)
645 UTF8_ERROR("illegal encoding");
646 else
647 *p++ = ch;
648 break;
649
650 case 3:
651 if ((s[1] & 0xc0) != 0x80 ||
652 (s[2] & 0xc0) != 0x80)
653 UTF8_ERROR("invalid data");
654 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
655 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
656 UTF8_ERROR("illegal encoding");
657 else
658 *p++ = ch;
659 break;
660
661 default:
662 /* Other sizes are only needed for UCS-4 */
663 UTF8_ERROR("unsupported Unicode code range");
664 }
665 s += n;
666 }
667
668 /* Adjust length */
669 if (_PyUnicode_Resize(unicode, p - unicode->str))
670 goto onError;
671
672 return (PyObject *)unicode;
673
674onError:
675 Py_DECREF(unicode);
676 return NULL;
677}
678
679#undef UTF8_ERROR
680
681static
682int utf8_encoding_error(const Py_UNICODE **source,
683 char **dest,
684 const char *errors,
685 const char *details)
686{
687 if ((errors == NULL) ||
688 (strcmp(errors,"strict") == 0)) {
689 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000690 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 details);
692 return -1;
693 }
694 else if (strcmp(errors,"ignore") == 0) {
695 return 0;
696 }
697 else if (strcmp(errors,"replace") == 0) {
698 **dest = '?';
699 (*dest)++;
700 return 0;
701 }
702 else {
703 PyErr_Format(PyExc_ValueError,
704 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000705 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 errors);
707 return -1;
708 }
709}
710
711PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
712 int size,
713 const char *errors)
714{
715 PyObject *v;
716 char *p;
717 char *q;
718
719 v = PyString_FromStringAndSize(NULL, 3 * size);
720 if (v == NULL)
721 return NULL;
722 if (size == 0)
723 goto done;
724
725 p = q = PyString_AS_STRING(v);
726 while (size-- > 0) {
727 Py_UNICODE ch = *s++;
728 if (ch < 0x80)
729 *p++ = (char) ch;
730 else if (ch < 0x0800) {
731 *p++ = 0xc0 | (ch >> 6);
732 *p++ = 0x80 | (ch & 0x3f);
733 } else if (0xD800 <= ch && ch <= 0xDFFF) {
734 /* These byte ranges are reserved for UTF-16 surrogate
735 bytes which the Python implementation currently does
736 not support. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 if (utf8_encoding_error(&s, &p, errors,
738 "unsupported code range"))
739 goto onError;
740 } else {
741 *p++ = 0xe0 | (ch >> 12);
742 *p++ = 0x80 | ((ch >> 6) & 0x3f);
743 *p++ = 0x80 | (ch & 0x3f);
744 }
745 }
746 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000747 if (_PyString_Resize(&v, p - q))
748 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000749
750 done:
751 return v;
752
753 onError:
754 Py_DECREF(v);
755 return NULL;
756}
757
758/* Return a Python string holding the UTF-8 encoded value of the
759 Unicode object.
760
761 The resulting string is cached in the Unicode object for subsequent
762 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000763 the character buffer interface and will live (at least) as long as
764 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765
766 The refcount of the string is *not* incremented.
767
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000768 *** Exported for internal use by the interpreter only !!! ***
769
Guido van Rossumd57fd912000-03-10 22:53:23 +0000770*/
771
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000772PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +0000773 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000774{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000775 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776
777 if (v)
778 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000779 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
780 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000781 errors);
782 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000783 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784 return v;
785}
786
787PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
788{
789 PyObject *str;
790
791 if (!PyUnicode_Check(unicode)) {
792 PyErr_BadArgument();
793 return NULL;
794 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000795 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796 if (str == NULL)
797 return NULL;
798 Py_INCREF(str);
799 return str;
800}
801
802/* --- UTF-16 Codec ------------------------------------------------------- */
803
804static
805int utf16_decoding_error(const Py_UNICODE **source,
806 Py_UNICODE **dest,
807 const char *errors,
808 const char *details)
809{
810 if ((errors == NULL) ||
811 (strcmp(errors,"strict") == 0)) {
812 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000813 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 details);
815 return -1;
816 }
817 else if (strcmp(errors,"ignore") == 0) {
818 return 0;
819 }
820 else if (strcmp(errors,"replace") == 0) {
821 if (dest) {
822 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
823 (*dest)++;
824 }
825 return 0;
826 }
827 else {
828 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000829 "UTF-16 decoding error; "
830 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831 errors);
832 return -1;
833 }
834}
835
836#define UTF16_ERROR(details) do { \
837 if (utf16_decoding_error(&q, &p, errors, details)) \
838 goto onError; \
839 continue; \
840} while(0)
841
842PyObject *PyUnicode_DecodeUTF16(const char *s,
843 int size,
844 const char *errors,
845 int *byteorder)
846{
847 PyUnicodeObject *unicode;
848 Py_UNICODE *p;
849 const Py_UNICODE *q, *e;
850 int bo = 0;
851
852 /* size should be an even number */
853 if (size % sizeof(Py_UNICODE) != 0) {
854 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
855 return NULL;
856 /* The remaining input chars are ignored if we fall through
857 here... */
858 }
859
860 /* Note: size will always be longer than the resulting Unicode
861 character count */
862 unicode = _PyUnicode_New(size);
863 if (!unicode)
864 return NULL;
865 if (size == 0)
866 return (PyObject *)unicode;
867
868 /* Unpack UTF-16 encoded data */
869 p = unicode->str;
870 q = (Py_UNICODE *)s;
871 e = q + (size / sizeof(Py_UNICODE));
872
873 if (byteorder)
874 bo = *byteorder;
875
876 while (q < e) {
877 register Py_UNICODE ch = *q++;
878
879 /* Check for BOM marks (U+FEFF) in the input and adjust
880 current byte order setting accordingly. Swap input
881 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
882 !) */
883#ifdef BYTEORDER_IS_LITTLE_ENDIAN
884 if (ch == 0xFEFF) {
885 bo = -1;
886 continue;
887 } else if (ch == 0xFFFE) {
888 bo = 1;
889 continue;
890 }
891 if (bo == 1)
892 ch = (ch >> 8) | (ch << 8);
893#else
894 if (ch == 0xFEFF) {
895 bo = 1;
896 continue;
897 } else if (ch == 0xFFFE) {
898 bo = -1;
899 continue;
900 }
901 if (bo == -1)
902 ch = (ch >> 8) | (ch << 8);
903#endif
904 if (ch < 0xD800 || ch > 0xDFFF) {
905 *p++ = ch;
906 continue;
907 }
908
909 /* UTF-16 code pair: */
910 if (q >= e)
911 UTF16_ERROR("unexpected end of data");
912 if (0xDC00 <= *q && *q <= 0xDFFF) {
913 q++;
914 if (0xD800 <= *q && *q <= 0xDBFF)
915 /* This is valid data (a UTF-16 surrogate pair), but
916 we are not able to store this information since our
917 Py_UNICODE type only has 16 bits... this might
918 change someday, even though it's unlikely. */
919 UTF16_ERROR("code pairs are not supported");
920 else
921 continue;
922 }
923 UTF16_ERROR("illegal encoding");
924 }
925
926 if (byteorder)
927 *byteorder = bo;
928
929 /* Adjust length */
930 if (_PyUnicode_Resize(unicode, p - unicode->str))
931 goto onError;
932
933 return (PyObject *)unicode;
934
935onError:
936 Py_DECREF(unicode);
937 return NULL;
938}
939
940#undef UTF16_ERROR
941
942PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
943 int size,
944 const char *errors,
945 int byteorder)
946{
947 PyObject *v;
948 Py_UNICODE *p;
949 char *q;
950
951 /* We don't create UTF-16 pairs... */
952 v = PyString_FromStringAndSize(NULL,
953 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
954 if (v == NULL)
955 return NULL;
956 if (size == 0)
957 goto done;
958
959 q = PyString_AS_STRING(v);
960 p = (Py_UNICODE *)q;
961
962 if (byteorder == 0)
963 *p++ = 0xFEFF;
964 if (byteorder == 0 ||
965#ifdef BYTEORDER_IS_LITTLE_ENDIAN
966 byteorder == -1
967#else
968 byteorder == 1
969#endif
970 )
971 memcpy(p, s, size * sizeof(Py_UNICODE));
972 else
973 while (size-- > 0) {
974 Py_UNICODE ch = *s++;
975 *p++ = (ch >> 8) | (ch << 8);
976 }
977 done:
978 return v;
979}
980
981PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
982{
983 if (!PyUnicode_Check(unicode)) {
984 PyErr_BadArgument();
985 return NULL;
986 }
987 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
988 PyUnicode_GET_SIZE(unicode),
989 NULL,
990 0);
991}
992
993/* --- Unicode Escape Codec ----------------------------------------------- */
994
995static
996int unicodeescape_decoding_error(const char **source,
997 unsigned int *x,
998 const char *errors,
999 const char *details)
1000{
1001 if ((errors == NULL) ||
1002 (strcmp(errors,"strict") == 0)) {
1003 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001004 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005 details);
1006 return -1;
1007 }
1008 else if (strcmp(errors,"ignore") == 0) {
1009 return 0;
1010 }
1011 else if (strcmp(errors,"replace") == 0) {
1012 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
1013 return 0;
1014 }
1015 else {
1016 PyErr_Format(PyExc_ValueError,
1017 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001018 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019 errors);
1020 return -1;
1021 }
1022}
1023
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001024static _Py_UCNHashAPI *pucnHash = NULL;
1025
/* Case-insensitive comparison of at most `count` characters of the
   NUL-terminated strings `s1` and `s2`.

   Returns 0 if the compared prefixes are equal (or `count` is 0), a
   negative value if `s1` sorts before `s2`, and a positive value
   otherwise.  Characters are compared as unsigned char values after
   tolower().

   The comparison stops at the first difference, after `count`
   characters, or at the terminating NUL -- whichever comes first --
   so neither string is read past its end. */

static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    while (count-- > 0) {
        /* <ctype.h> functions require a value representable as
           unsigned char (or EOF); a plain char may be negative. */
        int c1 = tolower((unsigned char)*s1++);
        int c2 = tolower((unsigned char)*s2++);

        if (c1 != c2)
            return c1 - c2;
        if (c1 == '\0')
            break;      /* both strings ended here */
    }

    return 0;
}
1045
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1047 int size,
1048 const char *errors)
1049{
1050 PyUnicodeObject *v;
1051 Py_UNICODE *p = NULL, *buf = NULL;
1052 const char *end;
1053
1054 /* Escaped strings will always be longer than the resulting
1055 Unicode string, so we start with size here and then reduce the
1056 length after conversion to the true value. */
1057 v = _PyUnicode_New(size);
1058 if (v == NULL)
1059 goto onError;
1060 if (size == 0)
1061 return (PyObject *)v;
1062 p = buf = PyUnicode_AS_UNICODE(v);
1063 end = s + size;
1064 while (s < end) {
1065 unsigned char c;
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001066 unsigned long x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 int i;
1068
1069 /* Non-escape characters are interpreted as Unicode ordinals */
1070 if (*s != '\\') {
1071 *p++ = (unsigned char)*s++;
1072 continue;
1073 }
1074
1075 /* \ - Escapes */
1076 s++;
1077 switch (*s++) {
1078
1079 /* \x escapes */
1080 case '\n': break;
1081 case '\\': *p++ = '\\'; break;
1082 case '\'': *p++ = '\''; break;
1083 case '\"': *p++ = '\"'; break;
1084 case 'b': *p++ = '\b'; break;
1085 case 'f': *p++ = '\014'; break; /* FF */
1086 case 't': *p++ = '\t'; break;
1087 case 'n': *p++ = '\n'; break;
1088 case 'r': *p++ = '\r'; break;
1089 case 'v': *p++ = '\013'; break; /* VT */
1090 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1091
1092 /* \OOO (octal) escapes */
1093 case '0': case '1': case '2': case '3':
1094 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001095 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001097 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001099 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001101 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 break;
1103
1104 /* \xXXXX escape with 0-4 hex digits */
1105 case 'x':
1106 x = 0;
1107 c = (unsigned char)*s;
1108 if (isxdigit(c)) {
1109 do {
1110 x = (x<<4) & ~0xF;
1111 if ('0' <= c && c <= '9')
1112 x += c - '0';
1113 else if ('a' <= c && c <= 'f')
1114 x += 10 + c - 'a';
1115 else
1116 x += 10 + c - 'A';
1117 c = (unsigned char)*++s;
1118 } while (isxdigit(c));
1119 *p++ = x;
1120 } else {
1121 *p++ = '\\';
1122 *p++ = (unsigned char)s[-1];
1123 }
1124 break;
1125
1126 /* \uXXXX with 4 hex digits */
1127 case 'u':
1128 for (x = 0, i = 0; i < 4; i++) {
1129 c = (unsigned char)s[i];
1130 if (!isxdigit(c)) {
1131 if (unicodeescape_decoding_error(&s, &x, errors,
1132 "truncated \\uXXXX"))
1133 goto onError;
1134 i++;
1135 break;
1136 }
1137 x = (x<<4) & ~0xF;
1138 if (c >= '0' && c <= '9')
1139 x += c - '0';
1140 else if (c >= 'a' && c <= 'f')
1141 x += 10 + c - 'a';
1142 else
1143 x += 10 + c - 'A';
1144 }
1145 s += i;
1146 *p++ = x;
1147 break;
1148
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001149 case 'N':
1150 /* Ok, we need to deal with Unicode Character Names now,
1151 * make sure we've imported the hash table data...
1152 */
1153 if (pucnHash == NULL)
1154 {
1155 PyObject *mod = 0, *v = 0;
1156
1157 mod = PyImport_ImportModule("ucnhash");
1158 if (mod == NULL)
1159 goto onError;
1160 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1161 Py_DECREF(mod);
1162 if (v == NULL)
1163 {
1164 goto onError;
1165 }
1166 pucnHash = PyCObject_AsVoidPtr(v);
1167 Py_DECREF(v);
1168 if (pucnHash == NULL)
1169 {
1170 goto onError;
1171 }
1172 }
1173
1174 if (*s == '{')
1175 {
1176 const char *start = s + 1;
1177 const char *endBrace = start;
1178 unsigned int uiValue;
1179 unsigned long j;
1180
1181 /* look for either the closing brace, or we
1182 * exceed the maximum length of the unicode character names
1183 */
1184 while (*endBrace != '}' &&
1185 (unsigned int)(endBrace - start) <=
1186 pucnHash->cchMax &&
1187 endBrace < end)
1188 {
1189 endBrace++;
1190 }
1191 if (endBrace != end && *endBrace == '}')
1192 {
1193 j = pucnHash->hash(start, endBrace - start);
1194 if (j > pucnHash->cKeys ||
1195 mystrnicmp(
1196 start,
1197 ((_Py_UnicodeCharacterName *)
1198 (pucnHash->getValue(j)))->pszUCN,
1199 (int)(endBrace - start)) != 0)
1200 {
1201 if (unicodeescape_decoding_error(
1202 &s, &x, errors,
1203 "Invalid Unicode Character Name"))
1204 {
1205 goto onError;
1206 }
1207 goto ucnFallthrough;
1208 }
1209 uiValue = ((_Py_UnicodeCharacterName *)
1210 (pucnHash->getValue(j)))->uiValue;
1211 if (uiValue < 1<<16)
1212 {
1213 /* In UCS-2 range, easy solution.. */
1214 *p++ = uiValue;
1215 }
1216 else
1217 {
1218 /* Oops, its in UCS-4 space, */
1219 /* compute and append the two surrogates: */
1220 /* translate from 10000..10FFFF to 0..FFFFF */
1221 uiValue -= 0x10000;
1222
1223 /* high surrogate = top 10 bits added to D800 */
1224 *p++ = 0xD800 + (uiValue >> 10);
1225
1226 /* low surrogate = bottom 10 bits added to DC00 */
1227 *p++ = 0xDC00 + (uiValue & ~0xFC00);
1228 }
1229 s = endBrace + 1;
1230 }
1231 else
1232 {
1233 if (unicodeescape_decoding_error(
1234 &s, &x, errors,
1235 "Unicode name missing closing brace"))
1236 goto onError;
1237 goto ucnFallthrough;
1238 }
1239 break;
1240 }
1241 if (unicodeescape_decoding_error(
1242 &s, &x, errors,
1243 "Missing opening brace for Unicode Character Name escape"))
1244 goto onError;
1245ucnFallthrough:
1246 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001247 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 *p++ = '\\';
1249 *p++ = (unsigned char)s[-1];
1250 break;
1251 }
1252 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001253 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return (PyObject *)v;
1256
1257 onError:
1258 Py_XDECREF(v);
1259 return NULL;
1260}
1261
1262/* Return a Unicode-Escape string version of the Unicode object.
1263
1264 If quotes is true, the string is enclosed in u"" or u'' quotes as
1265 appropriate.
1266
1267*/
1268
Barry Warsaw51ac5802000-03-20 16:36:48 +00001269static const Py_UNICODE *findchar(const Py_UNICODE *s,
1270 int size,
1271 Py_UNICODE ch);
1272
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273static
1274PyObject *unicodeescape_string(const Py_UNICODE *s,
1275 int size,
1276 int quotes)
1277{
1278 PyObject *repr;
1279 char *p;
1280 char *q;
1281
1282 static const char *hexdigit = "0123456789ABCDEF";
1283
1284 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1285 if (repr == NULL)
1286 return NULL;
1287
1288 p = q = PyString_AS_STRING(repr);
1289
1290 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 *p++ = 'u';
1292 *p++ = (findchar(s, size, '\'') &&
1293 !findchar(s, size, '"')) ? '"' : '\'';
1294 }
1295 while (size-- > 0) {
1296 Py_UNICODE ch = *s++;
1297 /* Escape quotes */
1298 if (quotes && (ch == q[1] || ch == '\\')) {
1299 *p++ = '\\';
1300 *p++ = (char) ch;
1301 }
1302 /* Map 16-bit characters to '\uxxxx' */
1303 else if (ch >= 256) {
1304 *p++ = '\\';
1305 *p++ = 'u';
1306 *p++ = hexdigit[(ch >> 12) & 0xf];
1307 *p++ = hexdigit[(ch >> 8) & 0xf];
1308 *p++ = hexdigit[(ch >> 4) & 0xf];
1309 *p++ = hexdigit[ch & 15];
1310 }
1311 /* Map non-printable US ASCII to '\ooo' */
1312 else if (ch < ' ' || ch >= 128) {
1313 *p++ = '\\';
1314 *p++ = hexdigit[(ch >> 6) & 7];
1315 *p++ = hexdigit[(ch >> 3) & 7];
1316 *p++ = hexdigit[ch & 7];
1317 }
1318 /* Copy everything else as-is */
1319 else
1320 *p++ = (char) ch;
1321 }
1322 if (quotes)
1323 *p++ = q[1];
1324
1325 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001326 if (_PyString_Resize(&repr, p - q))
1327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328
1329 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001330
1331 onError:
1332 Py_DECREF(repr);
1333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334}
1335
1336PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1337 int size)
1338{
1339 return unicodeescape_string(s, size, 0);
1340}
1341
1342PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1343{
1344 if (!PyUnicode_Check(unicode)) {
1345 PyErr_BadArgument();
1346 return NULL;
1347 }
1348 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1349 PyUnicode_GET_SIZE(unicode));
1350}
1351
1352/* --- Raw Unicode Escape Codec ------------------------------------------- */
1353
1354PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1355 int size,
1356 const char *errors)
1357{
1358 PyUnicodeObject *v;
1359 Py_UNICODE *p, *buf;
1360 const char *end;
1361 const char *bs;
1362
1363 /* Escaped strings will always be longer than the resulting
1364 Unicode string, so we start with size here and then reduce the
1365 length after conversion to the true value. */
1366 v = _PyUnicode_New(size);
1367 if (v == NULL)
1368 goto onError;
1369 if (size == 0)
1370 return (PyObject *)v;
1371 p = buf = PyUnicode_AS_UNICODE(v);
1372 end = s + size;
1373 while (s < end) {
1374 unsigned char c;
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001375 unsigned long x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 int i;
1377
1378 /* Non-escape characters are interpreted as Unicode ordinals */
1379 if (*s != '\\') {
1380 *p++ = (unsigned char)*s++;
1381 continue;
1382 }
1383
1384 /* \u-escapes are only interpreted iff the number of leading
1385 backslashes if odd */
1386 bs = s;
1387 for (;s < end;) {
1388 if (*s != '\\')
1389 break;
1390 *p++ = (unsigned char)*s++;
1391 }
1392 if (((s - bs) & 1) == 0 ||
1393 s >= end ||
1394 *s != 'u') {
1395 continue;
1396 }
1397 p--;
1398 s++;
1399
1400 /* \uXXXX with 4 hex digits */
1401 for (x = 0, i = 0; i < 4; i++) {
1402 c = (unsigned char)s[i];
1403 if (!isxdigit(c)) {
1404 if (unicodeescape_decoding_error(&s, &x, errors,
1405 "truncated \\uXXXX"))
1406 goto onError;
1407 i++;
1408 break;
1409 }
1410 x = (x<<4) & ~0xF;
1411 if (c >= '0' && c <= '9')
1412 x += c - '0';
1413 else if (c >= 'a' && c <= 'f')
1414 x += 10 + c - 'a';
1415 else
1416 x += 10 + c - 'A';
1417 }
1418 s += i;
1419 *p++ = x;
1420 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001421 if (_PyUnicode_Resize(v, (int)(p - buf)))
1422 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423 return (PyObject *)v;
1424
1425 onError:
1426 Py_XDECREF(v);
1427 return NULL;
1428}
1429
1430PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1431 int size)
1432{
1433 PyObject *repr;
1434 char *p;
1435 char *q;
1436
1437 static const char *hexdigit = "0123456789ABCDEF";
1438
1439 repr = PyString_FromStringAndSize(NULL, 6 * size);
1440 if (repr == NULL)
1441 return NULL;
1442
1443 p = q = PyString_AS_STRING(repr);
1444 while (size-- > 0) {
1445 Py_UNICODE ch = *s++;
1446 /* Map 16-bit characters to '\uxxxx' */
1447 if (ch >= 256) {
1448 *p++ = '\\';
1449 *p++ = 'u';
1450 *p++ = hexdigit[(ch >> 12) & 0xf];
1451 *p++ = hexdigit[(ch >> 8) & 0xf];
1452 *p++ = hexdigit[(ch >> 4) & 0xf];
1453 *p++ = hexdigit[ch & 15];
1454 }
1455 /* Copy everything else as-is */
1456 else
1457 *p++ = (char) ch;
1458 }
1459 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001460 if (_PyString_Resize(&repr, p - q))
1461 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462
1463 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001464
1465 onError:
1466 Py_DECREF(repr);
1467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468}
1469
1470PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1471{
1472 if (!PyUnicode_Check(unicode)) {
1473 PyErr_BadArgument();
1474 return NULL;
1475 }
1476 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1477 PyUnicode_GET_SIZE(unicode));
1478}
1479
1480/* --- Latin-1 Codec ------------------------------------------------------ */
1481
1482PyObject *PyUnicode_DecodeLatin1(const char *s,
1483 int size,
1484 const char *errors)
1485{
1486 PyUnicodeObject *v;
1487 Py_UNICODE *p;
1488
1489 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1490 v = _PyUnicode_New(size);
1491 if (v == NULL)
1492 goto onError;
1493 if (size == 0)
1494 return (PyObject *)v;
1495 p = PyUnicode_AS_UNICODE(v);
1496 while (size-- > 0)
1497 *p++ = (unsigned char)*s++;
1498 return (PyObject *)v;
1499
1500 onError:
1501 Py_XDECREF(v);
1502 return NULL;
1503}
1504
1505static
1506int latin1_encoding_error(const Py_UNICODE **source,
1507 char **dest,
1508 const char *errors,
1509 const char *details)
1510{
1511 if ((errors == NULL) ||
1512 (strcmp(errors,"strict") == 0)) {
1513 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001514 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 details);
1516 return -1;
1517 }
1518 else if (strcmp(errors,"ignore") == 0) {
1519 return 0;
1520 }
1521 else if (strcmp(errors,"replace") == 0) {
1522 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001523 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001524 return 0;
1525 }
1526 else {
1527 PyErr_Format(PyExc_ValueError,
1528 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001529 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 errors);
1531 return -1;
1532 }
1533}
1534
1535PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1536 int size,
1537 const char *errors)
1538{
1539 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001540 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 repr = PyString_FromStringAndSize(NULL, size);
1542 if (repr == NULL)
1543 return NULL;
1544
1545 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001546 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547 while (size-- > 0) {
1548 Py_UNICODE ch = *p++;
1549 if (ch >= 256) {
1550 if (latin1_encoding_error(&p, &s, errors,
1551 "ordinal not in range(256)"))
1552 goto onError;
1553 }
1554 else
1555 *s++ = (char)ch;
1556 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001557 /* Resize if error handling skipped some characters */
1558 if (s - start < PyString_GET_SIZE(repr))
1559 if (_PyString_Resize(&repr, s - start))
1560 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 return repr;
1562
1563 onError:
1564 Py_DECREF(repr);
1565 return NULL;
1566}
1567
1568PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1569{
1570 if (!PyUnicode_Check(unicode)) {
1571 PyErr_BadArgument();
1572 return NULL;
1573 }
1574 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1575 PyUnicode_GET_SIZE(unicode),
1576 NULL);
1577}
1578
1579/* --- 7-bit ASCII Codec -------------------------------------------------- */
1580
1581static
1582int ascii_decoding_error(const char **source,
1583 Py_UNICODE **dest,
1584 const char *errors,
1585 const char *details)
1586{
1587 if ((errors == NULL) ||
1588 (strcmp(errors,"strict") == 0)) {
1589 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001590 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 details);
1592 return -1;
1593 }
1594 else if (strcmp(errors,"ignore") == 0) {
1595 return 0;
1596 }
1597 else if (strcmp(errors,"replace") == 0) {
1598 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1599 (*dest)++;
1600 return 0;
1601 }
1602 else {
1603 PyErr_Format(PyExc_ValueError,
1604 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001605 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606 errors);
1607 return -1;
1608 }
1609}
1610
1611PyObject *PyUnicode_DecodeASCII(const char *s,
1612 int size,
1613 const char *errors)
1614{
1615 PyUnicodeObject *v;
1616 Py_UNICODE *p;
1617
1618 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1619 v = _PyUnicode_New(size);
1620 if (v == NULL)
1621 goto onError;
1622 if (size == 0)
1623 return (PyObject *)v;
1624 p = PyUnicode_AS_UNICODE(v);
1625 while (size-- > 0) {
1626 register unsigned char c;
1627
1628 c = (unsigned char)*s++;
1629 if (c < 128)
1630 *p++ = c;
1631 else if (ascii_decoding_error(&s, &p, errors,
1632 "ordinal not in range(128)"))
1633 goto onError;
1634 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001635 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1636 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1637 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 return (PyObject *)v;
1639
1640 onError:
1641 Py_XDECREF(v);
1642 return NULL;
1643}
1644
1645static
1646int ascii_encoding_error(const Py_UNICODE **source,
1647 char **dest,
1648 const char *errors,
1649 const char *details)
1650{
1651 if ((errors == NULL) ||
1652 (strcmp(errors,"strict") == 0)) {
1653 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001654 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655 details);
1656 return -1;
1657 }
1658 else if (strcmp(errors,"ignore") == 0) {
1659 return 0;
1660 }
1661 else if (strcmp(errors,"replace") == 0) {
1662 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001663 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664 return 0;
1665 }
1666 else {
1667 PyErr_Format(PyExc_ValueError,
1668 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001669 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 errors);
1671 return -1;
1672 }
1673}
1674
1675PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1676 int size,
1677 const char *errors)
1678{
1679 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001680 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 repr = PyString_FromStringAndSize(NULL, size);
1682 if (repr == NULL)
1683 return NULL;
1684
1685 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001686 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 while (size-- > 0) {
1688 Py_UNICODE ch = *p++;
1689 if (ch >= 128) {
1690 if (ascii_encoding_error(&p, &s, errors,
1691 "ordinal not in range(128)"))
1692 goto onError;
1693 }
1694 else
1695 *s++ = (char)ch;
1696 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001697 /* Resize if error handling skipped some characters */
1698 if (s - start < PyString_GET_SIZE(repr))
1699 if (_PyString_Resize(&repr, s - start))
1700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 return repr;
1702
1703 onError:
1704 Py_DECREF(repr);
1705 return NULL;
1706}
1707
1708PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1709{
1710 if (!PyUnicode_Check(unicode)) {
1711 PyErr_BadArgument();
1712 return NULL;
1713 }
1714 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1715 PyUnicode_GET_SIZE(unicode),
1716 NULL);
1717}
1718
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001719#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001720
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001721/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001722
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001723PyObject *PyUnicode_DecodeMBCS(const char *s,
1724 int size,
1725 const char *errors)
1726{
1727 PyUnicodeObject *v;
1728 Py_UNICODE *p;
1729
1730 /* First get the size of the result */
1731 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001732 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001733 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1734
1735 v = _PyUnicode_New(usize);
1736 if (v == NULL)
1737 return NULL;
1738 if (usize == 0)
1739 return (PyObject *)v;
1740 p = PyUnicode_AS_UNICODE(v);
1741 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1742 Py_DECREF(v);
1743 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1744 }
1745
1746 return (PyObject *)v;
1747}
1748
1749PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1750 int size,
1751 const char *errors)
1752{
1753 PyObject *repr;
1754 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001755 DWORD mbcssize;
1756
1757 /* If there are no characters, bail now! */
1758 if (size==0)
1759 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001760
1761 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001762 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001763 if (mbcssize==0)
1764 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1765
1766 repr = PyString_FromStringAndSize(NULL, mbcssize);
1767 if (repr == NULL)
1768 return NULL;
1769 if (mbcssize==0)
1770 return repr;
1771
1772 /* Do the conversion */
1773 s = PyString_AS_STRING(repr);
1774 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1775 Py_DECREF(repr);
1776 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1777 }
1778 return repr;
1779}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001780
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001781#endif /* MS_WIN32 */
1782
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783/* --- Character Mapping Codec -------------------------------------------- */
1784
1785static
1786int charmap_decoding_error(const char **source,
1787 Py_UNICODE **dest,
1788 const char *errors,
1789 const char *details)
1790{
1791 if ((errors == NULL) ||
1792 (strcmp(errors,"strict") == 0)) {
1793 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001794 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 details);
1796 return -1;
1797 }
1798 else if (strcmp(errors,"ignore") == 0) {
1799 return 0;
1800 }
1801 else if (strcmp(errors,"replace") == 0) {
1802 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1803 (*dest)++;
1804 return 0;
1805 }
1806 else {
1807 PyErr_Format(PyExc_ValueError,
1808 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001809 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 errors);
1811 return -1;
1812 }
1813}
1814
1815PyObject *PyUnicode_DecodeCharmap(const char *s,
1816 int size,
1817 PyObject *mapping,
1818 const char *errors)
1819{
1820 PyUnicodeObject *v;
1821 Py_UNICODE *p;
1822
1823 /* Default to Latin-1 */
1824 if (mapping == NULL)
1825 return PyUnicode_DecodeLatin1(s, size, errors);
1826
1827 v = _PyUnicode_New(size);
1828 if (v == NULL)
1829 goto onError;
1830 if (size == 0)
1831 return (PyObject *)v;
1832 p = PyUnicode_AS_UNICODE(v);
1833 while (size-- > 0) {
1834 unsigned char ch = *s++;
1835 PyObject *w, *x;
1836
1837 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1838 w = PyInt_FromLong((long)ch);
1839 if (w == NULL)
1840 goto onError;
1841 x = PyObject_GetItem(mapping, w);
1842 Py_DECREF(w);
1843 if (x == NULL) {
1844 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1845 /* No mapping found: default to Latin-1 mapping */
1846 PyErr_Clear();
1847 *p++ = (Py_UNICODE)ch;
1848 continue;
1849 }
1850 goto onError;
1851 }
1852
1853 /* Apply mapping */
1854 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001855 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 if (value < 0 || value > 65535) {
1857 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001858 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 Py_DECREF(x);
1860 goto onError;
1861 }
1862 *p++ = (Py_UNICODE)value;
1863 }
1864 else if (x == Py_None) {
1865 /* undefined mapping */
1866 if (charmap_decoding_error(&s, &p, errors,
1867 "character maps to <undefined>")) {
1868 Py_DECREF(x);
1869 goto onError;
1870 }
1871 }
1872 else if (PyUnicode_Check(x)) {
1873 if (PyUnicode_GET_SIZE(x) != 1) {
1874 /* 1-n mapping */
1875 PyErr_SetString(PyExc_NotImplementedError,
1876 "1-n mappings are currently not implemented");
1877 Py_DECREF(x);
1878 goto onError;
1879 }
1880 *p++ = *PyUnicode_AS_UNICODE(x);
1881 }
1882 else {
1883 /* wrong return value */
1884 PyErr_SetString(PyExc_TypeError,
1885 "character mapping must return integer, None or unicode");
1886 Py_DECREF(x);
1887 goto onError;
1888 }
1889 Py_DECREF(x);
1890 }
1891 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1892 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1893 goto onError;
1894 return (PyObject *)v;
1895
1896 onError:
1897 Py_XDECREF(v);
1898 return NULL;
1899}
1900
1901static
1902int charmap_encoding_error(const Py_UNICODE **source,
1903 char **dest,
1904 const char *errors,
1905 const char *details)
1906{
1907 if ((errors == NULL) ||
1908 (strcmp(errors,"strict") == 0)) {
1909 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001910 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 details);
1912 return -1;
1913 }
1914 else if (strcmp(errors,"ignore") == 0) {
1915 return 0;
1916 }
1917 else if (strcmp(errors,"replace") == 0) {
1918 **dest = '?';
1919 (*dest)++;
1920 return 0;
1921 }
1922 else {
1923 PyErr_Format(PyExc_ValueError,
1924 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001925 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 errors);
1927 return -1;
1928 }
1929}
1930
1931PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1932 int size,
1933 PyObject *mapping,
1934 const char *errors)
1935{
1936 PyObject *v;
1937 char *s;
1938
1939 /* Default to Latin-1 */
1940 if (mapping == NULL)
1941 return PyUnicode_EncodeLatin1(p, size, errors);
1942
1943 v = PyString_FromStringAndSize(NULL, size);
1944 if (v == NULL)
1945 return NULL;
1946 s = PyString_AS_STRING(v);
1947 while (size-- > 0) {
1948 Py_UNICODE ch = *p++;
1949 PyObject *w, *x;
1950
1951 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1952 w = PyInt_FromLong((long)ch);
1953 if (w == NULL)
1954 goto onError;
1955 x = PyObject_GetItem(mapping, w);
1956 Py_DECREF(w);
1957 if (x == NULL) {
1958 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1959 /* No mapping found: default to Latin-1 mapping if possible */
1960 PyErr_Clear();
1961 if (ch < 256) {
1962 *s++ = (char)ch;
1963 continue;
1964 }
1965 else if (!charmap_encoding_error(&p, &s, errors,
1966 "missing character mapping"))
1967 continue;
1968 }
1969 goto onError;
1970 }
1971
1972 /* Apply mapping */
1973 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001974 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 if (value < 0 || value > 255) {
1976 PyErr_SetString(PyExc_TypeError,
1977 "character mapping must be in range(256)");
1978 Py_DECREF(x);
1979 goto onError;
1980 }
1981 *s++ = (char)value;
1982 }
1983 else if (x == Py_None) {
1984 /* undefined mapping */
1985 if (charmap_encoding_error(&p, &s, errors,
1986 "character maps to <undefined>")) {
1987 Py_DECREF(x);
1988 goto onError;
1989 }
1990 }
1991 else if (PyString_Check(x)) {
1992 if (PyString_GET_SIZE(x) != 1) {
1993 /* 1-n mapping */
1994 PyErr_SetString(PyExc_NotImplementedError,
1995 "1-n mappings are currently not implemented");
1996 Py_DECREF(x);
1997 goto onError;
1998 }
1999 *s++ = *PyString_AS_STRING(x);
2000 }
2001 else {
2002 /* wrong return value */
2003 PyErr_SetString(PyExc_TypeError,
2004 "character mapping must return integer, None or unicode");
2005 Py_DECREF(x);
2006 goto onError;
2007 }
2008 Py_DECREF(x);
2009 }
2010 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2011 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2012 goto onError;
2013 return v;
2014
2015 onError:
2016 Py_DECREF(v);
2017 return NULL;
2018}
2019
2020PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2021 PyObject *mapping)
2022{
2023 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2024 PyErr_BadArgument();
2025 return NULL;
2026 }
2027 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2028 PyUnicode_GET_SIZE(unicode),
2029 mapping,
2030 NULL);
2031}
2032
2033static
2034int translate_error(const Py_UNICODE **source,
2035 Py_UNICODE **dest,
2036 const char *errors,
2037 const char *details)
2038{
2039 if ((errors == NULL) ||
2040 (strcmp(errors,"strict") == 0)) {
2041 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002042 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 details);
2044 return -1;
2045 }
2046 else if (strcmp(errors,"ignore") == 0) {
2047 return 0;
2048 }
2049 else if (strcmp(errors,"replace") == 0) {
2050 **dest = '?';
2051 (*dest)++;
2052 return 0;
2053 }
2054 else {
2055 PyErr_Format(PyExc_ValueError,
2056 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002057 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 errors);
2059 return -1;
2060 }
2061}
2062
2063PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2064 int size,
2065 PyObject *mapping,
2066 const char *errors)
2067{
2068 PyUnicodeObject *v;
2069 Py_UNICODE *p;
2070
2071 if (mapping == NULL) {
2072 PyErr_BadArgument();
2073 return NULL;
2074 }
2075
2076 /* Output will never be longer than input */
2077 v = _PyUnicode_New(size);
2078 if (v == NULL)
2079 goto onError;
2080 if (size == 0)
2081 goto done;
2082 p = PyUnicode_AS_UNICODE(v);
2083 while (size-- > 0) {
2084 Py_UNICODE ch = *s++;
2085 PyObject *w, *x;
2086
2087 /* Get mapping */
2088 w = PyInt_FromLong(ch);
2089 if (w == NULL)
2090 goto onError;
2091 x = PyObject_GetItem(mapping, w);
2092 Py_DECREF(w);
2093 if (x == NULL) {
2094 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2095 /* No mapping found: default to 1-1 mapping */
2096 PyErr_Clear();
2097 *p++ = ch;
2098 continue;
2099 }
2100 goto onError;
2101 }
2102
2103 /* Apply mapping */
2104 if (PyInt_Check(x))
2105 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2106 else if (x == Py_None) {
2107 /* undefined mapping */
2108 if (translate_error(&s, &p, errors,
2109 "character maps to <undefined>")) {
2110 Py_DECREF(x);
2111 goto onError;
2112 }
2113 }
2114 else if (PyUnicode_Check(x)) {
2115 if (PyUnicode_GET_SIZE(x) != 1) {
2116 /* 1-n mapping */
2117 PyErr_SetString(PyExc_NotImplementedError,
2118 "1-n mappings are currently not implemented");
2119 Py_DECREF(x);
2120 goto onError;
2121 }
2122 *p++ = *PyUnicode_AS_UNICODE(x);
2123 }
2124 else {
2125 /* wrong return value */
2126 PyErr_SetString(PyExc_TypeError,
2127 "translate mapping must return integer, None or unicode");
2128 Py_DECREF(x);
2129 goto onError;
2130 }
2131 Py_DECREF(x);
2132 }
2133 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002134 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2135 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136
2137 done:
2138 return (PyObject *)v;
2139
2140 onError:
2141 Py_XDECREF(v);
2142 return NULL;
2143}
2144
2145PyObject *PyUnicode_Translate(PyObject *str,
2146 PyObject *mapping,
2147 const char *errors)
2148{
2149 PyObject *result;
2150
2151 str = PyUnicode_FromObject(str);
2152 if (str == NULL)
2153 goto onError;
2154 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2155 PyUnicode_GET_SIZE(str),
2156 mapping,
2157 errors);
2158 Py_DECREF(str);
2159 return result;
2160
2161 onError:
2162 Py_XDECREF(str);
2163 return NULL;
2164}
2165
Guido van Rossum9e896b32000-04-05 20:11:21 +00002166/* --- Decimal Encoder ---------------------------------------------------- */
2167
2168int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2169 int length,
2170 char *output,
2171 const char *errors)
2172{
2173 Py_UNICODE *p, *end;
2174
2175 if (output == NULL) {
2176 PyErr_BadArgument();
2177 return -1;
2178 }
2179
2180 p = s;
2181 end = s + length;
2182 while (p < end) {
2183 register Py_UNICODE ch = *p++;
2184 int decimal;
2185
2186 if (Py_UNICODE_ISSPACE(ch)) {
2187 *output++ = ' ';
2188 continue;
2189 }
2190 decimal = Py_UNICODE_TODECIMAL(ch);
2191 if (decimal >= 0) {
2192 *output++ = '0' + decimal;
2193 continue;
2194 }
Guido van Rossumba477042000-04-06 18:18:10 +00002195 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002196 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002197 continue;
2198 }
2199 /* All other characters are considered invalid */
2200 if (errors == NULL || strcmp(errors, "strict") == 0) {
2201 PyErr_SetString(PyExc_ValueError,
2202 "invalid decimal Unicode string");
2203 goto onError;
2204 }
2205 else if (strcmp(errors, "ignore") == 0)
2206 continue;
2207 else if (strcmp(errors, "replace") == 0) {
2208 *output++ = '?';
2209 continue;
2210 }
2211 }
2212 /* 0-terminate the output string */
2213 *output++ = '\0';
2214 return 0;
2215
2216 onError:
2217 return -1;
2218}
2219
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220/* --- Helpers ------------------------------------------------------------ */
2221
2222static
2223int count(PyUnicodeObject *self,
2224 int start,
2225 int end,
2226 PyUnicodeObject *substring)
2227{
2228 int count = 0;
2229
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002230 if (substring->length == 0)
2231 return (end - start + 1);
2232
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 end -= substring->length;
2234
2235 while (start <= end)
2236 if (Py_UNICODE_MATCH(self, start, substring)) {
2237 count++;
2238 start += substring->length;
2239 } else
2240 start++;
2241
2242 return count;
2243}
2244
2245int PyUnicode_Count(PyObject *str,
2246 PyObject *substr,
2247 int start,
2248 int end)
2249{
2250 int result;
2251
2252 str = PyUnicode_FromObject(str);
2253 if (str == NULL)
2254 return -1;
2255 substr = PyUnicode_FromObject(substr);
2256 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002257 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 return -1;
2259 }
2260
2261 result = count((PyUnicodeObject *)str,
2262 start, end,
2263 (PyUnicodeObject *)substr);
2264
2265 Py_DECREF(str);
2266 Py_DECREF(substr);
2267 return result;
2268}
2269
2270static
2271int findstring(PyUnicodeObject *self,
2272 PyUnicodeObject *substring,
2273 int start,
2274 int end,
2275 int direction)
2276{
2277 if (start < 0)
2278 start += self->length;
2279 if (start < 0)
2280 start = 0;
2281
2282 if (substring->length == 0)
2283 return start;
2284
2285 if (end > self->length)
2286 end = self->length;
2287 if (end < 0)
2288 end += self->length;
2289 if (end < 0)
2290 end = 0;
2291
2292 end -= substring->length;
2293
2294 if (direction < 0) {
2295 for (; end >= start; end--)
2296 if (Py_UNICODE_MATCH(self, end, substring))
2297 return end;
2298 } else {
2299 for (; start <= end; start++)
2300 if (Py_UNICODE_MATCH(self, start, substring))
2301 return start;
2302 }
2303
2304 return -1;
2305}
2306
2307int PyUnicode_Find(PyObject *str,
2308 PyObject *substr,
2309 int start,
2310 int end,
2311 int direction)
2312{
2313 int result;
2314
2315 str = PyUnicode_FromObject(str);
2316 if (str == NULL)
2317 return -1;
2318 substr = PyUnicode_FromObject(substr);
2319 if (substr == NULL) {
2320 Py_DECREF(substr);
2321 return -1;
2322 }
2323
2324 result = findstring((PyUnicodeObject *)str,
2325 (PyUnicodeObject *)substr,
2326 start, end, direction);
2327 Py_DECREF(str);
2328 Py_DECREF(substr);
2329 return result;
2330}
2331
2332static
2333int tailmatch(PyUnicodeObject *self,
2334 PyUnicodeObject *substring,
2335 int start,
2336 int end,
2337 int direction)
2338{
2339 if (start < 0)
2340 start += self->length;
2341 if (start < 0)
2342 start = 0;
2343
2344 if (substring->length == 0)
2345 return 1;
2346
2347 if (end > self->length)
2348 end = self->length;
2349 if (end < 0)
2350 end += self->length;
2351 if (end < 0)
2352 end = 0;
2353
2354 end -= substring->length;
2355 if (end < start)
2356 return 0;
2357
2358 if (direction > 0) {
2359 if (Py_UNICODE_MATCH(self, end, substring))
2360 return 1;
2361 } else {
2362 if (Py_UNICODE_MATCH(self, start, substring))
2363 return 1;
2364 }
2365
2366 return 0;
2367}
2368
2369int PyUnicode_Tailmatch(PyObject *str,
2370 PyObject *substr,
2371 int start,
2372 int end,
2373 int direction)
2374{
2375 int result;
2376
2377 str = PyUnicode_FromObject(str);
2378 if (str == NULL)
2379 return -1;
2380 substr = PyUnicode_FromObject(substr);
2381 if (substr == NULL) {
2382 Py_DECREF(substr);
2383 return -1;
2384 }
2385
2386 result = tailmatch((PyUnicodeObject *)str,
2387 (PyUnicodeObject *)substr,
2388 start, end, direction);
2389 Py_DECREF(str);
2390 Py_DECREF(substr);
2391 return result;
2392}
2393
2394static
2395const Py_UNICODE *findchar(const Py_UNICODE *s,
2396 int size,
2397 Py_UNICODE ch)
2398{
2399 /* like wcschr, but doesn't stop at NULL characters */
2400
2401 while (size-- > 0) {
2402 if (*s == ch)
2403 return s;
2404 s++;
2405 }
2406
2407 return NULL;
2408}
2409
2410/* Apply fixfct filter to the Unicode object self and return a
2411 reference to the modified object */
2412
2413static
2414PyObject *fixup(PyUnicodeObject *self,
2415 int (*fixfct)(PyUnicodeObject *s))
2416{
2417
2418 PyUnicodeObject *u;
2419
2420 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2421 self->length);
2422 if (u == NULL)
2423 return NULL;
2424 if (!fixfct(u)) {
2425 /* fixfct should return TRUE if it modified the buffer. If
2426 FALSE, return a reference to the original buffer instead
2427 (to save space, not time) */
2428 Py_INCREF(self);
2429 Py_DECREF(u);
2430 return (PyObject*) self;
2431 }
2432 return (PyObject*) u;
2433}
2434
2435static
2436int fixupper(PyUnicodeObject *self)
2437{
2438 int len = self->length;
2439 Py_UNICODE *s = self->str;
2440 int status = 0;
2441
2442 while (len-- > 0) {
2443 register Py_UNICODE ch;
2444
2445 ch = Py_UNICODE_TOUPPER(*s);
2446 if (ch != *s) {
2447 status = 1;
2448 *s = ch;
2449 }
2450 s++;
2451 }
2452
2453 return status;
2454}
2455
2456static
2457int fixlower(PyUnicodeObject *self)
2458{
2459 int len = self->length;
2460 Py_UNICODE *s = self->str;
2461 int status = 0;
2462
2463 while (len-- > 0) {
2464 register Py_UNICODE ch;
2465
2466 ch = Py_UNICODE_TOLOWER(*s);
2467 if (ch != *s) {
2468 status = 1;
2469 *s = ch;
2470 }
2471 s++;
2472 }
2473
2474 return status;
2475}
2476
2477static
2478int fixswapcase(PyUnicodeObject *self)
2479{
2480 int len = self->length;
2481 Py_UNICODE *s = self->str;
2482 int status = 0;
2483
2484 while (len-- > 0) {
2485 if (Py_UNICODE_ISUPPER(*s)) {
2486 *s = Py_UNICODE_TOLOWER(*s);
2487 status = 1;
2488 } else if (Py_UNICODE_ISLOWER(*s)) {
2489 *s = Py_UNICODE_TOUPPER(*s);
2490 status = 1;
2491 }
2492 s++;
2493 }
2494
2495 return status;
2496}
2497
2498static
2499int fixcapitalize(PyUnicodeObject *self)
2500{
2501 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2502 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2503 return 1;
2504 }
2505 return 0;
2506}
2507
2508static
2509int fixtitle(PyUnicodeObject *self)
2510{
2511 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2512 register Py_UNICODE *e;
2513 int previous_is_cased;
2514
2515 /* Shortcut for single character strings */
2516 if (PyUnicode_GET_SIZE(self) == 1) {
2517 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2518 if (*p != ch) {
2519 *p = ch;
2520 return 1;
2521 }
2522 else
2523 return 0;
2524 }
2525
2526 e = p + PyUnicode_GET_SIZE(self);
2527 previous_is_cased = 0;
2528 for (; p < e; p++) {
2529 register const Py_UNICODE ch = *p;
2530
2531 if (previous_is_cased)
2532 *p = Py_UNICODE_TOLOWER(ch);
2533 else
2534 *p = Py_UNICODE_TOTITLE(ch);
2535
2536 if (Py_UNICODE_ISLOWER(ch) ||
2537 Py_UNICODE_ISUPPER(ch) ||
2538 Py_UNICODE_ISTITLE(ch))
2539 previous_is_cased = 1;
2540 else
2541 previous_is_cased = 0;
2542 }
2543 return 1;
2544}
2545
2546PyObject *PyUnicode_Join(PyObject *separator,
2547 PyObject *seq)
2548{
2549 Py_UNICODE *sep;
2550 int seplen;
2551 PyUnicodeObject *res = NULL;
2552 int reslen = 0;
2553 Py_UNICODE *p;
2554 int seqlen = 0;
2555 int sz = 100;
2556 int i;
2557
2558 seqlen = PySequence_Length(seq);
2559 if (seqlen < 0 && PyErr_Occurred())
2560 return NULL;
2561
2562 if (separator == NULL) {
2563 Py_UNICODE blank = ' ';
2564 sep = &blank;
2565 seplen = 1;
2566 }
2567 else {
2568 separator = PyUnicode_FromObject(separator);
2569 if (separator == NULL)
2570 return NULL;
2571 sep = PyUnicode_AS_UNICODE(separator);
2572 seplen = PyUnicode_GET_SIZE(separator);
2573 }
2574
2575 res = _PyUnicode_New(sz);
2576 if (res == NULL)
2577 goto onError;
2578 p = PyUnicode_AS_UNICODE(res);
2579 reslen = 0;
2580
2581 for (i = 0; i < seqlen; i++) {
2582 int itemlen;
2583 PyObject *item;
2584
2585 item = PySequence_GetItem(seq, i);
2586 if (item == NULL)
2587 goto onError;
2588 if (!PyUnicode_Check(item)) {
2589 PyObject *v;
2590 v = PyUnicode_FromObject(item);
2591 Py_DECREF(item);
2592 item = v;
2593 if (item == NULL)
2594 goto onError;
2595 }
2596 itemlen = PyUnicode_GET_SIZE(item);
2597 while (reslen + itemlen + seplen >= sz) {
2598 if (_PyUnicode_Resize(res, sz*2))
2599 goto onError;
2600 sz *= 2;
2601 p = PyUnicode_AS_UNICODE(res) + reslen;
2602 }
2603 if (i > 0) {
2604 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2605 p += seplen;
2606 reslen += seplen;
2607 }
2608 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2609 p += itemlen;
2610 reslen += itemlen;
2611 Py_DECREF(item);
2612 }
2613 if (_PyUnicode_Resize(res, reslen))
2614 goto onError;
2615
2616 Py_XDECREF(separator);
2617 return (PyObject *)res;
2618
2619 onError:
2620 Py_XDECREF(separator);
2621 Py_DECREF(res);
2622 return NULL;
2623}
2624
2625static
2626PyUnicodeObject *pad(PyUnicodeObject *self,
2627 int left,
2628 int right,
2629 Py_UNICODE fill)
2630{
2631 PyUnicodeObject *u;
2632
2633 if (left < 0)
2634 left = 0;
2635 if (right < 0)
2636 right = 0;
2637
2638 if (left == 0 && right == 0) {
2639 Py_INCREF(self);
2640 return self;
2641 }
2642
2643 u = _PyUnicode_New(left + self->length + right);
2644 if (u) {
2645 if (left)
2646 Py_UNICODE_FILL(u->str, fill, left);
2647 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2648 if (right)
2649 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2650 }
2651
2652 return u;
2653}
2654
/* Append data[left:right] to the list being built, as a new Unicode
   object.

   The macro relies on names supplied by the function that expands it:
   a PyObject *str scratch variable, the PyObject *list to append to,
   and an onError label that is jumped to on failure.  The temporary
   reference held in str is released on every path.

   The body is wrapped in do { } while (0) so that an invocation
   followed by ';' is exactly one statement, and the arguments are
   parenthesized so that arbitrary expressions can be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2665
2666static
2667PyObject *split_whitespace(PyUnicodeObject *self,
2668 PyObject *list,
2669 int maxcount)
2670{
2671 register int i;
2672 register int j;
2673 int len = self->length;
2674 PyObject *str;
2675
2676 for (i = j = 0; i < len; ) {
2677 /* find a token */
2678 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2679 i++;
2680 j = i;
2681 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2682 i++;
2683 if (j < i) {
2684 if (maxcount-- <= 0)
2685 break;
2686 SPLIT_APPEND(self->str, j, i);
2687 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2688 i++;
2689 j = i;
2690 }
2691 }
2692 if (j < len) {
2693 SPLIT_APPEND(self->str, j, len);
2694 }
2695 return list;
2696
2697 onError:
2698 Py_DECREF(list);
2699 return NULL;
2700}
2701
2702PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002703 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704{
2705 register int i;
2706 register int j;
2707 int len;
2708 PyObject *list;
2709 PyObject *str;
2710 Py_UNICODE *data;
2711
2712 string = PyUnicode_FromObject(string);
2713 if (string == NULL)
2714 return NULL;
2715 data = PyUnicode_AS_UNICODE(string);
2716 len = PyUnicode_GET_SIZE(string);
2717
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 list = PyList_New(0);
2719 if (!list)
2720 goto onError;
2721
2722 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002723 int eol;
2724
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 /* Find a line and append it */
2726 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2727 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728
2729 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002730 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 if (i < len) {
2732 if (data[i] == '\r' && i + 1 < len &&
2733 data[i+1] == '\n')
2734 i += 2;
2735 else
2736 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002737 if (keepends)
2738 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 }
Guido van Rossum86662912000-04-11 15:38:46 +00002740 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 j = i;
2742 }
2743 if (j < len) {
2744 SPLIT_APPEND(data, j, len);
2745 }
2746
2747 Py_DECREF(string);
2748 return list;
2749
2750 onError:
2751 Py_DECREF(list);
2752 Py_DECREF(string);
2753 return NULL;
2754}
2755
2756static
2757PyObject *split_char(PyUnicodeObject *self,
2758 PyObject *list,
2759 Py_UNICODE ch,
2760 int maxcount)
2761{
2762 register int i;
2763 register int j;
2764 int len = self->length;
2765 PyObject *str;
2766
2767 for (i = j = 0; i < len; ) {
2768 if (self->str[i] == ch) {
2769 if (maxcount-- <= 0)
2770 break;
2771 SPLIT_APPEND(self->str, j, i);
2772 i = j = i + 1;
2773 } else
2774 i++;
2775 }
2776 if (j <= len) {
2777 SPLIT_APPEND(self->str, j, len);
2778 }
2779 return list;
2780
2781 onError:
2782 Py_DECREF(list);
2783 return NULL;
2784}
2785
2786static
2787PyObject *split_substring(PyUnicodeObject *self,
2788 PyObject *list,
2789 PyUnicodeObject *substring,
2790 int maxcount)
2791{
2792 register int i;
2793 register int j;
2794 int len = self->length;
2795 int sublen = substring->length;
2796 PyObject *str;
2797
2798 for (i = j = 0; i < len - sublen; ) {
2799 if (Py_UNICODE_MATCH(self, i, substring)) {
2800 if (maxcount-- <= 0)
2801 break;
2802 SPLIT_APPEND(self->str, j, i);
2803 i = j = i + sublen;
2804 } else
2805 i++;
2806 }
2807 if (j <= len) {
2808 SPLIT_APPEND(self->str, j, len);
2809 }
2810 return list;
2811
2812 onError:
2813 Py_DECREF(list);
2814 return NULL;
2815}
2816
2817#undef SPLIT_APPEND
2818
2819static
2820PyObject *split(PyUnicodeObject *self,
2821 PyUnicodeObject *substring,
2822 int maxcount)
2823{
2824 PyObject *list;
2825
2826 if (maxcount < 0)
2827 maxcount = INT_MAX;
2828
2829 list = PyList_New(0);
2830 if (!list)
2831 return NULL;
2832
2833 if (substring == NULL)
2834 return split_whitespace(self,list,maxcount);
2835
2836 else if (substring->length == 1)
2837 return split_char(self,list,substring->str[0],maxcount);
2838
2839 else if (substring->length == 0) {
2840 Py_DECREF(list);
2841 PyErr_SetString(PyExc_ValueError, "empty separator");
2842 return NULL;
2843 }
2844 else
2845 return split_substring(self,list,substring,maxcount);
2846}
2847
2848static
2849PyObject *strip(PyUnicodeObject *self,
2850 int left,
2851 int right)
2852{
2853 Py_UNICODE *p = self->str;
2854 int start = 0;
2855 int end = self->length;
2856
2857 if (left)
2858 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2859 start++;
2860
2861 if (right)
2862 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2863 end--;
2864
2865 if (start == 0 && end == self->length) {
2866 /* couldn't strip anything off, return original string */
2867 Py_INCREF(self);
2868 return (PyObject*) self;
2869 }
2870
2871 return (PyObject*) PyUnicode_FromUnicode(
2872 self->str + start,
2873 end - start
2874 );
2875}
2876
2877static
2878PyObject *replace(PyUnicodeObject *self,
2879 PyUnicodeObject *str1,
2880 PyUnicodeObject *str2,
2881 int maxcount)
2882{
2883 PyUnicodeObject *u;
2884
2885 if (maxcount < 0)
2886 maxcount = INT_MAX;
2887
2888 if (str1->length == 1 && str2->length == 1) {
2889 int i;
2890
2891 /* replace characters */
2892 if (!findchar(self->str, self->length, str1->str[0])) {
2893 /* nothing to replace, return original string */
2894 Py_INCREF(self);
2895 u = self;
2896 } else {
2897 Py_UNICODE u1 = str1->str[0];
2898 Py_UNICODE u2 = str2->str[0];
2899
2900 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2901 self->str,
2902 self->length
2903 );
2904 if (u)
2905 for (i = 0; i < u->length; i++)
2906 if (u->str[i] == u1) {
2907 if (--maxcount < 0)
2908 break;
2909 u->str[i] = u2;
2910 }
2911 }
2912
2913 } else {
2914 int n, i;
2915 Py_UNICODE *p;
2916
2917 /* replace strings */
2918 n = count(self, 0, self->length, str1);
2919 if (n > maxcount)
2920 n = maxcount;
2921 if (n == 0) {
2922 /* nothing to replace, return original string */
2923 Py_INCREF(self);
2924 u = self;
2925 } else {
2926 u = _PyUnicode_New(
2927 self->length + n * (str2->length - str1->length));
2928 if (u) {
2929 i = 0;
2930 p = u->str;
2931 while (i <= self->length - str1->length)
2932 if (Py_UNICODE_MATCH(self, i, str1)) {
2933 /* replace string segment */
2934 Py_UNICODE_COPY(p, str2->str, str2->length);
2935 p += str2->length;
2936 i += str1->length;
2937 if (--n <= 0) {
2938 /* copy remaining part */
2939 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2940 break;
2941 }
2942 } else
2943 *p++ = self->str[i++];
2944 }
2945 }
2946 }
2947
2948 return (PyObject *) u;
2949}
2950
2951/* --- Unicode Object Methods --------------------------------------------- */
2952
2953static char title__doc__[] =
2954"S.title() -> unicode\n\
2955\n\
2956Return a titlecased version of S, i.e. words start with title case\n\
2957characters, all remaining cased characters have lower case.";
2958
2959static PyObject*
2960unicode_title(PyUnicodeObject *self, PyObject *args)
2961{
2962 if (!PyArg_NoArgs(args))
2963 return NULL;
2964 return fixup(self, fixtitle);
2965}
2966
2967static char capitalize__doc__[] =
2968"S.capitalize() -> unicode\n\
2969\n\
2970Return a capitalized version of S, i.e. make the first character\n\
2971have upper case.";
2972
2973static PyObject*
2974unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2975{
2976 if (!PyArg_NoArgs(args))
2977 return NULL;
2978 return fixup(self, fixcapitalize);
2979}
2980
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (compiled out): split S at whitespace,
   capitalize each word with the fixcapitalize filter and join the
   words again with single spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Swap the list's reference to the old word for the new one. */
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3021
3022static char center__doc__[] =
3023"S.center(width) -> unicode\n\
3024\n\
3025Return S centered in a Unicode string of length width. Padding is done\n\
3026using spaces.";
3027
3028static PyObject *
3029unicode_center(PyUnicodeObject *self, PyObject *args)
3030{
3031 int marg, left;
3032 int width;
3033
3034 if (!PyArg_ParseTuple(args, "i:center", &width))
3035 return NULL;
3036
3037 if (self->length >= width) {
3038 Py_INCREF(self);
3039 return (PyObject*) self;
3040 }
3041
3042 marg = width - self->length;
3043 left = marg / 2 + (marg & width & 1);
3044
3045 return (PyObject*) pad(self, left, marg - left, ' ');
3046}
3047
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003048/* speedy UTF-16 code point order comparison */
3049/* gleaned from: */
3050/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3051
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003052static unsigned long utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003053{
3054 0, 0, 0, 0, 0, 0, 0, 0,
3055 0, 0, 0, 0, 0, 0, 0, 0,
3056 0, 0, 0, 0, 0, 0, 0, 0,
3057 0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800
3058};
3059
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060static int
3061unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3062{
3063 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003064
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 Py_UNICODE *s1 = str1->str;
3066 Py_UNICODE *s2 = str2->str;
3067
3068 len1 = str1->length;
3069 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003070
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 while (len1 > 0 && len2 > 0) {
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003072 unsigned long c1, c2;
3073 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003074
3075 c1 = *s1++;
3076 c2 = *s2++;
3077 if (c1 > (1<<11) * 26)
3078 c1 += utf16Fixup[c1>>11];
3079 if (c2 > (1<<11) * 26)
3080 c2 += utf16Fixup[c2>>11];
3081
3082 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003083 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003084 if (diff)
3085 return (diff < 0) ? -1 : (diff != 0);
3086 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 }
3088
3089 return (len1 < len2) ? -1 : (len1 != len2);
3090}
3091
3092int PyUnicode_Compare(PyObject *left,
3093 PyObject *right)
3094{
3095 PyUnicodeObject *u = NULL, *v = NULL;
3096 int result;
3097
3098 /* Coerce the two arguments */
3099 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3100 if (u == NULL)
3101 goto onError;
3102 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3103 if (v == NULL)
3104 goto onError;
3105
3106 /* Shortcut for emtpy or interned objects */
3107 if (v == u) {
3108 Py_DECREF(u);
3109 Py_DECREF(v);
3110 return 0;
3111 }
3112
3113 result = unicode_compare(u, v);
3114
3115 Py_DECREF(u);
3116 Py_DECREF(v);
3117 return result;
3118
3119onError:
3120 Py_XDECREF(u);
3121 Py_XDECREF(v);
3122 return -1;
3123}
3124
Guido van Rossum403d68b2000-03-13 15:55:09 +00003125int PyUnicode_Contains(PyObject *container,
3126 PyObject *element)
3127{
3128 PyUnicodeObject *u = NULL, *v = NULL;
3129 int result;
3130 register const Py_UNICODE *p, *e;
3131 register Py_UNICODE ch;
3132
3133 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003134 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003135 if (v == NULL) {
3136 PyErr_SetString(PyExc_TypeError,
3137 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003138 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003139 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003140 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3141 if (u == NULL) {
3142 Py_DECREF(v);
3143 goto onError;
3144 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003145
3146 /* Check v in u */
3147 if (PyUnicode_GET_SIZE(v) != 1) {
3148 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003149 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003150 goto onError;
3151 }
3152 ch = *PyUnicode_AS_UNICODE(v);
3153 p = PyUnicode_AS_UNICODE(u);
3154 e = p + PyUnicode_GET_SIZE(u);
3155 result = 0;
3156 while (p < e) {
3157 if (*p++ == ch) {
3158 result = 1;
3159 break;
3160 }
3161 }
3162
3163 Py_DECREF(u);
3164 Py_DECREF(v);
3165 return result;
3166
3167onError:
3168 Py_XDECREF(u);
3169 Py_XDECREF(v);
3170 return -1;
3171}
3172
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173/* Concat to string or Unicode object giving a new Unicode object. */
3174
3175PyObject *PyUnicode_Concat(PyObject *left,
3176 PyObject *right)
3177{
3178 PyUnicodeObject *u = NULL, *v = NULL, *w;
3179
3180 /* Coerce the two arguments */
3181 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3182 if (u == NULL)
3183 goto onError;
3184 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3185 if (v == NULL)
3186 goto onError;
3187
3188 /* Shortcuts */
3189 if (v == unicode_empty) {
3190 Py_DECREF(v);
3191 return (PyObject *)u;
3192 }
3193 if (u == unicode_empty) {
3194 Py_DECREF(u);
3195 return (PyObject *)v;
3196 }
3197
3198 /* Concat the two Unicode strings */
3199 w = _PyUnicode_New(u->length + v->length);
3200 if (w == NULL)
3201 goto onError;
3202 Py_UNICODE_COPY(w->str, u->str, u->length);
3203 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3204
3205 Py_DECREF(u);
3206 Py_DECREF(v);
3207 return (PyObject *)w;
3208
3209onError:
3210 Py_XDECREF(u);
3211 Py_XDECREF(v);
3212 return NULL;
3213}
3214
3215static char count__doc__[] =
3216"S.count(sub[, start[, end]]) -> int\n\
3217\n\
3218Return the number of occurrences of substring sub in Unicode string\n\
3219S[start:end]. Optional arguments start and end are\n\
3220interpreted as in slice notation.";
3221
3222static PyObject *
3223unicode_count(PyUnicodeObject *self, PyObject *args)
3224{
3225 PyUnicodeObject *substring;
3226 int start = 0;
3227 int end = INT_MAX;
3228 PyObject *result;
3229
Guido van Rossumb8872e62000-05-09 14:14:27 +00003230 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3231 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232 return NULL;
3233
3234 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3235 (PyObject *)substring);
3236 if (substring == NULL)
3237 return NULL;
3238
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 if (start < 0)
3240 start += self->length;
3241 if (start < 0)
3242 start = 0;
3243 if (end > self->length)
3244 end = self->length;
3245 if (end < 0)
3246 end += self->length;
3247 if (end < 0)
3248 end = 0;
3249
3250 result = PyInt_FromLong((long) count(self, start, end, substring));
3251
3252 Py_DECREF(substring);
3253 return result;
3254}
3255
3256static char encode__doc__[] =
3257"S.encode([encoding[,errors]]) -> string\n\
3258\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003259Return an encoded string version of S. Default encoding is the current\n\
3260default string encoding. errors may be given to set a different error\n\
3261handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3262a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263
3264static PyObject *
3265unicode_encode(PyUnicodeObject *self, PyObject *args)
3266{
3267 char *encoding = NULL;
3268 char *errors = NULL;
3269 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3270 return NULL;
3271 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3272}
3273
3274static char expandtabs__doc__[] =
3275"S.expandtabs([tabsize]) -> unicode\n\
3276\n\
3277Return a copy of S where all tab characters are expanded using spaces.\n\
3278If tabsize is not given, a tab size of 8 characters is assumed.";
3279
3280static PyObject*
3281unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3282{
3283 Py_UNICODE *e;
3284 Py_UNICODE *p;
3285 Py_UNICODE *q;
3286 int i, j;
3287 PyUnicodeObject *u;
3288 int tabsize = 8;
3289
3290 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3291 return NULL;
3292
3293 /* First pass: determine size of ouput string */
3294 i = j = 0;
3295 e = self->str + self->length;
3296 for (p = self->str; p < e; p++)
3297 if (*p == '\t') {
3298 if (tabsize > 0)
3299 j += tabsize - (j % tabsize);
3300 }
3301 else {
3302 j++;
3303 if (*p == '\n' || *p == '\r') {
3304 i += j;
3305 j = 0;
3306 }
3307 }
3308
3309 /* Second pass: create output string and fill it */
3310 u = _PyUnicode_New(i + j);
3311 if (!u)
3312 return NULL;
3313
3314 j = 0;
3315 q = u->str;
3316
3317 for (p = self->str; p < e; p++)
3318 if (*p == '\t') {
3319 if (tabsize > 0) {
3320 i = tabsize - (j % tabsize);
3321 j += i;
3322 while (i--)
3323 *q++ = ' ';
3324 }
3325 }
3326 else {
3327 j++;
3328 *q++ = *p;
3329 if (*p == '\n' || *p == '\r')
3330 j = 0;
3331 }
3332
3333 return (PyObject*) u;
3334}
3335
3336static char find__doc__[] =
3337"S.find(sub [,start [,end]]) -> int\n\
3338\n\
3339Return the lowest index in S where substring sub is found,\n\
3340such that sub is contained within s[start,end]. Optional\n\
3341arguments start and end are interpreted as in slice notation.\n\
3342\n\
3343Return -1 on failure.";
3344
3345static PyObject *
3346unicode_find(PyUnicodeObject *self, PyObject *args)
3347{
3348 PyUnicodeObject *substring;
3349 int start = 0;
3350 int end = INT_MAX;
3351 PyObject *result;
3352
Guido van Rossumb8872e62000-05-09 14:14:27 +00003353 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3354 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355 return NULL;
3356 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3357 (PyObject *)substring);
3358 if (substring == NULL)
3359 return NULL;
3360
3361 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3362
3363 Py_DECREF(substring);
3364 return result;
3365}
3366
3367static PyObject *
3368unicode_getitem(PyUnicodeObject *self, int index)
3369{
3370 if (index < 0 || index >= self->length) {
3371 PyErr_SetString(PyExc_IndexError, "string index out of range");
3372 return NULL;
3373 }
3374
3375 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3376}
3377
3378static long
3379unicode_hash(PyUnicodeObject *self)
3380{
3381 long hash;
3382 PyObject *utf8;
3383
3384 /* Since Unicode objects compare equal to their UTF-8 string
3385 counterparts, they should also use the UTF-8 strings as basis
3386 for their hash value. This is needed to assure that strings and
3387 Unicode objects behave in the same way as dictionary
3388 keys. Unfortunately, this costs some performance and also some
3389 memory if the cached UTF-8 representation is not used later
3390 on. */
3391 if (self->hash != -1)
3392 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003393 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 if (utf8 == NULL)
3395 return -1;
3396 hash = PyObject_Hash(utf8);
3397 if (hash == -1)
3398 return -1;
3399 self->hash = hash;
3400 return hash;
3401}
3402
3403static char index__doc__[] =
3404"S.index(sub [,start [,end]]) -> int\n\
3405\n\
3406Like S.find() but raise ValueError when the substring is not found.";
3407
3408static PyObject *
3409unicode_index(PyUnicodeObject *self, PyObject *args)
3410{
3411 int result;
3412 PyUnicodeObject *substring;
3413 int start = 0;
3414 int end = INT_MAX;
3415
Guido van Rossumb8872e62000-05-09 14:14:27 +00003416 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3417 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 return NULL;
3419
3420 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3421 (PyObject *)substring);
3422 if (substring == NULL)
3423 return NULL;
3424
3425 result = findstring(self, substring, start, end, 1);
3426
3427 Py_DECREF(substring);
3428 if (result < 0) {
3429 PyErr_SetString(PyExc_ValueError, "substring not found");
3430 return NULL;
3431 }
3432 return PyInt_FromLong(result);
3433}
3434
3435static char islower__doc__[] =
3436"S.islower() -> int\n\
3437\n\
3438Return 1 if all cased characters in S are lowercase and there is\n\
3439at least one cased character in S, 0 otherwise.";
3440
3441static PyObject*
3442unicode_islower(PyUnicodeObject *self, PyObject *args)
3443{
3444 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3445 register const Py_UNICODE *e;
3446 int cased;
3447
3448 if (!PyArg_NoArgs(args))
3449 return NULL;
3450
3451 /* Shortcut for single character strings */
3452 if (PyUnicode_GET_SIZE(self) == 1)
3453 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3454
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003455 /* Special case for empty strings */
3456 if (PyString_GET_SIZE(self) == 0)
3457 return PyInt_FromLong(0);
3458
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 e = p + PyUnicode_GET_SIZE(self);
3460 cased = 0;
3461 for (; p < e; p++) {
3462 register const Py_UNICODE ch = *p;
3463
3464 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3465 return PyInt_FromLong(0);
3466 else if (!cased && Py_UNICODE_ISLOWER(ch))
3467 cased = 1;
3468 }
3469 return PyInt_FromLong(cased);
3470}
3471
3472static char isupper__doc__[] =
3473"S.isupper() -> int\n\
3474\n\
3475Return 1 if all cased characters in S are uppercase and there is\n\
3476at least one cased character in S, 0 otherwise.";
3477
3478static PyObject*
3479unicode_isupper(PyUnicodeObject *self, PyObject *args)
3480{
3481 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3482 register const Py_UNICODE *e;
3483 int cased;
3484
3485 if (!PyArg_NoArgs(args))
3486 return NULL;
3487
3488 /* Shortcut for single character strings */
3489 if (PyUnicode_GET_SIZE(self) == 1)
3490 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3491
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003492 /* Special case for empty strings */
3493 if (PyString_GET_SIZE(self) == 0)
3494 return PyInt_FromLong(0);
3495
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496 e = p + PyUnicode_GET_SIZE(self);
3497 cased = 0;
3498 for (; p < e; p++) {
3499 register const Py_UNICODE ch = *p;
3500
3501 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3502 return PyInt_FromLong(0);
3503 else if (!cased && Py_UNICODE_ISUPPER(ch))
3504 cased = 1;
3505 }
3506 return PyInt_FromLong(cased);
3507}
3508
3509static char istitle__doc__[] =
3510"S.istitle() -> int\n\
3511\n\
3512Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3513may only follow uncased characters and lowercase characters only cased\n\
3514ones. Return 0 otherwise.";
3515
3516static PyObject*
3517unicode_istitle(PyUnicodeObject *self, PyObject *args)
3518{
3519 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3520 register const Py_UNICODE *e;
3521 int cased, previous_is_cased;
3522
3523 if (!PyArg_NoArgs(args))
3524 return NULL;
3525
3526 /* Shortcut for single character strings */
3527 if (PyUnicode_GET_SIZE(self) == 1)
3528 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3529 (Py_UNICODE_ISUPPER(*p) != 0));
3530
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003531 /* Special case for empty strings */
3532 if (PyString_GET_SIZE(self) == 0)
3533 return PyInt_FromLong(0);
3534
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 e = p + PyUnicode_GET_SIZE(self);
3536 cased = 0;
3537 previous_is_cased = 0;
3538 for (; p < e; p++) {
3539 register const Py_UNICODE ch = *p;
3540
3541 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3542 if (previous_is_cased)
3543 return PyInt_FromLong(0);
3544 previous_is_cased = 1;
3545 cased = 1;
3546 }
3547 else if (Py_UNICODE_ISLOWER(ch)) {
3548 if (!previous_is_cased)
3549 return PyInt_FromLong(0);
3550 previous_is_cased = 1;
3551 cased = 1;
3552 }
3553 else
3554 previous_is_cased = 0;
3555 }
3556 return PyInt_FromLong(cased);
3557}
3558
3559static char isspace__doc__[] =
3560"S.isspace() -> int\n\
3561\n\
3562Return 1 if there are only whitespace characters in S,\n\
35630 otherwise.";
3564
3565static PyObject*
3566unicode_isspace(PyUnicodeObject *self, PyObject *args)
3567{
3568 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3569 register const Py_UNICODE *e;
3570
3571 if (!PyArg_NoArgs(args))
3572 return NULL;
3573
3574 /* Shortcut for single character strings */
3575 if (PyUnicode_GET_SIZE(self) == 1 &&
3576 Py_UNICODE_ISSPACE(*p))
3577 return PyInt_FromLong(1);
3578
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003579 /* Special case for empty strings */
3580 if (PyString_GET_SIZE(self) == 0)
3581 return PyInt_FromLong(0);
3582
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 e = p + PyUnicode_GET_SIZE(self);
3584 for (; p < e; p++) {
3585 if (!Py_UNICODE_ISSPACE(*p))
3586 return PyInt_FromLong(0);
3587 }
3588 return PyInt_FromLong(1);
3589}
3590
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003591static char isalpha__doc__[] =
3592"S.isalpha() -> int\n\
3593\n\
3594Return 1 if all characters in S are alphabetic\n\
3595and there is at least one character in S, 0 otherwise.";
3596
3597static PyObject*
3598unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3599{
3600 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3601 register const Py_UNICODE *e;
3602
3603 if (!PyArg_NoArgs(args))
3604 return NULL;
3605
3606 /* Shortcut for single character strings */
3607 if (PyUnicode_GET_SIZE(self) == 1 &&
3608 Py_UNICODE_ISALPHA(*p))
3609 return PyInt_FromLong(1);
3610
3611 /* Special case for empty strings */
3612 if (PyString_GET_SIZE(self) == 0)
3613 return PyInt_FromLong(0);
3614
3615 e = p + PyUnicode_GET_SIZE(self);
3616 for (; p < e; p++) {
3617 if (!Py_UNICODE_ISALPHA(*p))
3618 return PyInt_FromLong(0);
3619 }
3620 return PyInt_FromLong(1);
3621}
3622
3623static char isalnum__doc__[] =
3624"S.isalnum() -> int\n\
3625\n\
3626Return 1 if all characters in S are alphanumeric\n\
3627and there is at least one character in S, 0 otherwise.";
3628
3629static PyObject*
3630unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3631{
3632 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3633 register const Py_UNICODE *e;
3634
3635 if (!PyArg_NoArgs(args))
3636 return NULL;
3637
3638 /* Shortcut for single character strings */
3639 if (PyUnicode_GET_SIZE(self) == 1 &&
3640 Py_UNICODE_ISALNUM(*p))
3641 return PyInt_FromLong(1);
3642
3643 /* Special case for empty strings */
3644 if (PyString_GET_SIZE(self) == 0)
3645 return PyInt_FromLong(0);
3646
3647 e = p + PyUnicode_GET_SIZE(self);
3648 for (; p < e; p++) {
3649 if (!Py_UNICODE_ISALNUM(*p))
3650 return PyInt_FromLong(0);
3651 }
3652 return PyInt_FromLong(1);
3653}
3654
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655static char isdecimal__doc__[] =
3656"S.isdecimal() -> int\n\
3657\n\
3658Return 1 if there are only decimal characters in S,\n\
36590 otherwise.";
3660
3661static PyObject*
3662unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3663{
3664 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3665 register const Py_UNICODE *e;
3666
3667 if (!PyArg_NoArgs(args))
3668 return NULL;
3669
3670 /* Shortcut for single character strings */
3671 if (PyUnicode_GET_SIZE(self) == 1 &&
3672 Py_UNICODE_ISDECIMAL(*p))
3673 return PyInt_FromLong(1);
3674
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003675 /* Special case for empty strings */
3676 if (PyString_GET_SIZE(self) == 0)
3677 return PyInt_FromLong(0);
3678
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 e = p + PyUnicode_GET_SIZE(self);
3680 for (; p < e; p++) {
3681 if (!Py_UNICODE_ISDECIMAL(*p))
3682 return PyInt_FromLong(0);
3683 }
3684 return PyInt_FromLong(1);
3685}
3686
3687static char isdigit__doc__[] =
3688"S.isdigit() -> int\n\
3689\n\
3690Return 1 if there are only digit characters in S,\n\
36910 otherwise.";
3692
3693static PyObject*
3694unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3695{
3696 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3697 register const Py_UNICODE *e;
3698
3699 if (!PyArg_NoArgs(args))
3700 return NULL;
3701
3702 /* Shortcut for single character strings */
3703 if (PyUnicode_GET_SIZE(self) == 1 &&
3704 Py_UNICODE_ISDIGIT(*p))
3705 return PyInt_FromLong(1);
3706
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003707 /* Special case for empty strings */
3708 if (PyString_GET_SIZE(self) == 0)
3709 return PyInt_FromLong(0);
3710
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 e = p + PyUnicode_GET_SIZE(self);
3712 for (; p < e; p++) {
3713 if (!Py_UNICODE_ISDIGIT(*p))
3714 return PyInt_FromLong(0);
3715 }
3716 return PyInt_FromLong(1);
3717}
3718
3719static char isnumeric__doc__[] =
3720"S.isnumeric() -> int\n\
3721\n\
3722Return 1 if there are only numeric characters in S,\n\
37230 otherwise.";
3724
3725static PyObject*
3726unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3727{
3728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3729 register const Py_UNICODE *e;
3730
3731 if (!PyArg_NoArgs(args))
3732 return NULL;
3733
3734 /* Shortcut for single character strings */
3735 if (PyUnicode_GET_SIZE(self) == 1 &&
3736 Py_UNICODE_ISNUMERIC(*p))
3737 return PyInt_FromLong(1);
3738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003739 /* Special case for empty strings */
3740 if (PyString_GET_SIZE(self) == 0)
3741 return PyInt_FromLong(0);
3742
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 e = p + PyUnicode_GET_SIZE(self);
3744 for (; p < e; p++) {
3745 if (!Py_UNICODE_ISNUMERIC(*p))
3746 return PyInt_FromLong(0);
3747 }
3748 return PyInt_FromLong(1);
3749}
3750
3751static char join__doc__[] =
3752"S.join(sequence) -> unicode\n\
3753\n\
3754Return a string which is the concatenation of the strings in the\n\
3755sequence. The separator between elements is S.";
3756
3757static PyObject*
3758unicode_join(PyUnicodeObject *self, PyObject *args)
3759{
3760 PyObject *data;
3761 if (!PyArg_ParseTuple(args, "O:join", &data))
3762 return NULL;
3763
3764 return PyUnicode_Join((PyObject *)self, data);
3765}
3766
3767static int
3768unicode_length(PyUnicodeObject *self)
3769{
3770 return self->length;
3771}
3772
3773static char ljust__doc__[] =
3774"S.ljust(width) -> unicode\n\
3775\n\
3776Return S left justified in a Unicode string of length width. Padding is\n\
3777done using spaces.";
3778
3779static PyObject *
3780unicode_ljust(PyUnicodeObject *self, PyObject *args)
3781{
3782 int width;
3783 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3784 return NULL;
3785
3786 if (self->length >= width) {
3787 Py_INCREF(self);
3788 return (PyObject*) self;
3789 }
3790
3791 return (PyObject*) pad(self, 0, width - self->length, ' ');
3792}
3793
3794static char lower__doc__[] =
3795"S.lower() -> unicode\n\
3796\n\
3797Return a copy of the string S converted to lowercase.";
3798
3799static PyObject*
3800unicode_lower(PyUnicodeObject *self, PyObject *args)
3801{
3802 if (!PyArg_NoArgs(args))
3803 return NULL;
3804 return fixup(self, fixlower);
3805}
3806
3807static char lstrip__doc__[] =
3808"S.lstrip() -> unicode\n\
3809\n\
3810Return a copy of the string S with leading whitespace removed.";
3811
3812static PyObject *
3813unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3814{
3815 if (!PyArg_NoArgs(args))
3816 return NULL;
3817 return strip(self, 1, 0);
3818}
3819
3820static PyObject*
3821unicode_repeat(PyUnicodeObject *str, int len)
3822{
3823 PyUnicodeObject *u;
3824 Py_UNICODE *p;
3825
3826 if (len < 0)
3827 len = 0;
3828
3829 if (len == 1) {
3830 /* no repeat, return original string */
3831 Py_INCREF(str);
3832 return (PyObject*) str;
3833 }
3834
3835 u = _PyUnicode_New(len * str->length);
3836 if (!u)
3837 return NULL;
3838
3839 p = u->str;
3840
3841 while (len-- > 0) {
3842 Py_UNICODE_COPY(p, str->str, str->length);
3843 p += str->length;
3844 }
3845
3846 return (PyObject*) u;
3847}
3848
3849PyObject *PyUnicode_Replace(PyObject *obj,
3850 PyObject *subobj,
3851 PyObject *replobj,
3852 int maxcount)
3853{
3854 PyObject *self;
3855 PyObject *str1;
3856 PyObject *str2;
3857 PyObject *result;
3858
3859 self = PyUnicode_FromObject(obj);
3860 if (self == NULL)
3861 return NULL;
3862 str1 = PyUnicode_FromObject(subobj);
3863 if (str1 == NULL) {
3864 Py_DECREF(self);
3865 return NULL;
3866 }
3867 str2 = PyUnicode_FromObject(replobj);
3868 if (str2 == NULL) {
3869 Py_DECREF(self);
3870 Py_DECREF(str1);
3871 return NULL;
3872 }
3873 result = replace((PyUnicodeObject *)self,
3874 (PyUnicodeObject *)str1,
3875 (PyUnicodeObject *)str2,
3876 maxcount);
3877 Py_DECREF(self);
3878 Py_DECREF(str1);
3879 Py_DECREF(str2);
3880 return result;
3881}
3882
3883static char replace__doc__[] =
3884"S.replace (old, new[, maxsplit]) -> unicode\n\
3885\n\
3886Return a copy of S with all occurrences of substring\n\
3887old replaced by new. If the optional argument maxsplit is\n\
3888given, only the first maxsplit occurrences are replaced.";
3889
3890static PyObject*
3891unicode_replace(PyUnicodeObject *self, PyObject *args)
3892{
3893 PyUnicodeObject *str1;
3894 PyUnicodeObject *str2;
3895 int maxcount = -1;
3896 PyObject *result;
3897
3898 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3899 return NULL;
3900 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3901 if (str1 == NULL)
3902 return NULL;
3903 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3904 if (str2 == NULL)
3905 return NULL;
3906
3907 result = replace(self, str1, str2, maxcount);
3908
3909 Py_DECREF(str1);
3910 Py_DECREF(str2);
3911 return result;
3912}
3913
3914static
3915PyObject *unicode_repr(PyObject *unicode)
3916{
3917 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3918 PyUnicode_GET_SIZE(unicode),
3919 1);
3920}
3921
3922static char rfind__doc__[] =
3923"S.rfind(sub [,start [,end]]) -> int\n\
3924\n\
3925Return the highest index in S where substring sub is found,\n\
3926such that sub is contained within s[start,end]. Optional\n\
3927arguments start and end are interpreted as in slice notation.\n\
3928\n\
3929Return -1 on failure.";
3930
3931static PyObject *
3932unicode_rfind(PyUnicodeObject *self, PyObject *args)
3933{
3934 PyUnicodeObject *substring;
3935 int start = 0;
3936 int end = INT_MAX;
3937 PyObject *result;
3938
Guido van Rossumb8872e62000-05-09 14:14:27 +00003939 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
3940 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 return NULL;
3942 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3943 (PyObject *)substring);
3944 if (substring == NULL)
3945 return NULL;
3946
3947 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3948
3949 Py_DECREF(substring);
3950 return result;
3951}
3952
3953static char rindex__doc__[] =
3954"S.rindex(sub [,start [,end]]) -> int\n\
3955\n\
3956Like S.rfind() but raise ValueError when the substring is not found.";
3957
3958static PyObject *
3959unicode_rindex(PyUnicodeObject *self, PyObject *args)
3960{
3961 int result;
3962 PyUnicodeObject *substring;
3963 int start = 0;
3964 int end = INT_MAX;
3965
Guido van Rossumb8872e62000-05-09 14:14:27 +00003966 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
3967 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 return NULL;
3969 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3970 (PyObject *)substring);
3971 if (substring == NULL)
3972 return NULL;
3973
3974 result = findstring(self, substring, start, end, -1);
3975
3976 Py_DECREF(substring);
3977 if (result < 0) {
3978 PyErr_SetString(PyExc_ValueError, "substring not found");
3979 return NULL;
3980 }
3981 return PyInt_FromLong(result);
3982}
3983
3984static char rjust__doc__[] =
3985"S.rjust(width) -> unicode\n\
3986\n\
3987Return S right justified in a Unicode string of length width. Padding is\n\
3988done using spaces.";
3989
3990static PyObject *
3991unicode_rjust(PyUnicodeObject *self, PyObject *args)
3992{
3993 int width;
3994 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3995 return NULL;
3996
3997 if (self->length >= width) {
3998 Py_INCREF(self);
3999 return (PyObject*) self;
4000 }
4001
4002 return (PyObject*) pad(self, width - self->length, 0, ' ');
4003}
4004
4005static char rstrip__doc__[] =
4006"S.rstrip() -> unicode\n\
4007\n\
4008Return a copy of the string S with trailing whitespace removed.";
4009
4010static PyObject *
4011unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4012{
4013 if (!PyArg_NoArgs(args))
4014 return NULL;
4015 return strip(self, 0, 1);
4016}
4017
4018static PyObject*
4019unicode_slice(PyUnicodeObject *self, int start, int end)
4020{
4021 /* standard clamping */
4022 if (start < 0)
4023 start = 0;
4024 if (end < 0)
4025 end = 0;
4026 if (end > self->length)
4027 end = self->length;
4028 if (start == 0 && end == self->length) {
4029 /* full slice, return original string */
4030 Py_INCREF(self);
4031 return (PyObject*) self;
4032 }
4033 if (start > end)
4034 start = end;
4035 /* copy slice */
4036 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4037 end - start);
4038}
4039
4040PyObject *PyUnicode_Split(PyObject *s,
4041 PyObject *sep,
4042 int maxsplit)
4043{
4044 PyObject *result;
4045
4046 s = PyUnicode_FromObject(s);
4047 if (s == NULL)
4048 return NULL;
4049 if (sep != NULL) {
4050 sep = PyUnicode_FromObject(sep);
4051 if (sep == NULL) {
4052 Py_DECREF(s);
4053 return NULL;
4054 }
4055 }
4056
4057 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4058
4059 Py_DECREF(s);
4060 Py_XDECREF(sep);
4061 return result;
4062}
4063
4064static char split__doc__[] =
4065"S.split([sep [,maxsplit]]) -> list of strings\n\
4066\n\
4067Return a list of the words in S, using sep as the\n\
4068delimiter string. If maxsplit is given, at most maxsplit\n\
4069splits are done. If sep is not specified, any whitespace string\n\
4070is a separator.";
4071
4072static PyObject*
4073unicode_split(PyUnicodeObject *self, PyObject *args)
4074{
4075 PyObject *substring = Py_None;
4076 int maxcount = -1;
4077
4078 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4079 return NULL;
4080
4081 if (substring == Py_None)
4082 return split(self, NULL, maxcount);
4083 else if (PyUnicode_Check(substring))
4084 return split(self, (PyUnicodeObject *)substring, maxcount);
4085 else
4086 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4087}
4088
4089static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004090"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091\n\
4092Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004093Line breaks are not included in the resulting list unless keepends\n\
4094is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095
4096static PyObject*
4097unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4098{
Guido van Rossum86662912000-04-11 15:38:46 +00004099 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100
Guido van Rossum86662912000-04-11 15:38:46 +00004101 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 return NULL;
4103
Guido van Rossum86662912000-04-11 15:38:46 +00004104 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105}
4106
4107static
4108PyObject *unicode_str(PyUnicodeObject *self)
4109{
Fred Drakee4315f52000-05-09 19:53:39 +00004110 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111}
4112
4113static char strip__doc__[] =
4114"S.strip() -> unicode\n\
4115\n\
4116Return a copy of S with leading and trailing whitespace removed.";
4117
4118static PyObject *
4119unicode_strip(PyUnicodeObject *self, PyObject *args)
4120{
4121 if (!PyArg_NoArgs(args))
4122 return NULL;
4123 return strip(self, 1, 1);
4124}
4125
4126static char swapcase__doc__[] =
4127"S.swapcase() -> unicode\n\
4128\n\
4129Return a copy of S with uppercase characters converted to lowercase\n\
4130and vice versa.";
4131
4132static PyObject*
4133unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4134{
4135 if (!PyArg_NoArgs(args))
4136 return NULL;
4137 return fixup(self, fixswapcase);
4138}
4139
4140static char translate__doc__[] =
4141"S.translate(table) -> unicode\n\
4142\n\
4143Return a copy of the string S, where all characters have been mapped\n\
4144through the given translation table, which must be a mapping of\n\
4145Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4146are left untouched. Characters mapped to None are deleted.";
4147
4148static PyObject*
4149unicode_translate(PyUnicodeObject *self, PyObject *args)
4150{
4151 PyObject *table;
4152
4153 if (!PyArg_ParseTuple(args, "O:translate", &table))
4154 return NULL;
4155 return PyUnicode_TranslateCharmap(self->str,
4156 self->length,
4157 table,
4158 "ignore");
4159}
4160
4161static char upper__doc__[] =
4162"S.upper() -> unicode\n\
4163\n\
4164Return a copy of S converted to uppercase.";
4165
4166static PyObject*
4167unicode_upper(PyUnicodeObject *self, PyObject *args)
4168{
4169 if (!PyArg_NoArgs(args))
4170 return NULL;
4171 return fixup(self, fixupper);
4172}
4173
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) (currently compiled out): left-pads self with '0' up
   to width via pad().  If the first original character is a sign, it
   is moved in front of the padding.  Returns self unchanged when it is
   already wide enough, and NULL if parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() can fail; do not dereference a NULL result below */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4209
#if 0
/* Debugging aid (compiled out): returns the current value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4219
4220static char startswith__doc__[] =
4221"S.startswith(prefix[, start[, end]]) -> int\n\
4222\n\
4223Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4224optional start, test S beginning at that position. With optional end, stop\n\
4225comparing S at that position.";
4226
4227static PyObject *
4228unicode_startswith(PyUnicodeObject *self,
4229 PyObject *args)
4230{
4231 PyUnicodeObject *substring;
4232 int start = 0;
4233 int end = INT_MAX;
4234 PyObject *result;
4235
Guido van Rossumb8872e62000-05-09 14:14:27 +00004236 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4237 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 return NULL;
4239 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4240 (PyObject *)substring);
4241 if (substring == NULL)
4242 return NULL;
4243
4244 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4245
4246 Py_DECREF(substring);
4247 return result;
4248}
4249
4250
4251static char endswith__doc__[] =
4252"S.endswith(suffix[, start[, end]]) -> int\n\
4253\n\
4254Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4255optional start, test S beginning at that position. With optional end, stop\n\
4256comparing S at that position.";
4257
4258static PyObject *
4259unicode_endswith(PyUnicodeObject *self,
4260 PyObject *args)
4261{
4262 PyUnicodeObject *substring;
4263 int start = 0;
4264 int end = INT_MAX;
4265 PyObject *result;
4266
Guido van Rossumb8872e62000-05-09 14:14:27 +00004267 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4268 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269 return NULL;
4270 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4271 (PyObject *)substring);
4272 if (substring == NULL)
4273 return NULL;
4274
4275 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4276
4277 Py_DECREF(substring);
4278 return result;
4279}
4280
4281
4282static PyMethodDef unicode_methods[] = {
4283
4284 /* Order is according to common usage: often used methods should
4285 appear first, since lookup is done sequentially. */
4286
4287 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4288 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4289 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4290 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4291 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4292 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4293 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4294 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4295 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4296 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4297 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4298 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4299 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4300 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4301/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4302 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4303 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4304 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4305 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4306 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4307 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4308 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4309 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4310 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4311 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4312 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4313 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4314 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4315 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4316 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4317 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4318 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4319 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004320 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4321 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322#if 0
4323 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4324 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4325#endif
4326
4327#if 0
4328 /* This one is just used for debugging the implementation. */
4329 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4330#endif
4331
4332 {NULL, NULL}
4333};
4334
4335static PyObject *
4336unicode_getattr(PyUnicodeObject *self, char *name)
4337{
4338 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4339}
4340
4341static PySequenceMethods unicode_as_sequence = {
4342 (inquiry) unicode_length, /* sq_length */
4343 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4344 (intargfunc) unicode_repeat, /* sq_repeat */
4345 (intargfunc) unicode_getitem, /* sq_item */
4346 (intintargfunc) unicode_slice, /* sq_slice */
4347 0, /* sq_ass_item */
4348 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004349 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350};
4351
4352static int
4353unicode_buffer_getreadbuf(PyUnicodeObject *self,
4354 int index,
4355 const void **ptr)
4356{
4357 if (index != 0) {
4358 PyErr_SetString(PyExc_SystemError,
4359 "accessing non-existent unicode segment");
4360 return -1;
4361 }
4362 *ptr = (void *) self->str;
4363 return PyUnicode_GET_DATA_SIZE(self);
4364}
4365
4366static int
4367unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4368 const void **ptr)
4369{
4370 PyErr_SetString(PyExc_TypeError,
4371 "cannot use unicode as modifyable buffer");
4372 return -1;
4373}
4374
4375static int
4376unicode_buffer_getsegcount(PyUnicodeObject *self,
4377 int *lenp)
4378{
4379 if (lenp)
4380 *lenp = PyUnicode_GET_DATA_SIZE(self);
4381 return 1;
4382}
4383
4384static int
4385unicode_buffer_getcharbuf(PyUnicodeObject *self,
4386 int index,
4387 const void **ptr)
4388{
4389 PyObject *str;
4390
4391 if (index != 0) {
4392 PyErr_SetString(PyExc_SystemError,
4393 "accessing non-existent unicode segment");
4394 return -1;
4395 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004396 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 if (str == NULL)
4398 return -1;
4399 *ptr = (void *) PyString_AS_STRING(str);
4400 return PyString_GET_SIZE(str);
4401}
4402
4403/* Helpers for PyUnicode_Format() */
4404
4405static PyObject *
4406getnextarg(args, arglen, p_argidx)
4407 PyObject *args;
4408int arglen;
4409int *p_argidx;
4410{
4411 int argidx = *p_argidx;
4412 if (argidx < arglen) {
4413 (*p_argidx)++;
4414 if (arglen < 0)
4415 return args;
4416 else
4417 return PyTuple_GetItem(args, argidx);
4418 }
4419 PyErr_SetString(PyExc_TypeError,
4420 "not enough arguments for format string");
4421 return NULL;
4422}
4423
/* Format flag bits, one bit each; tested with `flags & F_xxx` by the
   format helpers. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4429
4430static
4431#ifdef HAVE_STDARG_PROTOTYPES
4432int usprintf(register Py_UNICODE *buffer, char *format, ...)
4433#else
4434int usprintf(va_alist) va_dcl
4435#endif
4436{
4437 register int i;
4438 int len;
4439 va_list va;
4440 char *charbuffer;
4441#ifdef HAVE_STDARG_PROTOTYPES
4442 va_start(va, format);
4443#else
4444 Py_UNICODE *args;
4445 char *format;
4446
4447 va_start(va);
4448 buffer = va_arg(va, Py_UNICODE *);
4449 format = va_arg(va, char *);
4450#endif
4451
4452 /* First, format the string as char array, then expand to Py_UNICODE
4453 array. */
4454 charbuffer = (char *)buffer;
4455 len = vsprintf(charbuffer, format, va);
4456 for (i = len - 1; i >= 0; i--)
4457 buffer[i] = (Py_UNICODE) charbuffer[i];
4458
4459 va_end(va);
4460 return len;
4461}
4462
4463static int
4464formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004465 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 int flags,
4467 int prec,
4468 int type,
4469 PyObject *v)
4470{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004471 /* fmt = '%#.' + `prec` + `type`
4472 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 char fmt[20];
4474 double x;
4475
4476 x = PyFloat_AsDouble(v);
4477 if (x == -1.0 && PyErr_Occurred())
4478 return -1;
4479 if (prec < 0)
4480 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4482 type = 'g';
4483 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004484 /* worst case length calc to ensure no buffer overrun:
4485 fmt = %#.<prec>g
4486 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4487 for any double rep.)
4488 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4489 If prec=0 the effective precision is 1 (the leading digit is
4490 always given), therefore increase by one to 10+prec. */
4491 if (buflen <= (size_t)10 + (size_t)prec) {
4492 PyErr_SetString(PyExc_OverflowError,
4493 "formatted float is too long (precision too long?)");
4494 return -1;
4495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 return usprintf(buf, fmt, x);
4497}
4498
4499static int
4500formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004501 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 int flags,
4503 int prec,
4504 int type,
4505 PyObject *v)
4506{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004507 /* fmt = '%#.' + `prec` + 'l' + `type`
4508 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 char fmt[20];
4510 long x;
4511
4512 x = PyInt_AsLong(v);
4513 if (x == -1 && PyErr_Occurred())
4514 return -1;
4515 if (prec < 0)
4516 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004517 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4518 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4519 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4520 PyErr_SetString(PyExc_OverflowError,
4521 "formatted integer is too long (precision too long?)");
4522 return -1;
4523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4525 return usprintf(buf, fmt, x);
4526}
4527
4528static int
4529formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004530 size_t buflen,
4531 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004533 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004534 if (PyUnicode_Check(v)) {
4535 if (PyUnicode_GET_SIZE(v) != 1)
4536 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004540 else if (PyString_Check(v)) {
4541 if (PyString_GET_SIZE(v) != 1)
4542 goto onError;
4543 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545
4546 else {
4547 /* Integer input truncated to a character */
4548 long x;
4549 x = PyInt_AsLong(v);
4550 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004551 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 buf[0] = (char) x;
4553 }
4554 buf[1] = '\0';
4555 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004556
4557 onError:
4558 PyErr_SetString(PyExc_TypeError,
4559 "%c requires int or char");
4560 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561}
4562
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004563/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4564
4565 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4566 chars are formatted. XXX This is a magic number. Each formatting
4567 routine does bounds checking to ensure no overflow, but a better
4568 solution may be to malloc a buffer of appropriate size for each
4569 format. For now, the current solution is sufficient.
4570*/
4571#define FORMATBUFLEN (size_t)120
4572
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573PyObject *PyUnicode_Format(PyObject *format,
4574 PyObject *args)
4575{
4576 Py_UNICODE *fmt, *res;
4577 int fmtcnt, rescnt, reslen, arglen, argidx;
4578 int args_owned = 0;
4579 PyUnicodeObject *result = NULL;
4580 PyObject *dict = NULL;
4581 PyObject *uformat;
4582
4583 if (format == NULL || args == NULL) {
4584 PyErr_BadInternalCall();
4585 return NULL;
4586 }
4587 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004588 if (uformat == NULL)
4589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 fmt = PyUnicode_AS_UNICODE(uformat);
4591 fmtcnt = PyUnicode_GET_SIZE(uformat);
4592
4593 reslen = rescnt = fmtcnt + 100;
4594 result = _PyUnicode_New(reslen);
4595 if (result == NULL)
4596 goto onError;
4597 res = PyUnicode_AS_UNICODE(result);
4598
4599 if (PyTuple_Check(args)) {
4600 arglen = PyTuple_Size(args);
4601 argidx = 0;
4602 }
4603 else {
4604 arglen = -1;
4605 argidx = -2;
4606 }
4607 if (args->ob_type->tp_as_mapping)
4608 dict = args;
4609
4610 while (--fmtcnt >= 0) {
4611 if (*fmt != '%') {
4612 if (--rescnt < 0) {
4613 rescnt = fmtcnt + 100;
4614 reslen += rescnt;
4615 if (_PyUnicode_Resize(result, reslen) < 0)
4616 return NULL;
4617 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4618 --rescnt;
4619 }
4620 *res++ = *fmt++;
4621 }
4622 else {
4623 /* Got a format specifier */
4624 int flags = 0;
4625 int width = -1;
4626 int prec = -1;
4627 int size = 0;
4628 Py_UNICODE c = '\0';
4629 Py_UNICODE fill;
4630 PyObject *v = NULL;
4631 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004632 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633 Py_UNICODE sign;
4634 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004635 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636
4637 fmt++;
4638 if (*fmt == '(') {
4639 Py_UNICODE *keystart;
4640 int keylen;
4641 PyObject *key;
4642 int pcount = 1;
4643
4644 if (dict == NULL) {
4645 PyErr_SetString(PyExc_TypeError,
4646 "format requires a mapping");
4647 goto onError;
4648 }
4649 ++fmt;
4650 --fmtcnt;
4651 keystart = fmt;
4652 /* Skip over balanced parentheses */
4653 while (pcount > 0 && --fmtcnt >= 0) {
4654 if (*fmt == ')')
4655 --pcount;
4656 else if (*fmt == '(')
4657 ++pcount;
4658 fmt++;
4659 }
4660 keylen = fmt - keystart - 1;
4661 if (fmtcnt < 0 || pcount > 0) {
4662 PyErr_SetString(PyExc_ValueError,
4663 "incomplete format key");
4664 goto onError;
4665 }
Fred Drakee4315f52000-05-09 19:53:39 +00004666 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 then looked up since Python uses strings to hold
4668 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004669 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670 key = PyUnicode_EncodeUTF8(keystart,
4671 keylen,
4672 NULL);
4673 if (key == NULL)
4674 goto onError;
4675 if (args_owned) {
4676 Py_DECREF(args);
4677 args_owned = 0;
4678 }
4679 args = PyObject_GetItem(dict, key);
4680 Py_DECREF(key);
4681 if (args == NULL) {
4682 goto onError;
4683 }
4684 args_owned = 1;
4685 arglen = -1;
4686 argidx = -2;
4687 }
4688 while (--fmtcnt >= 0) {
4689 switch (c = *fmt++) {
4690 case '-': flags |= F_LJUST; continue;
4691 case '+': flags |= F_SIGN; continue;
4692 case ' ': flags |= F_BLANK; continue;
4693 case '#': flags |= F_ALT; continue;
4694 case '0': flags |= F_ZERO; continue;
4695 }
4696 break;
4697 }
4698 if (c == '*') {
4699 v = getnextarg(args, arglen, &argidx);
4700 if (v == NULL)
4701 goto onError;
4702 if (!PyInt_Check(v)) {
4703 PyErr_SetString(PyExc_TypeError,
4704 "* wants int");
4705 goto onError;
4706 }
4707 width = PyInt_AsLong(v);
4708 if (width < 0) {
4709 flags |= F_LJUST;
4710 width = -width;
4711 }
4712 if (--fmtcnt >= 0)
4713 c = *fmt++;
4714 }
4715 else if (c >= '0' && c <= '9') {
4716 width = c - '0';
4717 while (--fmtcnt >= 0) {
4718 c = *fmt++;
4719 if (c < '0' || c > '9')
4720 break;
4721 if ((width*10) / 10 != width) {
4722 PyErr_SetString(PyExc_ValueError,
4723 "width too big");
4724 goto onError;
4725 }
4726 width = width*10 + (c - '0');
4727 }
4728 }
4729 if (c == '.') {
4730 prec = 0;
4731 if (--fmtcnt >= 0)
4732 c = *fmt++;
4733 if (c == '*') {
4734 v = getnextarg(args, arglen, &argidx);
4735 if (v == NULL)
4736 goto onError;
4737 if (!PyInt_Check(v)) {
4738 PyErr_SetString(PyExc_TypeError,
4739 "* wants int");
4740 goto onError;
4741 }
4742 prec = PyInt_AsLong(v);
4743 if (prec < 0)
4744 prec = 0;
4745 if (--fmtcnt >= 0)
4746 c = *fmt++;
4747 }
4748 else if (c >= '0' && c <= '9') {
4749 prec = c - '0';
4750 while (--fmtcnt >= 0) {
4751 c = Py_CHARMASK(*fmt++);
4752 if (c < '0' || c > '9')
4753 break;
4754 if ((prec*10) / 10 != prec) {
4755 PyErr_SetString(PyExc_ValueError,
4756 "prec too big");
4757 goto onError;
4758 }
4759 prec = prec*10 + (c - '0');
4760 }
4761 }
4762 } /* prec */
4763 if (fmtcnt >= 0) {
4764 if (c == 'h' || c == 'l' || c == 'L') {
4765 size = c;
4766 if (--fmtcnt >= 0)
4767 c = *fmt++;
4768 }
4769 }
4770 if (fmtcnt < 0) {
4771 PyErr_SetString(PyExc_ValueError,
4772 "incomplete format");
4773 goto onError;
4774 }
4775 if (c != '%') {
4776 v = getnextarg(args, arglen, &argidx);
4777 if (v == NULL)
4778 goto onError;
4779 }
4780 sign = 0;
4781 fill = ' ';
4782 switch (c) {
4783
4784 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004785 pbuf = formatbuf;
4786 /* presume that buffer length is at least 1 */
4787 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 len = 1;
4789 break;
4790
4791 case 's':
4792 case 'r':
4793 if (PyUnicode_Check(v) && c == 's') {
4794 temp = v;
4795 Py_INCREF(temp);
4796 }
4797 else {
4798 PyObject *unicode;
4799 if (c == 's')
4800 temp = PyObject_Str(v);
4801 else
4802 temp = PyObject_Repr(v);
4803 if (temp == NULL)
4804 goto onError;
4805 if (!PyString_Check(temp)) {
4806 /* XXX Note: this should never happen, since
4807 PyObject_Repr() and PyObject_Str() assure
4808 this */
4809 Py_DECREF(temp);
4810 PyErr_SetString(PyExc_TypeError,
4811 "%s argument has non-string str()");
4812 goto onError;
4813 }
Fred Drakee4315f52000-05-09 19:53:39 +00004814 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004816 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 "strict");
4818 Py_DECREF(temp);
4819 temp = unicode;
4820 if (temp == NULL)
4821 goto onError;
4822 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004823 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 len = PyUnicode_GET_SIZE(temp);
4825 if (prec >= 0 && len > prec)
4826 len = prec;
4827 break;
4828
4829 case 'i':
4830 case 'd':
4831 case 'u':
4832 case 'o':
4833 case 'x':
4834 case 'X':
4835 if (c == 'i')
4836 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004837 pbuf = formatbuf;
4838 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4839 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 if (len < 0)
4841 goto onError;
4842 sign = (c == 'd');
4843 if (flags & F_ZERO) {
4844 fill = '0';
4845 if ((flags&F_ALT) &&
4846 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004847 pbuf[0] == '0' && pbuf[1] == c) {
4848 *res++ = *pbuf++;
4849 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 rescnt -= 2;
4851 len -= 2;
4852 width -= 2;
4853 if (width < 0)
4854 width = 0;
4855 }
4856 }
4857 break;
4858
4859 case 'e':
4860 case 'E':
4861 case 'f':
4862 case 'g':
4863 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004864 pbuf = formatbuf;
4865 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4866 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 if (len < 0)
4868 goto onError;
4869 sign = 1;
4870 if (flags&F_ZERO)
4871 fill = '0';
4872 break;
4873
4874 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004875 pbuf = formatbuf;
4876 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 if (len < 0)
4878 goto onError;
4879 break;
4880
4881 default:
4882 PyErr_Format(PyExc_ValueError,
4883 "unsupported format character '%c' (0x%x)",
4884 c, c);
4885 goto onError;
4886 }
4887 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004888 if (*pbuf == '-' || *pbuf == '+') {
4889 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 len--;
4891 }
4892 else if (flags & F_SIGN)
4893 sign = '+';
4894 else if (flags & F_BLANK)
4895 sign = ' ';
4896 else
4897 sign = 0;
4898 }
4899 if (width < len)
4900 width = len;
4901 if (rescnt < width + (sign != 0)) {
4902 reslen -= rescnt;
4903 rescnt = width + fmtcnt + 100;
4904 reslen += rescnt;
4905 if (_PyUnicode_Resize(result, reslen) < 0)
4906 return NULL;
4907 res = PyUnicode_AS_UNICODE(result)
4908 + reslen - rescnt;
4909 }
4910 if (sign) {
4911 if (fill != ' ')
4912 *res++ = sign;
4913 rescnt--;
4914 if (width > len)
4915 width--;
4916 }
4917 if (width > len && !(flags & F_LJUST)) {
4918 do {
4919 --rescnt;
4920 *res++ = fill;
4921 } while (--width > len);
4922 }
4923 if (sign && fill == ' ')
4924 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004925 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 res += len;
4927 rescnt -= len;
4928 while (--width >= len) {
4929 --rescnt;
4930 *res++ = ' ';
4931 }
4932 if (dict && (argidx < arglen) && c != '%') {
4933 PyErr_SetString(PyExc_TypeError,
4934 "not all arguments converted");
4935 goto onError;
4936 }
4937 Py_XDECREF(temp);
4938 } /* '%' */
4939 } /* until end */
4940 if (argidx < arglen && !dict) {
4941 PyErr_SetString(PyExc_TypeError,
4942 "not all arguments converted");
4943 goto onError;
4944 }
4945
4946 if (args_owned) {
4947 Py_DECREF(args);
4948 }
4949 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004950 if (_PyUnicode_Resize(result, reslen - rescnt))
4951 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 return (PyObject *)result;
4953
4954 onError:
4955 Py_XDECREF(result);
4956 Py_DECREF(uformat);
4957 if (args_owned) {
4958 Py_DECREF(args);
4959 }
4960 return NULL;
4961}
4962
4963static PyBufferProcs unicode_as_buffer = {
4964 (getreadbufferproc) unicode_buffer_getreadbuf,
4965 (getwritebufferproc) unicode_buffer_getwritebuf,
4966 (getsegcountproc) unicode_buffer_getsegcount,
4967 (getcharbufferproc) unicode_buffer_getcharbuf,
4968};
4969
4970PyTypeObject PyUnicode_Type = {
4971 PyObject_HEAD_INIT(&PyType_Type)
4972 0, /* ob_size */
4973 "unicode", /* tp_name */
4974 sizeof(PyUnicodeObject), /* tp_size */
4975 0, /* tp_itemsize */
4976 /* Slots */
4977 (destructor)_PyUnicode_Free, /* tp_dealloc */
4978 0, /* tp_print */
4979 (getattrfunc)unicode_getattr, /* tp_getattr */
4980 0, /* tp_setattr */
4981 (cmpfunc) unicode_compare, /* tp_compare */
4982 (reprfunc) unicode_repr, /* tp_repr */
4983 0, /* tp_as_number */
4984 &unicode_as_sequence, /* tp_as_sequence */
4985 0, /* tp_as_mapping */
4986 (hashfunc) unicode_hash, /* tp_hash*/
4987 0, /* tp_call*/
4988 (reprfunc) unicode_str, /* tp_str */
4989 (getattrofunc) NULL, /* tp_getattro */
4990 (setattrofunc) NULL, /* tp_setattro */
4991 &unicode_as_buffer, /* tp_as_buffer */
4992 Py_TPFLAGS_DEFAULT, /* tp_flags */
4993};
4994
4995/* Initialize the Unicode implementation */
4996
4997void _PyUnicode_Init()
4998{
4999 /* Doublecheck the configuration... */
5000 if (sizeof(Py_UNICODE) != 2)
5001 Py_FatalError("Unicode configuration error: "
5002 "sizeof(Py_UNICODE) != 2 bytes");
5003
Fred Drakee4315f52000-05-09 19:53:39 +00005004 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005005 unicode_freelist = NULL;
5006 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005008 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009}
5010
5011/* Finalize the Unicode implementation */
5012
5013void
5014_PyUnicode_Fini()
5015{
5016 PyUnicodeObject *u = unicode_freelist;
5017
5018 while (u != NULL) {
5019 PyUnicodeObject *v = u;
5020 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005021 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005022 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005023 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005024 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005026 unicode_freelist = NULL;
5027 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005029 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030}