blob: b4096a045f48167fd4ca3b8d72575c301bdb321a [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000069#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71#if defined(HAVE_LIMITS_H)
72#include <limits.h>
73#else
74#define INT_MAX 2147483647
75#endif
76
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000077#ifdef MS_WIN32
78#include <windows.h>
79#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000080
Guido van Rossumd57fd912000-03-10 22:53:23 +000081/* Limit for the Unicode object free list */
82
83#define MAX_UNICODE_FREELIST_SIZE 1024
84
85/* Limit for the Unicode object free list stay alive optimization.
86
87 The implementation will keep allocated Unicode memory intact for
88 all objects on the free list having a size less than this
89 limit. This reduces malloc() overhead for small Unicode objects.
90
Barry Warsaw51ac5802000-03-20 16:36:48 +000091 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000092 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000093 malloc()-overhead) bytes of unused garbage.
94
95 Setting the limit to 0 effectively turns the feature off.
96
Guido van Rossumfd4b9572000-04-10 13:51:10 +000097 Note: This is an experimental feature ! If you get core dumps when
98 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
100*/
101
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000102#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103
104/* Endianness switches; defaults to little endian */
105
106#ifdef WORDS_BIGENDIAN
107# define BYTEORDER_IS_BIG_ENDIAN
108#else
109# define BYTEORDER_IS_LITTLE_ENDIAN
110#endif
111
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000112/* --- Globals ------------------------------------------------------------
113
114 The globals are initialized by the _PyUnicode_Init() API and should
115 not be used before calling that API.
116
117*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
119/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000120static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000121
122/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000123static PyUnicodeObject *unicode_freelist;
124static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125
Fred Drakee4315f52000-05-09 19:53:39 +0000126/* Default encoding to use and assume when NULL is passed as encoding
127 parameter; it is initialized by _PyUnicode_Init().
128
129 Always use the PyUnicode_SetDefaultEncoding() and
130 PyUnicode_GetDefaultEncoding() APIs to access this global.
131
132*/
133
134static char unicode_default_encoding[100];
135
Guido van Rossumd57fd912000-03-10 22:53:23 +0000136/* --- Unicode Object ----------------------------------------------------- */
137
138static
139int _PyUnicode_Resize(register PyUnicodeObject *unicode,
140 int length)
141{
142 void *oldstr;
143
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000144 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000146 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147
148 /* Resizing unicode_empty is not allowed. */
149 if (unicode == unicode_empty) {
150 PyErr_SetString(PyExc_SystemError,
151 "can't resize empty unicode object");
152 return -1;
153 }
154
155 /* We allocate one more byte to make sure the string is
156 Ux0000 terminated -- XXX is this needed ? */
157 oldstr = unicode->str;
158 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
159 if (!unicode->str) {
160 unicode->str = oldstr;
161 PyErr_NoMemory();
162 return -1;
163 }
164 unicode->str[length] = 0;
165 unicode->length = length;
166
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000167 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 /* Reset the object caches */
169 if (unicode->utf8str) {
170 Py_DECREF(unicode->utf8str);
171 unicode->utf8str = NULL;
172 }
173 unicode->hash = -1;
174
175 return 0;
176}
177
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178int PyUnicode_Resize(PyObject **unicode,
179 int length)
180{
181 PyUnicodeObject *v;
182
183 if (unicode == NULL) {
184 PyErr_BadInternalCall();
185 return -1;
186 }
187 v = (PyUnicodeObject *)*unicode;
188 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
189 PyErr_BadInternalCall();
190 return -1;
191 }
192 return _PyUnicode_Resize(v, length);
193}
194
/* One extra Py_UNICODE element (not just one byte) is allocated so that
   the buffer is always Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below a size of 1.

*/
202
203static
204PyUnicodeObject *_PyUnicode_New(int length)
205{
206 register PyUnicodeObject *unicode;
207
208 /* Optimization for empty strings */
209 if (length == 0 && unicode_empty != NULL) {
210 Py_INCREF(unicode_empty);
211 return unicode_empty;
212 }
213
214 /* Unicode freelist & memory allocation */
215 if (unicode_freelist) {
216 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000217 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 /* Keep-Alive optimization: we only upsize the buffer,
221 never downsize it. */
222 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000224 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 }
227 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000228 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000230 }
231 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
234 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
235 if (unicode == NULL)
236 return NULL;
237 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
238 }
239
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000240 if (!unicode->str) {
241 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 unicode->str[length] = 0;
245 unicode->length = length;
246 unicode->hash = -1;
247 unicode->utf8str = NULL;
248 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000249
250 onError:
251 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254}
255
256static
257void _PyUnicode_Free(register PyUnicodeObject *unicode)
258{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 /* Keep-Alive optimization */
261 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000262 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = NULL;
264 unicode->length = 0;
265 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 if (unicode->utf8str) {
267 Py_DECREF(unicode->utf8str);
268 unicode->utf8str = NULL;
269 }
270 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 *(PyUnicodeObject **)unicode = unicode_freelist;
272 unicode_freelist = unicode;
273 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000276 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000277 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000278 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280}
281
282PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
283 int size)
284{
285 PyUnicodeObject *unicode;
286
287 unicode = _PyUnicode_New(size);
288 if (!unicode)
289 return NULL;
290
291 /* Copy the Unicode data into the new object */
292 if (u != NULL)
293 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
294
295 return (PyObject *)unicode;
296}
297
298#ifdef HAVE_WCHAR_H
299
300PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
301 int size)
302{
303 PyUnicodeObject *unicode;
304
305 if (w == NULL) {
306 PyErr_BadInternalCall();
307 return NULL;
308 }
309
310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the wchar_t data into the new object */
315#ifdef HAVE_USABLE_WCHAR_T
316 memcpy(unicode->str, w, size * sizeof(wchar_t));
317#else
318 {
319 register Py_UNICODE *u;
320 register int i;
321 u = PyUnicode_AS_UNICODE(unicode);
322 for (i = size; i >= 0; i--)
323 *u++ = *w++;
324 }
325#endif
326
327 return (PyObject *)unicode;
328}
329
330int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
331 register wchar_t *w,
332 int size)
333{
334 if (unicode == NULL) {
335 PyErr_BadInternalCall();
336 return -1;
337 }
338 if (size > PyUnicode_GET_SIZE(unicode))
339 size = PyUnicode_GET_SIZE(unicode);
340#ifdef HAVE_USABLE_WCHAR_T
341 memcpy(w, unicode->str, size * sizeof(wchar_t));
342#else
343 {
344 register Py_UNICODE *u;
345 register int i;
346 u = PyUnicode_AS_UNICODE(unicode);
347 for (i = size; i >= 0; i--)
348 *w++ = *u++;
349 }
350#endif
351
352 return size;
353}
354
355#endif
356
357PyObject *PyUnicode_FromObject(register PyObject *obj)
358{
359 const char *s;
360 int len;
361
362 if (obj == NULL) {
363 PyErr_BadInternalCall();
364 return NULL;
365 }
366 else if (PyUnicode_Check(obj)) {
367 Py_INCREF(obj);
368 return obj;
369 }
370 else if (PyString_Check(obj)) {
371 s = PyString_AS_STRING(obj);
372 len = PyString_GET_SIZE(obj);
373 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000374 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
375 /* Overwrite the error message with something more useful in
376 case of a TypeError. */
377 if (PyErr_ExceptionMatches(PyExc_TypeError))
378 PyErr_SetString(PyExc_TypeError,
379 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382 if (len == 0) {
383 Py_INCREF(unicode_empty);
384 return (PyObject *)unicode_empty;
385 }
Fred Drakee4315f52000-05-09 19:53:39 +0000386 return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389PyObject *PyUnicode_Decode(const char *s,
390 int size,
391 const char *encoding,
392 const char *errors)
393{
394 PyObject *buffer = NULL, *unicode;
395
Fred Drakee4315f52000-05-09 19:53:39 +0000396 if (encoding == NULL)
397 encoding = PyUnicode_GetDefaultEncoding();
398
399 /* Shortcuts for common default encodings */
400 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000402 else if (strcmp(encoding, "latin-1") == 0)
403 return PyUnicode_DecodeLatin1(s, size, errors);
404 else if (strcmp(encoding, "ascii") == 0)
405 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406
407 /* Decode via the codec registry */
408 buffer = PyBuffer_FromMemory((void *)s, size);
409 if (buffer == NULL)
410 goto onError;
411 unicode = PyCodec_Decode(buffer, encoding, errors);
412 if (unicode == NULL)
413 goto onError;
414 if (!PyUnicode_Check(unicode)) {
415 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000416 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000417 unicode->ob_type->tp_name);
418 Py_DECREF(unicode);
419 goto onError;
420 }
421 Py_DECREF(buffer);
422 return unicode;
423
424 onError:
425 Py_XDECREF(buffer);
426 return NULL;
427}
428
429PyObject *PyUnicode_Encode(const Py_UNICODE *s,
430 int size,
431 const char *encoding,
432 const char *errors)
433{
434 PyObject *v, *unicode;
435
436 unicode = PyUnicode_FromUnicode(s, size);
437 if (unicode == NULL)
438 return NULL;
439 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
440 Py_DECREF(unicode);
441 return v;
442}
443
444PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
445 const char *encoding,
446 const char *errors)
447{
448 PyObject *v;
449
450 if (!PyUnicode_Check(unicode)) {
451 PyErr_BadArgument();
452 goto onError;
453 }
Fred Drakee4315f52000-05-09 19:53:39 +0000454
455 if (encoding == NULL)
456 encoding = PyUnicode_GetDefaultEncoding();
457
458 /* Shortcuts for common default encodings */
459 if (errors == NULL) {
460 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000462 else if (strcmp(encoding, "latin-1") == 0)
463 return PyUnicode_AsLatin1String(unicode);
464 else if (strcmp(encoding, "ascii") == 0)
465 return PyUnicode_AsASCIIString(unicode);
466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467
468 /* Encode via the codec registry */
469 v = PyCodec_Encode(unicode, encoding, errors);
470 if (v == NULL)
471 goto onError;
472 /* XXX Should we really enforce this ? */
473 if (!PyString_Check(v)) {
474 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000475 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 v->ob_type->tp_name);
477 Py_DECREF(v);
478 goto onError;
479 }
480 return v;
481
482 onError:
483 return NULL;
484}
485
486Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
487{
488 if (!PyUnicode_Check(unicode)) {
489 PyErr_BadArgument();
490 goto onError;
491 }
492 return PyUnicode_AS_UNICODE(unicode);
493
494 onError:
495 return NULL;
496}
497
498int PyUnicode_GetSize(PyObject *unicode)
499{
500 if (!PyUnicode_Check(unicode)) {
501 PyErr_BadArgument();
502 goto onError;
503 }
504 return PyUnicode_GET_SIZE(unicode);
505
506 onError:
507 return -1;
508}
509
Fred Drakee4315f52000-05-09 19:53:39 +0000510const char *PyUnicode_GetDefaultEncoding()
511{
512 return unicode_default_encoding;
513}
514
515int PyUnicode_SetDefaultEncoding(const char *encoding)
516{
517 PyObject *v;
518
519 /* Make sure the encoding is valid. As side effect, this also
520 loads the encoding into the codec registry cache. */
521 v = _PyCodec_Lookup(encoding);
522 if (v == NULL)
523 goto onError;
524 Py_DECREF(v);
525 strncpy(unicode_default_encoding,
526 encoding,
527 sizeof(unicode_default_encoding));
528 return 0;
529
530 onError:
531 return -1;
532}
533
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534/* --- UTF-8 Codec -------------------------------------------------------- */
535
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   introduces.  Zero means illegal prefix; see RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six bytes; 0xFE and 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
557
558static
559int utf8_decoding_error(const char **source,
560 Py_UNICODE **dest,
561 const char *errors,
562 const char *details)
563{
564 if ((errors == NULL) ||
565 (strcmp(errors,"strict") == 0)) {
566 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000567 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000568 details);
569 return -1;
570 }
571 else if (strcmp(errors,"ignore") == 0) {
572 (*source)++;
573 return 0;
574 }
575 else if (strcmp(errors,"replace") == 0) {
576 (*source)++;
577 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
578 (*dest)++;
579 return 0;
580 }
581 else {
582 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000583 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584 errors);
585 return -1;
586 }
587}
588
589#define UTF8_ERROR(details) do { \
590 if (utf8_decoding_error(&s, &p, errors, details)) \
591 goto onError; \
592 continue; \
593} while (0)
594
595PyObject *PyUnicode_DecodeUTF8(const char *s,
596 int size,
597 const char *errors)
598{
599 int n;
600 const char *e;
601 PyUnicodeObject *unicode;
602 Py_UNICODE *p;
603
604 /* Note: size will always be longer than the resulting Unicode
605 character count */
606 unicode = _PyUnicode_New(size);
607 if (!unicode)
608 return NULL;
609 if (size == 0)
610 return (PyObject *)unicode;
611
612 /* Unpack UTF-8 encoded data */
613 p = unicode->str;
614 e = s + size;
615
616 while (s < e) {
617 register Py_UNICODE ch = (unsigned char)*s;
618
619 if (ch < 0x80) {
620 *p++ = ch;
621 s++;
622 continue;
623 }
624
625 n = utf8_code_length[ch];
626
627 if (s + n > e)
628 UTF8_ERROR("unexpected end of data");
629
630 switch (n) {
631
632 case 0:
633 UTF8_ERROR("unexpected code byte");
634 break;
635
636 case 1:
637 UTF8_ERROR("internal error");
638 break;
639
640 case 2:
641 if ((s[1] & 0xc0) != 0x80)
642 UTF8_ERROR("invalid data");
643 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
644 if (ch < 0x80)
645 UTF8_ERROR("illegal encoding");
646 else
647 *p++ = ch;
648 break;
649
650 case 3:
651 if ((s[1] & 0xc0) != 0x80 ||
652 (s[2] & 0xc0) != 0x80)
653 UTF8_ERROR("invalid data");
654 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
655 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
656 UTF8_ERROR("illegal encoding");
657 else
658 *p++ = ch;
659 break;
660
661 default:
662 /* Other sizes are only needed for UCS-4 */
663 UTF8_ERROR("unsupported Unicode code range");
664 }
665 s += n;
666 }
667
668 /* Adjust length */
669 if (_PyUnicode_Resize(unicode, p - unicode->str))
670 goto onError;
671
672 return (PyObject *)unicode;
673
674onError:
675 Py_DECREF(unicode);
676 return NULL;
677}
678
679#undef UTF8_ERROR
680
681static
682int utf8_encoding_error(const Py_UNICODE **source,
683 char **dest,
684 const char *errors,
685 const char *details)
686{
687 if ((errors == NULL) ||
688 (strcmp(errors,"strict") == 0)) {
689 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000690 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 details);
692 return -1;
693 }
694 else if (strcmp(errors,"ignore") == 0) {
695 return 0;
696 }
697 else if (strcmp(errors,"replace") == 0) {
698 **dest = '?';
699 (*dest)++;
700 return 0;
701 }
702 else {
703 PyErr_Format(PyExc_ValueError,
704 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000705 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 errors);
707 return -1;
708 }
709}
710
711PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
712 int size,
713 const char *errors)
714{
715 PyObject *v;
716 char *p;
717 char *q;
718
719 v = PyString_FromStringAndSize(NULL, 3 * size);
720 if (v == NULL)
721 return NULL;
722 if (size == 0)
723 goto done;
724
725 p = q = PyString_AS_STRING(v);
726 while (size-- > 0) {
727 Py_UNICODE ch = *s++;
728 if (ch < 0x80)
729 *p++ = (char) ch;
730 else if (ch < 0x0800) {
731 *p++ = 0xc0 | (ch >> 6);
732 *p++ = 0x80 | (ch & 0x3f);
733 } else if (0xD800 <= ch && ch <= 0xDFFF) {
734 /* These byte ranges are reserved for UTF-16 surrogate
735 bytes which the Python implementation currently does
736 not support. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 if (utf8_encoding_error(&s, &p, errors,
738 "unsupported code range"))
739 goto onError;
740 } else {
741 *p++ = 0xe0 | (ch >> 12);
742 *p++ = 0x80 | ((ch >> 6) & 0x3f);
743 *p++ = 0x80 | (ch & 0x3f);
744 }
745 }
746 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000747 if (_PyString_Resize(&v, p - q))
748 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000749
750 done:
751 return v;
752
753 onError:
754 Py_DECREF(v);
755 return NULL;
756}
757
758/* Return a Python string holding the UTF-8 encoded value of the
759 Unicode object.
760
761 The resulting string is cached in the Unicode object for subsequent
762 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000763 the character buffer interface and will live (at least) as long as
764 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765
766 The refcount of the string is *not* incremented.
767
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000768 *** Exported for internal use by the interpreter only !!! ***
769
Guido van Rossumd57fd912000-03-10 22:53:23 +0000770*/
771
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000772PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000773 const char *errors)
774{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000775 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776
777 if (v)
778 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000779 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
780 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000781 errors);
782 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000783 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784 return v;
785}
786
787PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
788{
789 PyObject *str;
790
791 if (!PyUnicode_Check(unicode)) {
792 PyErr_BadArgument();
793 return NULL;
794 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000795 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796 if (str == NULL)
797 return NULL;
798 Py_INCREF(str);
799 return str;
800}
801
802/* --- UTF-16 Codec ------------------------------------------------------- */
803
804static
805int utf16_decoding_error(const Py_UNICODE **source,
806 Py_UNICODE **dest,
807 const char *errors,
808 const char *details)
809{
810 if ((errors == NULL) ||
811 (strcmp(errors,"strict") == 0)) {
812 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000813 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 details);
815 return -1;
816 }
817 else if (strcmp(errors,"ignore") == 0) {
818 return 0;
819 }
820 else if (strcmp(errors,"replace") == 0) {
821 if (dest) {
822 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
823 (*dest)++;
824 }
825 return 0;
826 }
827 else {
828 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000829 "UTF-16 decoding error; "
830 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831 errors);
832 return -1;
833 }
834}
835
836#define UTF16_ERROR(details) do { \
837 if (utf16_decoding_error(&q, &p, errors, details)) \
838 goto onError; \
839 continue; \
840} while(0)
841
842PyObject *PyUnicode_DecodeUTF16(const char *s,
843 int size,
844 const char *errors,
845 int *byteorder)
846{
847 PyUnicodeObject *unicode;
848 Py_UNICODE *p;
849 const Py_UNICODE *q, *e;
850 int bo = 0;
851
852 /* size should be an even number */
853 if (size % sizeof(Py_UNICODE) != 0) {
854 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
855 return NULL;
856 /* The remaining input chars are ignored if we fall through
857 here... */
858 }
859
860 /* Note: size will always be longer than the resulting Unicode
861 character count */
862 unicode = _PyUnicode_New(size);
863 if (!unicode)
864 return NULL;
865 if (size == 0)
866 return (PyObject *)unicode;
867
868 /* Unpack UTF-16 encoded data */
869 p = unicode->str;
870 q = (Py_UNICODE *)s;
871 e = q + (size / sizeof(Py_UNICODE));
872
873 if (byteorder)
874 bo = *byteorder;
875
876 while (q < e) {
877 register Py_UNICODE ch = *q++;
878
879 /* Check for BOM marks (U+FEFF) in the input and adjust
880 current byte order setting accordingly. Swap input
881 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
882 !) */
883#ifdef BYTEORDER_IS_LITTLE_ENDIAN
884 if (ch == 0xFEFF) {
885 bo = -1;
886 continue;
887 } else if (ch == 0xFFFE) {
888 bo = 1;
889 continue;
890 }
891 if (bo == 1)
892 ch = (ch >> 8) | (ch << 8);
893#else
894 if (ch == 0xFEFF) {
895 bo = 1;
896 continue;
897 } else if (ch == 0xFFFE) {
898 bo = -1;
899 continue;
900 }
901 if (bo == -1)
902 ch = (ch >> 8) | (ch << 8);
903#endif
904 if (ch < 0xD800 || ch > 0xDFFF) {
905 *p++ = ch;
906 continue;
907 }
908
909 /* UTF-16 code pair: */
910 if (q >= e)
911 UTF16_ERROR("unexpected end of data");
912 if (0xDC00 <= *q && *q <= 0xDFFF) {
913 q++;
914 if (0xD800 <= *q && *q <= 0xDBFF)
915 /* This is valid data (a UTF-16 surrogate pair), but
916 we are not able to store this information since our
917 Py_UNICODE type only has 16 bits... this might
918 change someday, even though it's unlikely. */
919 UTF16_ERROR("code pairs are not supported");
920 else
921 continue;
922 }
923 UTF16_ERROR("illegal encoding");
924 }
925
926 if (byteorder)
927 *byteorder = bo;
928
929 /* Adjust length */
930 if (_PyUnicode_Resize(unicode, p - unicode->str))
931 goto onError;
932
933 return (PyObject *)unicode;
934
935onError:
936 Py_DECREF(unicode);
937 return NULL;
938}
939
940#undef UTF16_ERROR
941
942PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
943 int size,
944 const char *errors,
945 int byteorder)
946{
947 PyObject *v;
948 Py_UNICODE *p;
949 char *q;
950
951 /* We don't create UTF-16 pairs... */
952 v = PyString_FromStringAndSize(NULL,
953 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
954 if (v == NULL)
955 return NULL;
956 if (size == 0)
957 goto done;
958
959 q = PyString_AS_STRING(v);
960 p = (Py_UNICODE *)q;
961
962 if (byteorder == 0)
963 *p++ = 0xFEFF;
964 if (byteorder == 0 ||
965#ifdef BYTEORDER_IS_LITTLE_ENDIAN
966 byteorder == -1
967#else
968 byteorder == 1
969#endif
970 )
971 memcpy(p, s, size * sizeof(Py_UNICODE));
972 else
973 while (size-- > 0) {
974 Py_UNICODE ch = *s++;
975 *p++ = (ch >> 8) | (ch << 8);
976 }
977 done:
978 return v;
979}
980
981PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
982{
983 if (!PyUnicode_Check(unicode)) {
984 PyErr_BadArgument();
985 return NULL;
986 }
987 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
988 PyUnicode_GET_SIZE(unicode),
989 NULL,
990 0);
991}
992
993/* --- Unicode Escape Codec ----------------------------------------------- */
994
995static
996int unicodeescape_decoding_error(const char **source,
997 unsigned int *x,
998 const char *errors,
999 const char *details)
1000{
1001 if ((errors == NULL) ||
1002 (strcmp(errors,"strict") == 0)) {
1003 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001004 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005 details);
1006 return -1;
1007 }
1008 else if (strcmp(errors,"ignore") == 0) {
1009 return 0;
1010 }
1011 else if (strcmp(errors,"replace") == 0) {
1012 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
1013 return 0;
1014 }
1015 else {
1016 PyErr_Format(PyExc_ValueError,
1017 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001018 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019 errors);
1020 return -1;
1021 }
1022}
1023
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001024static _Py_UCNHashAPI *pucnHash = NULL;
1025
/* Compare at most `count` characters of `s1` and `s2`, ignoring case.

   Returns 0 if the compared prefixes are equal, otherwise the
   difference between the first pair of lower-cased characters that
   differ (taken as unsigned char values).  The comparison stops at a
   terminating NUL, so strings shorter than `count` are never read past
   their end.  A `count` of 0 yields 0. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1 = 0, c2 = 0;

    while (count-- > 0) {
        /* tolower() is only defined for unsigned char values and EOF,
           so plain (possibly negative) chars must be converted. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        if (c1 != c2 || c1 == '\0')
            break;
    }

    return c1 - c2;
}
1045
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1047 int size,
1048 const char *errors)
1049{
1050 PyUnicodeObject *v;
1051 Py_UNICODE *p = NULL, *buf = NULL;
1052 const char *end;
1053
1054 /* Escaped strings will always be longer than the resulting
1055 Unicode string, so we start with size here and then reduce the
1056 length after conversion to the true value. */
1057 v = _PyUnicode_New(size);
1058 if (v == NULL)
1059 goto onError;
1060 if (size == 0)
1061 return (PyObject *)v;
1062 p = buf = PyUnicode_AS_UNICODE(v);
1063 end = s + size;
1064 while (s < end) {
1065 unsigned char c;
1066 unsigned int x;
1067 int i;
1068
1069 /* Non-escape characters are interpreted as Unicode ordinals */
1070 if (*s != '\\') {
1071 *p++ = (unsigned char)*s++;
1072 continue;
1073 }
1074
1075 /* \ - Escapes */
1076 s++;
1077 switch (*s++) {
1078
1079 /* \x escapes */
1080 case '\n': break;
1081 case '\\': *p++ = '\\'; break;
1082 case '\'': *p++ = '\''; break;
1083 case '\"': *p++ = '\"'; break;
1084 case 'b': *p++ = '\b'; break;
1085 case 'f': *p++ = '\014'; break; /* FF */
1086 case 't': *p++ = '\t'; break;
1087 case 'n': *p++ = '\n'; break;
1088 case 'r': *p++ = '\r'; break;
1089 case 'v': *p++ = '\013'; break; /* VT */
1090 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1091
1092 /* \OOO (octal) escapes */
1093 case '0': case '1': case '2': case '3':
1094 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001095 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001097 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001099 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001101 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 break;
1103
1104 /* \xXXXX escape with 0-4 hex digits */
1105 case 'x':
1106 x = 0;
1107 c = (unsigned char)*s;
1108 if (isxdigit(c)) {
1109 do {
1110 x = (x<<4) & ~0xF;
1111 if ('0' <= c && c <= '9')
1112 x += c - '0';
1113 else if ('a' <= c && c <= 'f')
1114 x += 10 + c - 'a';
1115 else
1116 x += 10 + c - 'A';
1117 c = (unsigned char)*++s;
1118 } while (isxdigit(c));
1119 *p++ = x;
1120 } else {
1121 *p++ = '\\';
1122 *p++ = (unsigned char)s[-1];
1123 }
1124 break;
1125
1126 /* \uXXXX with 4 hex digits */
1127 case 'u':
1128 for (x = 0, i = 0; i < 4; i++) {
1129 c = (unsigned char)s[i];
1130 if (!isxdigit(c)) {
1131 if (unicodeescape_decoding_error(&s, &x, errors,
1132 "truncated \\uXXXX"))
1133 goto onError;
1134 i++;
1135 break;
1136 }
1137 x = (x<<4) & ~0xF;
1138 if (c >= '0' && c <= '9')
1139 x += c - '0';
1140 else if (c >= 'a' && c <= 'f')
1141 x += 10 + c - 'a';
1142 else
1143 x += 10 + c - 'A';
1144 }
1145 s += i;
1146 *p++ = x;
1147 break;
1148
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001149 case 'N':
1150 /* Ok, we need to deal with Unicode Character Names now,
1151 * make sure we've imported the hash table data...
1152 */
1153 if (pucnHash == NULL)
1154 {
1155 PyObject *mod = 0, *v = 0;
1156
1157 mod = PyImport_ImportModule("ucnhash");
1158 if (mod == NULL)
1159 goto onError;
1160 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1161 Py_DECREF(mod);
1162 if (v == NULL)
1163 {
1164 goto onError;
1165 }
1166 pucnHash = PyCObject_AsVoidPtr(v);
1167 Py_DECREF(v);
1168 if (pucnHash == NULL)
1169 {
1170 goto onError;
1171 }
1172 }
1173
1174 if (*s == '{')
1175 {
1176 const char *start = s + 1;
1177 const char *endBrace = start;
1178 unsigned int uiValue;
1179 unsigned long j;
1180
1181 /* look for either the closing brace, or we
1182 * exceed the maximum length of the unicode character names
1183 */
1184 while (*endBrace != '}' &&
1185 (unsigned int)(endBrace - start) <=
1186 pucnHash->cchMax &&
1187 endBrace < end)
1188 {
1189 endBrace++;
1190 }
1191 if (endBrace != end && *endBrace == '}')
1192 {
1193 j = pucnHash->hash(start, endBrace - start);
1194 if (j > pucnHash->cKeys ||
1195 mystrnicmp(
1196 start,
1197 ((_Py_UnicodeCharacterName *)
1198 (pucnHash->getValue(j)))->pszUCN,
1199 (int)(endBrace - start)) != 0)
1200 {
1201 if (unicodeescape_decoding_error(
1202 &s, &x, errors,
1203 "Invalid Unicode Character Name"))
1204 {
1205 goto onError;
1206 }
1207 goto ucnFallthrough;
1208 }
1209 uiValue = ((_Py_UnicodeCharacterName *)
1210 (pucnHash->getValue(j)))->uiValue;
1211 if (uiValue < 1<<16)
1212 {
1213 /* In UCS-2 range, easy solution.. */
1214 *p++ = uiValue;
1215 }
1216 else
1217 {
1218 /* Oops, its in UCS-4 space, */
1219 /* compute and append the two surrogates: */
1220 /* translate from 10000..10FFFF to 0..FFFFF */
1221 uiValue -= 0x10000;
1222
1223 /* high surrogate = top 10 bits added to D800 */
1224 *p++ = 0xD800 + (uiValue >> 10);
1225
1226 /* low surrogate = bottom 10 bits added to DC00 */
1227 *p++ = 0xDC00 + (uiValue & ~0xFC00);
1228 }
1229 s = endBrace + 1;
1230 }
1231 else
1232 {
1233 if (unicodeescape_decoding_error(
1234 &s, &x, errors,
1235 "Unicode name missing closing brace"))
1236 goto onError;
1237 goto ucnFallthrough;
1238 }
1239 break;
1240 }
1241 if (unicodeescape_decoding_error(
1242 &s, &x, errors,
1243 "Missing opening brace for Unicode Character Name escape"))
1244 goto onError;
1245ucnFallthrough:
1246 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001247 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 *p++ = '\\';
1249 *p++ = (unsigned char)s[-1];
1250 break;
1251 }
1252 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001253 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return (PyObject *)v;
1256
1257 onError:
1258 Py_XDECREF(v);
1259 return NULL;
1260}
1261
1262/* Return a Unicode-Escape string version of the Unicode object.
1263
1264 If quotes is true, the string is enclosed in u"" or u'' quotes as
1265 appropriate.
1266
1267*/
1268
/* Forward declaration.  unicodeescape_string() below calls findchar()
   and uses its result as a truth value when choosing the quote
   character; the definition lies outside this part of the file. */
static const Py_UNICODE *findchar(const Py_UNICODE *s,
                                  int size,
                                  Py_UNICODE ch);
1272
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273static
1274PyObject *unicodeescape_string(const Py_UNICODE *s,
1275 int size,
1276 int quotes)
1277{
1278 PyObject *repr;
1279 char *p;
1280 char *q;
1281
1282 static const char *hexdigit = "0123456789ABCDEF";
1283
1284 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1285 if (repr == NULL)
1286 return NULL;
1287
1288 p = q = PyString_AS_STRING(repr);
1289
1290 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 *p++ = 'u';
1292 *p++ = (findchar(s, size, '\'') &&
1293 !findchar(s, size, '"')) ? '"' : '\'';
1294 }
1295 while (size-- > 0) {
1296 Py_UNICODE ch = *s++;
1297 /* Escape quotes */
1298 if (quotes && (ch == q[1] || ch == '\\')) {
1299 *p++ = '\\';
1300 *p++ = (char) ch;
1301 }
1302 /* Map 16-bit characters to '\uxxxx' */
1303 else if (ch >= 256) {
1304 *p++ = '\\';
1305 *p++ = 'u';
1306 *p++ = hexdigit[(ch >> 12) & 0xf];
1307 *p++ = hexdigit[(ch >> 8) & 0xf];
1308 *p++ = hexdigit[(ch >> 4) & 0xf];
1309 *p++ = hexdigit[ch & 15];
1310 }
1311 /* Map non-printable US ASCII to '\ooo' */
1312 else if (ch < ' ' || ch >= 128) {
1313 *p++ = '\\';
1314 *p++ = hexdigit[(ch >> 6) & 7];
1315 *p++ = hexdigit[(ch >> 3) & 7];
1316 *p++ = hexdigit[ch & 7];
1317 }
1318 /* Copy everything else as-is */
1319 else
1320 *p++ = (char) ch;
1321 }
1322 if (quotes)
1323 *p++ = q[1];
1324
1325 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001326 if (_PyString_Resize(&repr, p - q))
1327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328
1329 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001330
1331 onError:
1332 Py_DECREF(repr);
1333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334}
1335
1336PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1337 int size)
1338{
1339 return unicodeescape_string(s, size, 0);
1340}
1341
1342PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1343{
1344 if (!PyUnicode_Check(unicode)) {
1345 PyErr_BadArgument();
1346 return NULL;
1347 }
1348 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1349 PyUnicode_GET_SIZE(unicode));
1350}
1351
1352/* --- Raw Unicode Escape Codec ------------------------------------------- */
1353
1354PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1355 int size,
1356 const char *errors)
1357{
1358 PyUnicodeObject *v;
1359 Py_UNICODE *p, *buf;
1360 const char *end;
1361 const char *bs;
1362
1363 /* Escaped strings will always be longer than the resulting
1364 Unicode string, so we start with size here and then reduce the
1365 length after conversion to the true value. */
1366 v = _PyUnicode_New(size);
1367 if (v == NULL)
1368 goto onError;
1369 if (size == 0)
1370 return (PyObject *)v;
1371 p = buf = PyUnicode_AS_UNICODE(v);
1372 end = s + size;
1373 while (s < end) {
1374 unsigned char c;
1375 unsigned int x;
1376 int i;
1377
1378 /* Non-escape characters are interpreted as Unicode ordinals */
1379 if (*s != '\\') {
1380 *p++ = (unsigned char)*s++;
1381 continue;
1382 }
1383
1384 /* \u-escapes are only interpreted iff the number of leading
1385 backslashes if odd */
1386 bs = s;
1387 for (;s < end;) {
1388 if (*s != '\\')
1389 break;
1390 *p++ = (unsigned char)*s++;
1391 }
1392 if (((s - bs) & 1) == 0 ||
1393 s >= end ||
1394 *s != 'u') {
1395 continue;
1396 }
1397 p--;
1398 s++;
1399
1400 /* \uXXXX with 4 hex digits */
1401 for (x = 0, i = 0; i < 4; i++) {
1402 c = (unsigned char)s[i];
1403 if (!isxdigit(c)) {
1404 if (unicodeescape_decoding_error(&s, &x, errors,
1405 "truncated \\uXXXX"))
1406 goto onError;
1407 i++;
1408 break;
1409 }
1410 x = (x<<4) & ~0xF;
1411 if (c >= '0' && c <= '9')
1412 x += c - '0';
1413 else if (c >= 'a' && c <= 'f')
1414 x += 10 + c - 'a';
1415 else
1416 x += 10 + c - 'A';
1417 }
1418 s += i;
1419 *p++ = x;
1420 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001421 if (_PyUnicode_Resize(v, (int)(p - buf)))
1422 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423 return (PyObject *)v;
1424
1425 onError:
1426 Py_XDECREF(v);
1427 return NULL;
1428}
1429
1430PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1431 int size)
1432{
1433 PyObject *repr;
1434 char *p;
1435 char *q;
1436
1437 static const char *hexdigit = "0123456789ABCDEF";
1438
1439 repr = PyString_FromStringAndSize(NULL, 6 * size);
1440 if (repr == NULL)
1441 return NULL;
1442
1443 p = q = PyString_AS_STRING(repr);
1444 while (size-- > 0) {
1445 Py_UNICODE ch = *s++;
1446 /* Map 16-bit characters to '\uxxxx' */
1447 if (ch >= 256) {
1448 *p++ = '\\';
1449 *p++ = 'u';
1450 *p++ = hexdigit[(ch >> 12) & 0xf];
1451 *p++ = hexdigit[(ch >> 8) & 0xf];
1452 *p++ = hexdigit[(ch >> 4) & 0xf];
1453 *p++ = hexdigit[ch & 15];
1454 }
1455 /* Copy everything else as-is */
1456 else
1457 *p++ = (char) ch;
1458 }
1459 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001460 if (_PyString_Resize(&repr, p - q))
1461 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462
1463 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001464
1465 onError:
1466 Py_DECREF(repr);
1467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468}
1469
1470PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1471{
1472 if (!PyUnicode_Check(unicode)) {
1473 PyErr_BadArgument();
1474 return NULL;
1475 }
1476 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1477 PyUnicode_GET_SIZE(unicode));
1478}
1479
1480/* --- Latin-1 Codec ------------------------------------------------------ */
1481
1482PyObject *PyUnicode_DecodeLatin1(const char *s,
1483 int size,
1484 const char *errors)
1485{
1486 PyUnicodeObject *v;
1487 Py_UNICODE *p;
1488
1489 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1490 v = _PyUnicode_New(size);
1491 if (v == NULL)
1492 goto onError;
1493 if (size == 0)
1494 return (PyObject *)v;
1495 p = PyUnicode_AS_UNICODE(v);
1496 while (size-- > 0)
1497 *p++ = (unsigned char)*s++;
1498 return (PyObject *)v;
1499
1500 onError:
1501 Py_XDECREF(v);
1502 return NULL;
1503}
1504
1505static
1506int latin1_encoding_error(const Py_UNICODE **source,
1507 char **dest,
1508 const char *errors,
1509 const char *details)
1510{
1511 if ((errors == NULL) ||
1512 (strcmp(errors,"strict") == 0)) {
1513 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001514 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 details);
1516 return -1;
1517 }
1518 else if (strcmp(errors,"ignore") == 0) {
1519 return 0;
1520 }
1521 else if (strcmp(errors,"replace") == 0) {
1522 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001523 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001524 return 0;
1525 }
1526 else {
1527 PyErr_Format(PyExc_ValueError,
1528 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001529 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 errors);
1531 return -1;
1532 }
1533}
1534
1535PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1536 int size,
1537 const char *errors)
1538{
1539 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001540 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 repr = PyString_FromStringAndSize(NULL, size);
1542 if (repr == NULL)
1543 return NULL;
1544
1545 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001546 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547 while (size-- > 0) {
1548 Py_UNICODE ch = *p++;
1549 if (ch >= 256) {
1550 if (latin1_encoding_error(&p, &s, errors,
1551 "ordinal not in range(256)"))
1552 goto onError;
1553 }
1554 else
1555 *s++ = (char)ch;
1556 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001557 /* Resize if error handling skipped some characters */
1558 if (s - start < PyString_GET_SIZE(repr))
1559 if (_PyString_Resize(&repr, s - start))
1560 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 return repr;
1562
1563 onError:
1564 Py_DECREF(repr);
1565 return NULL;
1566}
1567
1568PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1569{
1570 if (!PyUnicode_Check(unicode)) {
1571 PyErr_BadArgument();
1572 return NULL;
1573 }
1574 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1575 PyUnicode_GET_SIZE(unicode),
1576 NULL);
1577}
1578
1579/* --- 7-bit ASCII Codec -------------------------------------------------- */
1580
1581static
1582int ascii_decoding_error(const char **source,
1583 Py_UNICODE **dest,
1584 const char *errors,
1585 const char *details)
1586{
1587 if ((errors == NULL) ||
1588 (strcmp(errors,"strict") == 0)) {
1589 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001590 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 details);
1592 return -1;
1593 }
1594 else if (strcmp(errors,"ignore") == 0) {
1595 return 0;
1596 }
1597 else if (strcmp(errors,"replace") == 0) {
1598 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1599 (*dest)++;
1600 return 0;
1601 }
1602 else {
1603 PyErr_Format(PyExc_ValueError,
1604 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001605 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606 errors);
1607 return -1;
1608 }
1609}
1610
1611PyObject *PyUnicode_DecodeASCII(const char *s,
1612 int size,
1613 const char *errors)
1614{
1615 PyUnicodeObject *v;
1616 Py_UNICODE *p;
1617
1618 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1619 v = _PyUnicode_New(size);
1620 if (v == NULL)
1621 goto onError;
1622 if (size == 0)
1623 return (PyObject *)v;
1624 p = PyUnicode_AS_UNICODE(v);
1625 while (size-- > 0) {
1626 register unsigned char c;
1627
1628 c = (unsigned char)*s++;
1629 if (c < 128)
1630 *p++ = c;
1631 else if (ascii_decoding_error(&s, &p, errors,
1632 "ordinal not in range(128)"))
1633 goto onError;
1634 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001635 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1636 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1637 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 return (PyObject *)v;
1639
1640 onError:
1641 Py_XDECREF(v);
1642 return NULL;
1643}
1644
1645static
1646int ascii_encoding_error(const Py_UNICODE **source,
1647 char **dest,
1648 const char *errors,
1649 const char *details)
1650{
1651 if ((errors == NULL) ||
1652 (strcmp(errors,"strict") == 0)) {
1653 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001654 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655 details);
1656 return -1;
1657 }
1658 else if (strcmp(errors,"ignore") == 0) {
1659 return 0;
1660 }
1661 else if (strcmp(errors,"replace") == 0) {
1662 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001663 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664 return 0;
1665 }
1666 else {
1667 PyErr_Format(PyExc_ValueError,
1668 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001669 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 errors);
1671 return -1;
1672 }
1673}
1674
1675PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1676 int size,
1677 const char *errors)
1678{
1679 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001680 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 repr = PyString_FromStringAndSize(NULL, size);
1682 if (repr == NULL)
1683 return NULL;
1684
1685 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001686 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 while (size-- > 0) {
1688 Py_UNICODE ch = *p++;
1689 if (ch >= 128) {
1690 if (ascii_encoding_error(&p, &s, errors,
1691 "ordinal not in range(128)"))
1692 goto onError;
1693 }
1694 else
1695 *s++ = (char)ch;
1696 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001697 /* Resize if error handling skipped some characters */
1698 if (s - start < PyString_GET_SIZE(repr))
1699 if (_PyString_Resize(&repr, s - start))
1700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 return repr;
1702
1703 onError:
1704 Py_DECREF(repr);
1705 return NULL;
1706}
1707
1708PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1709{
1710 if (!PyUnicode_Check(unicode)) {
1711 PyErr_BadArgument();
1712 return NULL;
1713 }
1714 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1715 PyUnicode_GET_SIZE(unicode),
1716 NULL);
1717}
1718
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001719#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001720
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001721/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001722
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001723PyObject *PyUnicode_DecodeMBCS(const char *s,
1724 int size,
1725 const char *errors)
1726{
1727 PyUnicodeObject *v;
1728 Py_UNICODE *p;
1729
1730 /* First get the size of the result */
1731 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001732 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001733 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1734
1735 v = _PyUnicode_New(usize);
1736 if (v == NULL)
1737 return NULL;
1738 if (usize == 0)
1739 return (PyObject *)v;
1740 p = PyUnicode_AS_UNICODE(v);
1741 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1742 Py_DECREF(v);
1743 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1744 }
1745
1746 return (PyObject *)v;
1747}
1748
1749PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1750 int size,
1751 const char *errors)
1752{
1753 PyObject *repr;
1754 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001755 DWORD mbcssize;
1756
1757 /* If there are no characters, bail now! */
1758 if (size==0)
1759 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001760
1761 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001762 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001763 if (mbcssize==0)
1764 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1765
1766 repr = PyString_FromStringAndSize(NULL, mbcssize);
1767 if (repr == NULL)
1768 return NULL;
1769 if (mbcssize==0)
1770 return repr;
1771
1772 /* Do the conversion */
1773 s = PyString_AS_STRING(repr);
1774 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1775 Py_DECREF(repr);
1776 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1777 }
1778 return repr;
1779}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001780
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001781#endif /* MS_WIN32 */
1782
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783/* --- Character Mapping Codec -------------------------------------------- */
1784
1785static
1786int charmap_decoding_error(const char **source,
1787 Py_UNICODE **dest,
1788 const char *errors,
1789 const char *details)
1790{
1791 if ((errors == NULL) ||
1792 (strcmp(errors,"strict") == 0)) {
1793 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001794 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 details);
1796 return -1;
1797 }
1798 else if (strcmp(errors,"ignore") == 0) {
1799 return 0;
1800 }
1801 else if (strcmp(errors,"replace") == 0) {
1802 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1803 (*dest)++;
1804 return 0;
1805 }
1806 else {
1807 PyErr_Format(PyExc_ValueError,
1808 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001809 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 errors);
1811 return -1;
1812 }
1813}
1814
1815PyObject *PyUnicode_DecodeCharmap(const char *s,
1816 int size,
1817 PyObject *mapping,
1818 const char *errors)
1819{
1820 PyUnicodeObject *v;
1821 Py_UNICODE *p;
1822
1823 /* Default to Latin-1 */
1824 if (mapping == NULL)
1825 return PyUnicode_DecodeLatin1(s, size, errors);
1826
1827 v = _PyUnicode_New(size);
1828 if (v == NULL)
1829 goto onError;
1830 if (size == 0)
1831 return (PyObject *)v;
1832 p = PyUnicode_AS_UNICODE(v);
1833 while (size-- > 0) {
1834 unsigned char ch = *s++;
1835 PyObject *w, *x;
1836
1837 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1838 w = PyInt_FromLong((long)ch);
1839 if (w == NULL)
1840 goto onError;
1841 x = PyObject_GetItem(mapping, w);
1842 Py_DECREF(w);
1843 if (x == NULL) {
1844 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1845 /* No mapping found: default to Latin-1 mapping */
1846 PyErr_Clear();
1847 *p++ = (Py_UNICODE)ch;
1848 continue;
1849 }
1850 goto onError;
1851 }
1852
1853 /* Apply mapping */
1854 if (PyInt_Check(x)) {
1855 int value = PyInt_AS_LONG(x);
1856 if (value < 0 || value > 65535) {
1857 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001858 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 Py_DECREF(x);
1860 goto onError;
1861 }
1862 *p++ = (Py_UNICODE)value;
1863 }
1864 else if (x == Py_None) {
1865 /* undefined mapping */
1866 if (charmap_decoding_error(&s, &p, errors,
1867 "character maps to <undefined>")) {
1868 Py_DECREF(x);
1869 goto onError;
1870 }
1871 }
1872 else if (PyUnicode_Check(x)) {
1873 if (PyUnicode_GET_SIZE(x) != 1) {
1874 /* 1-n mapping */
1875 PyErr_SetString(PyExc_NotImplementedError,
1876 "1-n mappings are currently not implemented");
1877 Py_DECREF(x);
1878 goto onError;
1879 }
1880 *p++ = *PyUnicode_AS_UNICODE(x);
1881 }
1882 else {
1883 /* wrong return value */
1884 PyErr_SetString(PyExc_TypeError,
1885 "character mapping must return integer, None or unicode");
1886 Py_DECREF(x);
1887 goto onError;
1888 }
1889 Py_DECREF(x);
1890 }
1891 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1892 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1893 goto onError;
1894 return (PyObject *)v;
1895
1896 onError:
1897 Py_XDECREF(v);
1898 return NULL;
1899}
1900
1901static
1902int charmap_encoding_error(const Py_UNICODE **source,
1903 char **dest,
1904 const char *errors,
1905 const char *details)
1906{
1907 if ((errors == NULL) ||
1908 (strcmp(errors,"strict") == 0)) {
1909 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001910 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 details);
1912 return -1;
1913 }
1914 else if (strcmp(errors,"ignore") == 0) {
1915 return 0;
1916 }
1917 else if (strcmp(errors,"replace") == 0) {
1918 **dest = '?';
1919 (*dest)++;
1920 return 0;
1921 }
1922 else {
1923 PyErr_Format(PyExc_ValueError,
1924 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001925 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 errors);
1927 return -1;
1928 }
1929}
1930
1931PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1932 int size,
1933 PyObject *mapping,
1934 const char *errors)
1935{
1936 PyObject *v;
1937 char *s;
1938
1939 /* Default to Latin-1 */
1940 if (mapping == NULL)
1941 return PyUnicode_EncodeLatin1(p, size, errors);
1942
1943 v = PyString_FromStringAndSize(NULL, size);
1944 if (v == NULL)
1945 return NULL;
1946 s = PyString_AS_STRING(v);
1947 while (size-- > 0) {
1948 Py_UNICODE ch = *p++;
1949 PyObject *w, *x;
1950
1951 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1952 w = PyInt_FromLong((long)ch);
1953 if (w == NULL)
1954 goto onError;
1955 x = PyObject_GetItem(mapping, w);
1956 Py_DECREF(w);
1957 if (x == NULL) {
1958 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1959 /* No mapping found: default to Latin-1 mapping if possible */
1960 PyErr_Clear();
1961 if (ch < 256) {
1962 *s++ = (char)ch;
1963 continue;
1964 }
1965 else if (!charmap_encoding_error(&p, &s, errors,
1966 "missing character mapping"))
1967 continue;
1968 }
1969 goto onError;
1970 }
1971
1972 /* Apply mapping */
1973 if (PyInt_Check(x)) {
1974 int value = PyInt_AS_LONG(x);
1975 if (value < 0 || value > 255) {
1976 PyErr_SetString(PyExc_TypeError,
1977 "character mapping must be in range(256)");
1978 Py_DECREF(x);
1979 goto onError;
1980 }
1981 *s++ = (char)value;
1982 }
1983 else if (x == Py_None) {
1984 /* undefined mapping */
1985 if (charmap_encoding_error(&p, &s, errors,
1986 "character maps to <undefined>")) {
1987 Py_DECREF(x);
1988 goto onError;
1989 }
1990 }
1991 else if (PyString_Check(x)) {
1992 if (PyString_GET_SIZE(x) != 1) {
1993 /* 1-n mapping */
1994 PyErr_SetString(PyExc_NotImplementedError,
1995 "1-n mappings are currently not implemented");
1996 Py_DECREF(x);
1997 goto onError;
1998 }
1999 *s++ = *PyString_AS_STRING(x);
2000 }
2001 else {
2002 /* wrong return value */
2003 PyErr_SetString(PyExc_TypeError,
2004 "character mapping must return integer, None or unicode");
2005 Py_DECREF(x);
2006 goto onError;
2007 }
2008 Py_DECREF(x);
2009 }
2010 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2011 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2012 goto onError;
2013 return v;
2014
2015 onError:
2016 Py_DECREF(v);
2017 return NULL;
2018}
2019
2020PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2021 PyObject *mapping)
2022{
2023 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2024 PyErr_BadArgument();
2025 return NULL;
2026 }
2027 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2028 PyUnicode_GET_SIZE(unicode),
2029 mapping,
2030 NULL);
2031}
2032
2033static
2034int translate_error(const Py_UNICODE **source,
2035 Py_UNICODE **dest,
2036 const char *errors,
2037 const char *details)
2038{
2039 if ((errors == NULL) ||
2040 (strcmp(errors,"strict") == 0)) {
2041 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002042 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 details);
2044 return -1;
2045 }
2046 else if (strcmp(errors,"ignore") == 0) {
2047 return 0;
2048 }
2049 else if (strcmp(errors,"replace") == 0) {
2050 **dest = '?';
2051 (*dest)++;
2052 return 0;
2053 }
2054 else {
2055 PyErr_Format(PyExc_ValueError,
2056 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002057 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 errors);
2059 return -1;
2060 }
2061}
2062
2063PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2064 int size,
2065 PyObject *mapping,
2066 const char *errors)
2067{
2068 PyUnicodeObject *v;
2069 Py_UNICODE *p;
2070
2071 if (mapping == NULL) {
2072 PyErr_BadArgument();
2073 return NULL;
2074 }
2075
2076 /* Output will never be longer than input */
2077 v = _PyUnicode_New(size);
2078 if (v == NULL)
2079 goto onError;
2080 if (size == 0)
2081 goto done;
2082 p = PyUnicode_AS_UNICODE(v);
2083 while (size-- > 0) {
2084 Py_UNICODE ch = *s++;
2085 PyObject *w, *x;
2086
2087 /* Get mapping */
2088 w = PyInt_FromLong(ch);
2089 if (w == NULL)
2090 goto onError;
2091 x = PyObject_GetItem(mapping, w);
2092 Py_DECREF(w);
2093 if (x == NULL) {
2094 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2095 /* No mapping found: default to 1-1 mapping */
2096 PyErr_Clear();
2097 *p++ = ch;
2098 continue;
2099 }
2100 goto onError;
2101 }
2102
2103 /* Apply mapping */
2104 if (PyInt_Check(x))
2105 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2106 else if (x == Py_None) {
2107 /* undefined mapping */
2108 if (translate_error(&s, &p, errors,
2109 "character maps to <undefined>")) {
2110 Py_DECREF(x);
2111 goto onError;
2112 }
2113 }
2114 else if (PyUnicode_Check(x)) {
2115 if (PyUnicode_GET_SIZE(x) != 1) {
2116 /* 1-n mapping */
2117 PyErr_SetString(PyExc_NotImplementedError,
2118 "1-n mappings are currently not implemented");
2119 Py_DECREF(x);
2120 goto onError;
2121 }
2122 *p++ = *PyUnicode_AS_UNICODE(x);
2123 }
2124 else {
2125 /* wrong return value */
2126 PyErr_SetString(PyExc_TypeError,
2127 "translate mapping must return integer, None or unicode");
2128 Py_DECREF(x);
2129 goto onError;
2130 }
2131 Py_DECREF(x);
2132 }
2133 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002134 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2135 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136
2137 done:
2138 return (PyObject *)v;
2139
2140 onError:
2141 Py_XDECREF(v);
2142 return NULL;
2143}
2144
2145PyObject *PyUnicode_Translate(PyObject *str,
2146 PyObject *mapping,
2147 const char *errors)
2148{
2149 PyObject *result;
2150
2151 str = PyUnicode_FromObject(str);
2152 if (str == NULL)
2153 goto onError;
2154 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2155 PyUnicode_GET_SIZE(str),
2156 mapping,
2157 errors);
2158 Py_DECREF(str);
2159 return result;
2160
2161 onError:
2162 Py_XDECREF(str);
2163 return NULL;
2164}
2165
Guido van Rossum9e896b32000-04-05 20:11:21 +00002166/* --- Decimal Encoder ---------------------------------------------------- */
2167
2168int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2169 int length,
2170 char *output,
2171 const char *errors)
2172{
2173 Py_UNICODE *p, *end;
2174
2175 if (output == NULL) {
2176 PyErr_BadArgument();
2177 return -1;
2178 }
2179
2180 p = s;
2181 end = s + length;
2182 while (p < end) {
2183 register Py_UNICODE ch = *p++;
2184 int decimal;
2185
2186 if (Py_UNICODE_ISSPACE(ch)) {
2187 *output++ = ' ';
2188 continue;
2189 }
2190 decimal = Py_UNICODE_TODECIMAL(ch);
2191 if (decimal >= 0) {
2192 *output++ = '0' + decimal;
2193 continue;
2194 }
Guido van Rossumba477042000-04-06 18:18:10 +00002195 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002196 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002197 continue;
2198 }
2199 /* All other characters are considered invalid */
2200 if (errors == NULL || strcmp(errors, "strict") == 0) {
2201 PyErr_SetString(PyExc_ValueError,
2202 "invalid decimal Unicode string");
2203 goto onError;
2204 }
2205 else if (strcmp(errors, "ignore") == 0)
2206 continue;
2207 else if (strcmp(errors, "replace") == 0) {
2208 *output++ = '?';
2209 continue;
2210 }
2211 }
2212 /* 0-terminate the output string */
2213 *output++ = '\0';
2214 return 0;
2215
2216 onError:
2217 return -1;
2218}
2219
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220/* --- Helpers ------------------------------------------------------------ */
2221
2222static
2223int count(PyUnicodeObject *self,
2224 int start,
2225 int end,
2226 PyUnicodeObject *substring)
2227{
2228 int count = 0;
2229
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002230 if (substring->length == 0)
2231 return (end - start + 1);
2232
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 end -= substring->length;
2234
2235 while (start <= end)
2236 if (Py_UNICODE_MATCH(self, start, substring)) {
2237 count++;
2238 start += substring->length;
2239 } else
2240 start++;
2241
2242 return count;
2243}
2244
2245int PyUnicode_Count(PyObject *str,
2246 PyObject *substr,
2247 int start,
2248 int end)
2249{
2250 int result;
2251
2252 str = PyUnicode_FromObject(str);
2253 if (str == NULL)
2254 return -1;
2255 substr = PyUnicode_FromObject(substr);
2256 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002257 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 return -1;
2259 }
2260
2261 result = count((PyUnicodeObject *)str,
2262 start, end,
2263 (PyUnicodeObject *)substr);
2264
2265 Py_DECREF(str);
2266 Py_DECREF(substr);
2267 return result;
2268}
2269
2270static
2271int findstring(PyUnicodeObject *self,
2272 PyUnicodeObject *substring,
2273 int start,
2274 int end,
2275 int direction)
2276{
2277 if (start < 0)
2278 start += self->length;
2279 if (start < 0)
2280 start = 0;
2281
2282 if (substring->length == 0)
2283 return start;
2284
2285 if (end > self->length)
2286 end = self->length;
2287 if (end < 0)
2288 end += self->length;
2289 if (end < 0)
2290 end = 0;
2291
2292 end -= substring->length;
2293
2294 if (direction < 0) {
2295 for (; end >= start; end--)
2296 if (Py_UNICODE_MATCH(self, end, substring))
2297 return end;
2298 } else {
2299 for (; start <= end; start++)
2300 if (Py_UNICODE_MATCH(self, start, substring))
2301 return start;
2302 }
2303
2304 return -1;
2305}
2306
2307int PyUnicode_Find(PyObject *str,
2308 PyObject *substr,
2309 int start,
2310 int end,
2311 int direction)
2312{
2313 int result;
2314
2315 str = PyUnicode_FromObject(str);
2316 if (str == NULL)
2317 return -1;
2318 substr = PyUnicode_FromObject(substr);
2319 if (substr == NULL) {
2320 Py_DECREF(substr);
2321 return -1;
2322 }
2323
2324 result = findstring((PyUnicodeObject *)str,
2325 (PyUnicodeObject *)substr,
2326 start, end, direction);
2327 Py_DECREF(str);
2328 Py_DECREF(substr);
2329 return result;
2330}
2331
2332static
2333int tailmatch(PyUnicodeObject *self,
2334 PyUnicodeObject *substring,
2335 int start,
2336 int end,
2337 int direction)
2338{
2339 if (start < 0)
2340 start += self->length;
2341 if (start < 0)
2342 start = 0;
2343
2344 if (substring->length == 0)
2345 return 1;
2346
2347 if (end > self->length)
2348 end = self->length;
2349 if (end < 0)
2350 end += self->length;
2351 if (end < 0)
2352 end = 0;
2353
2354 end -= substring->length;
2355 if (end < start)
2356 return 0;
2357
2358 if (direction > 0) {
2359 if (Py_UNICODE_MATCH(self, end, substring))
2360 return 1;
2361 } else {
2362 if (Py_UNICODE_MATCH(self, start, substring))
2363 return 1;
2364 }
2365
2366 return 0;
2367}
2368
2369int PyUnicode_Tailmatch(PyObject *str,
2370 PyObject *substr,
2371 int start,
2372 int end,
2373 int direction)
2374{
2375 int result;
2376
2377 str = PyUnicode_FromObject(str);
2378 if (str == NULL)
2379 return -1;
2380 substr = PyUnicode_FromObject(substr);
2381 if (substr == NULL) {
2382 Py_DECREF(substr);
2383 return -1;
2384 }
2385
2386 result = tailmatch((PyUnicodeObject *)str,
2387 (PyUnicodeObject *)substr,
2388 start, end, direction);
2389 Py_DECREF(str);
2390 Py_DECREF(substr);
2391 return result;
2392}
2393
2394static
2395const Py_UNICODE *findchar(const Py_UNICODE *s,
2396 int size,
2397 Py_UNICODE ch)
2398{
2399 /* like wcschr, but doesn't stop at NULL characters */
2400
2401 while (size-- > 0) {
2402 if (*s == ch)
2403 return s;
2404 s++;
2405 }
2406
2407 return NULL;
2408}
2409
2410/* Apply fixfct filter to the Unicode object self and return a
2411 reference to the modified object */
2412
2413static
2414PyObject *fixup(PyUnicodeObject *self,
2415 int (*fixfct)(PyUnicodeObject *s))
2416{
2417
2418 PyUnicodeObject *u;
2419
2420 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2421 self->length);
2422 if (u == NULL)
2423 return NULL;
2424 if (!fixfct(u)) {
2425 /* fixfct should return TRUE if it modified the buffer. If
2426 FALSE, return a reference to the original buffer instead
2427 (to save space, not time) */
2428 Py_INCREF(self);
2429 Py_DECREF(u);
2430 return (PyObject*) self;
2431 }
2432 return (PyObject*) u;
2433}
2434
2435static
2436int fixupper(PyUnicodeObject *self)
2437{
2438 int len = self->length;
2439 Py_UNICODE *s = self->str;
2440 int status = 0;
2441
2442 while (len-- > 0) {
2443 register Py_UNICODE ch;
2444
2445 ch = Py_UNICODE_TOUPPER(*s);
2446 if (ch != *s) {
2447 status = 1;
2448 *s = ch;
2449 }
2450 s++;
2451 }
2452
2453 return status;
2454}
2455
2456static
2457int fixlower(PyUnicodeObject *self)
2458{
2459 int len = self->length;
2460 Py_UNICODE *s = self->str;
2461 int status = 0;
2462
2463 while (len-- > 0) {
2464 register Py_UNICODE ch;
2465
2466 ch = Py_UNICODE_TOLOWER(*s);
2467 if (ch != *s) {
2468 status = 1;
2469 *s = ch;
2470 }
2471 s++;
2472 }
2473
2474 return status;
2475}
2476
2477static
2478int fixswapcase(PyUnicodeObject *self)
2479{
2480 int len = self->length;
2481 Py_UNICODE *s = self->str;
2482 int status = 0;
2483
2484 while (len-- > 0) {
2485 if (Py_UNICODE_ISUPPER(*s)) {
2486 *s = Py_UNICODE_TOLOWER(*s);
2487 status = 1;
2488 } else if (Py_UNICODE_ISLOWER(*s)) {
2489 *s = Py_UNICODE_TOUPPER(*s);
2490 status = 1;
2491 }
2492 s++;
2493 }
2494
2495 return status;
2496}
2497
2498static
2499int fixcapitalize(PyUnicodeObject *self)
2500{
2501 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2502 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2503 return 1;
2504 }
2505 return 0;
2506}
2507
2508static
2509int fixtitle(PyUnicodeObject *self)
2510{
2511 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2512 register Py_UNICODE *e;
2513 int previous_is_cased;
2514
2515 /* Shortcut for single character strings */
2516 if (PyUnicode_GET_SIZE(self) == 1) {
2517 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2518 if (*p != ch) {
2519 *p = ch;
2520 return 1;
2521 }
2522 else
2523 return 0;
2524 }
2525
2526 e = p + PyUnicode_GET_SIZE(self);
2527 previous_is_cased = 0;
2528 for (; p < e; p++) {
2529 register const Py_UNICODE ch = *p;
2530
2531 if (previous_is_cased)
2532 *p = Py_UNICODE_TOLOWER(ch);
2533 else
2534 *p = Py_UNICODE_TOTITLE(ch);
2535
2536 if (Py_UNICODE_ISLOWER(ch) ||
2537 Py_UNICODE_ISUPPER(ch) ||
2538 Py_UNICODE_ISTITLE(ch))
2539 previous_is_cased = 1;
2540 else
2541 previous_is_cased = 0;
2542 }
2543 return 1;
2544}
2545
2546PyObject *PyUnicode_Join(PyObject *separator,
2547 PyObject *seq)
2548{
2549 Py_UNICODE *sep;
2550 int seplen;
2551 PyUnicodeObject *res = NULL;
2552 int reslen = 0;
2553 Py_UNICODE *p;
2554 int seqlen = 0;
2555 int sz = 100;
2556 int i;
2557
2558 seqlen = PySequence_Length(seq);
2559 if (seqlen < 0 && PyErr_Occurred())
2560 return NULL;
2561
2562 if (separator == NULL) {
2563 Py_UNICODE blank = ' ';
2564 sep = &blank;
2565 seplen = 1;
2566 }
2567 else {
2568 separator = PyUnicode_FromObject(separator);
2569 if (separator == NULL)
2570 return NULL;
2571 sep = PyUnicode_AS_UNICODE(separator);
2572 seplen = PyUnicode_GET_SIZE(separator);
2573 }
2574
2575 res = _PyUnicode_New(sz);
2576 if (res == NULL)
2577 goto onError;
2578 p = PyUnicode_AS_UNICODE(res);
2579 reslen = 0;
2580
2581 for (i = 0; i < seqlen; i++) {
2582 int itemlen;
2583 PyObject *item;
2584
2585 item = PySequence_GetItem(seq, i);
2586 if (item == NULL)
2587 goto onError;
2588 if (!PyUnicode_Check(item)) {
2589 PyObject *v;
2590 v = PyUnicode_FromObject(item);
2591 Py_DECREF(item);
2592 item = v;
2593 if (item == NULL)
2594 goto onError;
2595 }
2596 itemlen = PyUnicode_GET_SIZE(item);
2597 while (reslen + itemlen + seplen >= sz) {
2598 if (_PyUnicode_Resize(res, sz*2))
2599 goto onError;
2600 sz *= 2;
2601 p = PyUnicode_AS_UNICODE(res) + reslen;
2602 }
2603 if (i > 0) {
2604 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2605 p += seplen;
2606 reslen += seplen;
2607 }
2608 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2609 p += itemlen;
2610 reslen += itemlen;
2611 Py_DECREF(item);
2612 }
2613 if (_PyUnicode_Resize(res, reslen))
2614 goto onError;
2615
2616 Py_XDECREF(separator);
2617 return (PyObject *)res;
2618
2619 onError:
2620 Py_XDECREF(separator);
2621 Py_DECREF(res);
2622 return NULL;
2623}
2624
2625static
2626PyUnicodeObject *pad(PyUnicodeObject *self,
2627 int left,
2628 int right,
2629 Py_UNICODE fill)
2630{
2631 PyUnicodeObject *u;
2632
2633 if (left < 0)
2634 left = 0;
2635 if (right < 0)
2636 right = 0;
2637
2638 if (left == 0 && right == 0) {
2639 Py_INCREF(self);
2640 return self;
2641 }
2642
2643 u = _PyUnicode_New(left + self->length + right);
2644 if (u) {
2645 if (left)
2646 Py_UNICODE_FILL(u->str, fill, left);
2647 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2648 if (right)
2649 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2650 }
2651
2652 return u;
2653}
2654
/* Append the slice data[left:right] as a new Unicode object to list.

   The macro relies on names of the function it is used in: a
   PyObject *str scratch variable, the result list called list and an
   onError label that is jumped to if creating or appending the slice
   fails.

   The body is wrapped in do { } while (0) so that the macro acts as a
   single statement and must be followed by a semicolon; the arguments
   are parenthesized so that arbitrary expressions can be passed. */

#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2665
2666static
2667PyObject *split_whitespace(PyUnicodeObject *self,
2668 PyObject *list,
2669 int maxcount)
2670{
2671 register int i;
2672 register int j;
2673 int len = self->length;
2674 PyObject *str;
2675
2676 for (i = j = 0; i < len; ) {
2677 /* find a token */
2678 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2679 i++;
2680 j = i;
2681 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2682 i++;
2683 if (j < i) {
2684 if (maxcount-- <= 0)
2685 break;
2686 SPLIT_APPEND(self->str, j, i);
2687 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2688 i++;
2689 j = i;
2690 }
2691 }
2692 if (j < len) {
2693 SPLIT_APPEND(self->str, j, len);
2694 }
2695 return list;
2696
2697 onError:
2698 Py_DECREF(list);
2699 return NULL;
2700}
2701
2702PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002703 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704{
2705 register int i;
2706 register int j;
2707 int len;
2708 PyObject *list;
2709 PyObject *str;
2710 Py_UNICODE *data;
2711
2712 string = PyUnicode_FromObject(string);
2713 if (string == NULL)
2714 return NULL;
2715 data = PyUnicode_AS_UNICODE(string);
2716 len = PyUnicode_GET_SIZE(string);
2717
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 list = PyList_New(0);
2719 if (!list)
2720 goto onError;
2721
2722 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002723 int eol;
2724
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 /* Find a line and append it */
2726 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2727 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728
2729 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002730 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 if (i < len) {
2732 if (data[i] == '\r' && i + 1 < len &&
2733 data[i+1] == '\n')
2734 i += 2;
2735 else
2736 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002737 if (keepends)
2738 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 }
Guido van Rossum86662912000-04-11 15:38:46 +00002740 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 j = i;
2742 }
2743 if (j < len) {
2744 SPLIT_APPEND(data, j, len);
2745 }
2746
2747 Py_DECREF(string);
2748 return list;
2749
2750 onError:
2751 Py_DECREF(list);
2752 Py_DECREF(string);
2753 return NULL;
2754}
2755
2756static
2757PyObject *split_char(PyUnicodeObject *self,
2758 PyObject *list,
2759 Py_UNICODE ch,
2760 int maxcount)
2761{
2762 register int i;
2763 register int j;
2764 int len = self->length;
2765 PyObject *str;
2766
2767 for (i = j = 0; i < len; ) {
2768 if (self->str[i] == ch) {
2769 if (maxcount-- <= 0)
2770 break;
2771 SPLIT_APPEND(self->str, j, i);
2772 i = j = i + 1;
2773 } else
2774 i++;
2775 }
2776 if (j <= len) {
2777 SPLIT_APPEND(self->str, j, len);
2778 }
2779 return list;
2780
2781 onError:
2782 Py_DECREF(list);
2783 return NULL;
2784}
2785
2786static
2787PyObject *split_substring(PyUnicodeObject *self,
2788 PyObject *list,
2789 PyUnicodeObject *substring,
2790 int maxcount)
2791{
2792 register int i;
2793 register int j;
2794 int len = self->length;
2795 int sublen = substring->length;
2796 PyObject *str;
2797
2798 for (i = j = 0; i < len - sublen; ) {
2799 if (Py_UNICODE_MATCH(self, i, substring)) {
2800 if (maxcount-- <= 0)
2801 break;
2802 SPLIT_APPEND(self->str, j, i);
2803 i = j = i + sublen;
2804 } else
2805 i++;
2806 }
2807 if (j <= len) {
2808 SPLIT_APPEND(self->str, j, len);
2809 }
2810 return list;
2811
2812 onError:
2813 Py_DECREF(list);
2814 return NULL;
2815}
2816
2817#undef SPLIT_APPEND
2818
2819static
2820PyObject *split(PyUnicodeObject *self,
2821 PyUnicodeObject *substring,
2822 int maxcount)
2823{
2824 PyObject *list;
2825
2826 if (maxcount < 0)
2827 maxcount = INT_MAX;
2828
2829 list = PyList_New(0);
2830 if (!list)
2831 return NULL;
2832
2833 if (substring == NULL)
2834 return split_whitespace(self,list,maxcount);
2835
2836 else if (substring->length == 1)
2837 return split_char(self,list,substring->str[0],maxcount);
2838
2839 else if (substring->length == 0) {
2840 Py_DECREF(list);
2841 PyErr_SetString(PyExc_ValueError, "empty separator");
2842 return NULL;
2843 }
2844 else
2845 return split_substring(self,list,substring,maxcount);
2846}
2847
2848static
2849PyObject *strip(PyUnicodeObject *self,
2850 int left,
2851 int right)
2852{
2853 Py_UNICODE *p = self->str;
2854 int start = 0;
2855 int end = self->length;
2856
2857 if (left)
2858 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2859 start++;
2860
2861 if (right)
2862 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2863 end--;
2864
2865 if (start == 0 && end == self->length) {
2866 /* couldn't strip anything off, return original string */
2867 Py_INCREF(self);
2868 return (PyObject*) self;
2869 }
2870
2871 return (PyObject*) PyUnicode_FromUnicode(
2872 self->str + start,
2873 end - start
2874 );
2875}
2876
2877static
2878PyObject *replace(PyUnicodeObject *self,
2879 PyUnicodeObject *str1,
2880 PyUnicodeObject *str2,
2881 int maxcount)
2882{
2883 PyUnicodeObject *u;
2884
2885 if (maxcount < 0)
2886 maxcount = INT_MAX;
2887
2888 if (str1->length == 1 && str2->length == 1) {
2889 int i;
2890
2891 /* replace characters */
2892 if (!findchar(self->str, self->length, str1->str[0])) {
2893 /* nothing to replace, return original string */
2894 Py_INCREF(self);
2895 u = self;
2896 } else {
2897 Py_UNICODE u1 = str1->str[0];
2898 Py_UNICODE u2 = str2->str[0];
2899
2900 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2901 self->str,
2902 self->length
2903 );
2904 if (u)
2905 for (i = 0; i < u->length; i++)
2906 if (u->str[i] == u1) {
2907 if (--maxcount < 0)
2908 break;
2909 u->str[i] = u2;
2910 }
2911 }
2912
2913 } else {
2914 int n, i;
2915 Py_UNICODE *p;
2916
2917 /* replace strings */
2918 n = count(self, 0, self->length, str1);
2919 if (n > maxcount)
2920 n = maxcount;
2921 if (n == 0) {
2922 /* nothing to replace, return original string */
2923 Py_INCREF(self);
2924 u = self;
2925 } else {
2926 u = _PyUnicode_New(
2927 self->length + n * (str2->length - str1->length));
2928 if (u) {
2929 i = 0;
2930 p = u->str;
2931 while (i <= self->length - str1->length)
2932 if (Py_UNICODE_MATCH(self, i, str1)) {
2933 /* replace string segment */
2934 Py_UNICODE_COPY(p, str2->str, str2->length);
2935 p += str2->length;
2936 i += str1->length;
2937 if (--n <= 0) {
2938 /* copy remaining part */
2939 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2940 break;
2941 }
2942 } else
2943 *p++ = self->str[i++];
2944 }
2945 }
2946 }
2947
2948 return (PyObject *) u;
2949}
2950
2951/* --- Unicode Object Methods --------------------------------------------- */
2952
2953static char title__doc__[] =
2954"S.title() -> unicode\n\
2955\n\
2956Return a titlecased version of S, i.e. words start with title case\n\
2957characters, all remaining cased characters have lower case.";
2958
2959static PyObject*
2960unicode_title(PyUnicodeObject *self, PyObject *args)
2961{
2962 if (!PyArg_NoArgs(args))
2963 return NULL;
2964 return fixup(self, fixtitle);
2965}
2966
2967static char capitalize__doc__[] =
2968"S.capitalize() -> unicode\n\
2969\n\
2970Return a capitalized version of S, i.e. make the first character\n\
2971have upper case.";
2972
2973static PyObject*
2974unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2975{
2976 if (!PyArg_NoArgs(args))
2977 return NULL;
2978 return fixup(self, fixcapitalize);
2979}
2980
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split self at whitespace,
   capitalize every word and join the words again with single
   blanks. */

static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the list entry: drop the old word, store the new one */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
3021
3022static char center__doc__[] =
3023"S.center(width) -> unicode\n\
3024\n\
3025Return S centered in a Unicode string of length width. Padding is done\n\
3026using spaces.";
3027
3028static PyObject *
3029unicode_center(PyUnicodeObject *self, PyObject *args)
3030{
3031 int marg, left;
3032 int width;
3033
3034 if (!PyArg_ParseTuple(args, "i:center", &width))
3035 return NULL;
3036
3037 if (self->length >= width) {
3038 Py_INCREF(self);
3039 return (PyObject*) self;
3040 }
3041
3042 marg = width - self->length;
3043 left = marg / 2 + (marg & width & 1);
3044
3045 return (PyObject*) pad(self, left, marg - left, ' ');
3046}
3047
3048static int
3049unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3050{
3051 int len1, len2;
3052 Py_UNICODE *s1 = str1->str;
3053 Py_UNICODE *s2 = str2->str;
3054
3055 len1 = str1->length;
3056 len2 = str2->length;
3057
3058 while (len1 > 0 && len2 > 0) {
3059 int cmp = (*s1++) - (*s2++);
3060 if (cmp)
3061 /* This should make Christian happy! */
3062 return (cmp < 0) ? -1 : (cmp != 0);
3063 len1--, len2--;
3064 }
3065
3066 return (len1 < len2) ? -1 : (len1 != len2);
3067}
3068
3069int PyUnicode_Compare(PyObject *left,
3070 PyObject *right)
3071{
3072 PyUnicodeObject *u = NULL, *v = NULL;
3073 int result;
3074
3075 /* Coerce the two arguments */
3076 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3077 if (u == NULL)
3078 goto onError;
3079 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3080 if (v == NULL)
3081 goto onError;
3082
3083 /* Shortcut for emtpy or interned objects */
3084 if (v == u) {
3085 Py_DECREF(u);
3086 Py_DECREF(v);
3087 return 0;
3088 }
3089
3090 result = unicode_compare(u, v);
3091
3092 Py_DECREF(u);
3093 Py_DECREF(v);
3094 return result;
3095
3096onError:
3097 Py_XDECREF(u);
3098 Py_XDECREF(v);
3099 return -1;
3100}
3101
Guido van Rossum403d68b2000-03-13 15:55:09 +00003102int PyUnicode_Contains(PyObject *container,
3103 PyObject *element)
3104{
3105 PyUnicodeObject *u = NULL, *v = NULL;
3106 int result;
3107 register const Py_UNICODE *p, *e;
3108 register Py_UNICODE ch;
3109
3110 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003111 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003112 if (v == NULL) {
3113 PyErr_SetString(PyExc_TypeError,
3114 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003115 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003116 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003117 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3118 if (u == NULL) {
3119 Py_DECREF(v);
3120 goto onError;
3121 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003122
3123 /* Check v in u */
3124 if (PyUnicode_GET_SIZE(v) != 1) {
3125 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003126 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003127 goto onError;
3128 }
3129 ch = *PyUnicode_AS_UNICODE(v);
3130 p = PyUnicode_AS_UNICODE(u);
3131 e = p + PyUnicode_GET_SIZE(u);
3132 result = 0;
3133 while (p < e) {
3134 if (*p++ == ch) {
3135 result = 1;
3136 break;
3137 }
3138 }
3139
3140 Py_DECREF(u);
3141 Py_DECREF(v);
3142 return result;
3143
3144onError:
3145 Py_XDECREF(u);
3146 Py_XDECREF(v);
3147 return -1;
3148}
3149
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150/* Concat to string or Unicode object giving a new Unicode object. */
3151
3152PyObject *PyUnicode_Concat(PyObject *left,
3153 PyObject *right)
3154{
3155 PyUnicodeObject *u = NULL, *v = NULL, *w;
3156
3157 /* Coerce the two arguments */
3158 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3159 if (u == NULL)
3160 goto onError;
3161 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3162 if (v == NULL)
3163 goto onError;
3164
3165 /* Shortcuts */
3166 if (v == unicode_empty) {
3167 Py_DECREF(v);
3168 return (PyObject *)u;
3169 }
3170 if (u == unicode_empty) {
3171 Py_DECREF(u);
3172 return (PyObject *)v;
3173 }
3174
3175 /* Concat the two Unicode strings */
3176 w = _PyUnicode_New(u->length + v->length);
3177 if (w == NULL)
3178 goto onError;
3179 Py_UNICODE_COPY(w->str, u->str, u->length);
3180 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3181
3182 Py_DECREF(u);
3183 Py_DECREF(v);
3184 return (PyObject *)w;
3185
3186onError:
3187 Py_XDECREF(u);
3188 Py_XDECREF(v);
3189 return NULL;
3190}
3191
3192static char count__doc__[] =
3193"S.count(sub[, start[, end]]) -> int\n\
3194\n\
3195Return the number of occurrences of substring sub in Unicode string\n\
3196S[start:end]. Optional arguments start and end are\n\
3197interpreted as in slice notation.";
3198
3199static PyObject *
3200unicode_count(PyUnicodeObject *self, PyObject *args)
3201{
3202 PyUnicodeObject *substring;
3203 int start = 0;
3204 int end = INT_MAX;
3205 PyObject *result;
3206
Guido van Rossumb8872e62000-05-09 14:14:27 +00003207 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3208 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 return NULL;
3210
3211 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3212 (PyObject *)substring);
3213 if (substring == NULL)
3214 return NULL;
3215
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 if (start < 0)
3217 start += self->length;
3218 if (start < 0)
3219 start = 0;
3220 if (end > self->length)
3221 end = self->length;
3222 if (end < 0)
3223 end += self->length;
3224 if (end < 0)
3225 end = 0;
3226
3227 result = PyInt_FromLong((long) count(self, start, end, substring));
3228
3229 Py_DECREF(substring);
3230 return result;
3231}
3232
3233static char encode__doc__[] =
3234"S.encode([encoding[,errors]]) -> string\n\
3235\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003236Return an encoded string version of S. Default encoding is the current\n\
3237default string encoding. errors may be given to set a different error\n\
3238handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3239a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240
3241static PyObject *
3242unicode_encode(PyUnicodeObject *self, PyObject *args)
3243{
3244 char *encoding = NULL;
3245 char *errors = NULL;
3246 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3247 return NULL;
3248 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3249}
3250
3251static char expandtabs__doc__[] =
3252"S.expandtabs([tabsize]) -> unicode\n\
3253\n\
3254Return a copy of S where all tab characters are expanded using spaces.\n\
3255If tabsize is not given, a tab size of 8 characters is assumed.";
3256
3257static PyObject*
3258unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3259{
3260 Py_UNICODE *e;
3261 Py_UNICODE *p;
3262 Py_UNICODE *q;
3263 int i, j;
3264 PyUnicodeObject *u;
3265 int tabsize = 8;
3266
3267 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3268 return NULL;
3269
3270 /* First pass: determine size of ouput string */
3271 i = j = 0;
3272 e = self->str + self->length;
3273 for (p = self->str; p < e; p++)
3274 if (*p == '\t') {
3275 if (tabsize > 0)
3276 j += tabsize - (j % tabsize);
3277 }
3278 else {
3279 j++;
3280 if (*p == '\n' || *p == '\r') {
3281 i += j;
3282 j = 0;
3283 }
3284 }
3285
3286 /* Second pass: create output string and fill it */
3287 u = _PyUnicode_New(i + j);
3288 if (!u)
3289 return NULL;
3290
3291 j = 0;
3292 q = u->str;
3293
3294 for (p = self->str; p < e; p++)
3295 if (*p == '\t') {
3296 if (tabsize > 0) {
3297 i = tabsize - (j % tabsize);
3298 j += i;
3299 while (i--)
3300 *q++ = ' ';
3301 }
3302 }
3303 else {
3304 j++;
3305 *q++ = *p;
3306 if (*p == '\n' || *p == '\r')
3307 j = 0;
3308 }
3309
3310 return (PyObject*) u;
3311}
3312
3313static char find__doc__[] =
3314"S.find(sub [,start [,end]]) -> int\n\
3315\n\
3316Return the lowest index in S where substring sub is found,\n\
3317such that sub is contained within s[start,end]. Optional\n\
3318arguments start and end are interpreted as in slice notation.\n\
3319\n\
3320Return -1 on failure.";
3321
3322static PyObject *
3323unicode_find(PyUnicodeObject *self, PyObject *args)
3324{
3325 PyUnicodeObject *substring;
3326 int start = 0;
3327 int end = INT_MAX;
3328 PyObject *result;
3329
Guido van Rossumb8872e62000-05-09 14:14:27 +00003330 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3331 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 return NULL;
3333 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3334 (PyObject *)substring);
3335 if (substring == NULL)
3336 return NULL;
3337
3338 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3339
3340 Py_DECREF(substring);
3341 return result;
3342}
3343
3344static PyObject *
3345unicode_getitem(PyUnicodeObject *self, int index)
3346{
3347 if (index < 0 || index >= self->length) {
3348 PyErr_SetString(PyExc_IndexError, "string index out of range");
3349 return NULL;
3350 }
3351
3352 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3353}
3354
3355static long
3356unicode_hash(PyUnicodeObject *self)
3357{
3358 long hash;
3359 PyObject *utf8;
3360
3361 /* Since Unicode objects compare equal to their UTF-8 string
3362 counterparts, they should also use the UTF-8 strings as basis
3363 for their hash value. This is needed to assure that strings and
3364 Unicode objects behave in the same way as dictionary
3365 keys. Unfortunately, this costs some performance and also some
3366 memory if the cached UTF-8 representation is not used later
3367 on. */
3368 if (self->hash != -1)
3369 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003370 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 if (utf8 == NULL)
3372 return -1;
3373 hash = PyObject_Hash(utf8);
3374 if (hash == -1)
3375 return -1;
3376 self->hash = hash;
3377 return hash;
3378}
3379
3380static char index__doc__[] =
3381"S.index(sub [,start [,end]]) -> int\n\
3382\n\
3383Like S.find() but raise ValueError when the substring is not found.";
3384
3385static PyObject *
3386unicode_index(PyUnicodeObject *self, PyObject *args)
3387{
3388 int result;
3389 PyUnicodeObject *substring;
3390 int start = 0;
3391 int end = INT_MAX;
3392
Guido van Rossumb8872e62000-05-09 14:14:27 +00003393 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3394 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 return NULL;
3396
3397 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3398 (PyObject *)substring);
3399 if (substring == NULL)
3400 return NULL;
3401
3402 result = findstring(self, substring, start, end, 1);
3403
3404 Py_DECREF(substring);
3405 if (result < 0) {
3406 PyErr_SetString(PyExc_ValueError, "substring not found");
3407 return NULL;
3408 }
3409 return PyInt_FromLong(result);
3410}
3411
3412static char islower__doc__[] =
3413"S.islower() -> int\n\
3414\n\
3415Return 1 if all cased characters in S are lowercase and there is\n\
3416at least one cased character in S, 0 otherwise.";
3417
3418static PyObject*
3419unicode_islower(PyUnicodeObject *self, PyObject *args)
3420{
3421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3422 register const Py_UNICODE *e;
3423 int cased;
3424
3425 if (!PyArg_NoArgs(args))
3426 return NULL;
3427
3428 /* Shortcut for single character strings */
3429 if (PyUnicode_GET_SIZE(self) == 1)
3430 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3431
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003432 /* Special case for empty strings */
3433 if (PyString_GET_SIZE(self) == 0)
3434 return PyInt_FromLong(0);
3435
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436 e = p + PyUnicode_GET_SIZE(self);
3437 cased = 0;
3438 for (; p < e; p++) {
3439 register const Py_UNICODE ch = *p;
3440
3441 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3442 return PyInt_FromLong(0);
3443 else if (!cased && Py_UNICODE_ISLOWER(ch))
3444 cased = 1;
3445 }
3446 return PyInt_FromLong(cased);
3447}
3448
3449static char isupper__doc__[] =
3450"S.isupper() -> int\n\
3451\n\
3452Return 1 if all cased characters in S are uppercase and there is\n\
3453at least one cased character in S, 0 otherwise.";
3454
3455static PyObject*
3456unicode_isupper(PyUnicodeObject *self, PyObject *args)
3457{
3458 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3459 register const Py_UNICODE *e;
3460 int cased;
3461
3462 if (!PyArg_NoArgs(args))
3463 return NULL;
3464
3465 /* Shortcut for single character strings */
3466 if (PyUnicode_GET_SIZE(self) == 1)
3467 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3468
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003469 /* Special case for empty strings */
3470 if (PyString_GET_SIZE(self) == 0)
3471 return PyInt_FromLong(0);
3472
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473 e = p + PyUnicode_GET_SIZE(self);
3474 cased = 0;
3475 for (; p < e; p++) {
3476 register const Py_UNICODE ch = *p;
3477
3478 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3479 return PyInt_FromLong(0);
3480 else if (!cased && Py_UNICODE_ISUPPER(ch))
3481 cased = 1;
3482 }
3483 return PyInt_FromLong(cased);
3484}
3485
3486static char istitle__doc__[] =
3487"S.istitle() -> int\n\
3488\n\
3489Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3490may only follow uncased characters and lowercase characters only cased\n\
3491ones. Return 0 otherwise.";
3492
3493static PyObject*
3494unicode_istitle(PyUnicodeObject *self, PyObject *args)
3495{
3496 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3497 register const Py_UNICODE *e;
3498 int cased, previous_is_cased;
3499
3500 if (!PyArg_NoArgs(args))
3501 return NULL;
3502
3503 /* Shortcut for single character strings */
3504 if (PyUnicode_GET_SIZE(self) == 1)
3505 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3506 (Py_UNICODE_ISUPPER(*p) != 0));
3507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003508 /* Special case for empty strings */
3509 if (PyString_GET_SIZE(self) == 0)
3510 return PyInt_FromLong(0);
3511
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512 e = p + PyUnicode_GET_SIZE(self);
3513 cased = 0;
3514 previous_is_cased = 0;
3515 for (; p < e; p++) {
3516 register const Py_UNICODE ch = *p;
3517
3518 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3519 if (previous_is_cased)
3520 return PyInt_FromLong(0);
3521 previous_is_cased = 1;
3522 cased = 1;
3523 }
3524 else if (Py_UNICODE_ISLOWER(ch)) {
3525 if (!previous_is_cased)
3526 return PyInt_FromLong(0);
3527 previous_is_cased = 1;
3528 cased = 1;
3529 }
3530 else
3531 previous_is_cased = 0;
3532 }
3533 return PyInt_FromLong(cased);
3534}
3535
3536static char isspace__doc__[] =
3537"S.isspace() -> int\n\
3538\n\
3539Return 1 if there are only whitespace characters in S,\n\
35400 otherwise.";
3541
3542static PyObject*
3543unicode_isspace(PyUnicodeObject *self, PyObject *args)
3544{
3545 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3546 register const Py_UNICODE *e;
3547
3548 if (!PyArg_NoArgs(args))
3549 return NULL;
3550
3551 /* Shortcut for single character strings */
3552 if (PyUnicode_GET_SIZE(self) == 1 &&
3553 Py_UNICODE_ISSPACE(*p))
3554 return PyInt_FromLong(1);
3555
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003556 /* Special case for empty strings */
3557 if (PyString_GET_SIZE(self) == 0)
3558 return PyInt_FromLong(0);
3559
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 e = p + PyUnicode_GET_SIZE(self);
3561 for (; p < e; p++) {
3562 if (!Py_UNICODE_ISSPACE(*p))
3563 return PyInt_FromLong(0);
3564 }
3565 return PyInt_FromLong(1);
3566}
3567
3568static char isdecimal__doc__[] =
3569"S.isdecimal() -> int\n\
3570\n\
3571Return 1 if there are only decimal characters in S,\n\
35720 otherwise.";
3573
3574static PyObject*
3575unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3576{
3577 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3578 register const Py_UNICODE *e;
3579
3580 if (!PyArg_NoArgs(args))
3581 return NULL;
3582
3583 /* Shortcut for single character strings */
3584 if (PyUnicode_GET_SIZE(self) == 1 &&
3585 Py_UNICODE_ISDECIMAL(*p))
3586 return PyInt_FromLong(1);
3587
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003588 /* Special case for empty strings */
3589 if (PyString_GET_SIZE(self) == 0)
3590 return PyInt_FromLong(0);
3591
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 e = p + PyUnicode_GET_SIZE(self);
3593 for (; p < e; p++) {
3594 if (!Py_UNICODE_ISDECIMAL(*p))
3595 return PyInt_FromLong(0);
3596 }
3597 return PyInt_FromLong(1);
3598}
3599
3600static char isdigit__doc__[] =
3601"S.isdigit() -> int\n\
3602\n\
3603Return 1 if there are only digit characters in S,\n\
36040 otherwise.";
3605
3606static PyObject*
3607unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3608{
3609 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3610 register const Py_UNICODE *e;
3611
3612 if (!PyArg_NoArgs(args))
3613 return NULL;
3614
3615 /* Shortcut for single character strings */
3616 if (PyUnicode_GET_SIZE(self) == 1 &&
3617 Py_UNICODE_ISDIGIT(*p))
3618 return PyInt_FromLong(1);
3619
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003620 /* Special case for empty strings */
3621 if (PyString_GET_SIZE(self) == 0)
3622 return PyInt_FromLong(0);
3623
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 e = p + PyUnicode_GET_SIZE(self);
3625 for (; p < e; p++) {
3626 if (!Py_UNICODE_ISDIGIT(*p))
3627 return PyInt_FromLong(0);
3628 }
3629 return PyInt_FromLong(1);
3630}
3631
3632static char isnumeric__doc__[] =
3633"S.isnumeric() -> int\n\
3634\n\
3635Return 1 if there are only numeric characters in S,\n\
36360 otherwise.";
3637
3638static PyObject*
3639unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3640{
3641 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3642 register const Py_UNICODE *e;
3643
3644 if (!PyArg_NoArgs(args))
3645 return NULL;
3646
3647 /* Shortcut for single character strings */
3648 if (PyUnicode_GET_SIZE(self) == 1 &&
3649 Py_UNICODE_ISNUMERIC(*p))
3650 return PyInt_FromLong(1);
3651
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003652 /* Special case for empty strings */
3653 if (PyString_GET_SIZE(self) == 0)
3654 return PyInt_FromLong(0);
3655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 e = p + PyUnicode_GET_SIZE(self);
3657 for (; p < e; p++) {
3658 if (!Py_UNICODE_ISNUMERIC(*p))
3659 return PyInt_FromLong(0);
3660 }
3661 return PyInt_FromLong(1);
3662}
3663
3664static char join__doc__[] =
3665"S.join(sequence) -> unicode\n\
3666\n\
3667Return a string which is the concatenation of the strings in the\n\
3668sequence. The separator between elements is S.";
3669
3670static PyObject*
3671unicode_join(PyUnicodeObject *self, PyObject *args)
3672{
3673 PyObject *data;
3674 if (!PyArg_ParseTuple(args, "O:join", &data))
3675 return NULL;
3676
3677 return PyUnicode_Join((PyObject *)self, data);
3678}
3679
3680static int
3681unicode_length(PyUnicodeObject *self)
3682{
3683 return self->length;
3684}
3685
3686static char ljust__doc__[] =
3687"S.ljust(width) -> unicode\n\
3688\n\
3689Return S left justified in a Unicode string of length width. Padding is\n\
3690done using spaces.";
3691
3692static PyObject *
3693unicode_ljust(PyUnicodeObject *self, PyObject *args)
3694{
3695 int width;
3696 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3697 return NULL;
3698
3699 if (self->length >= width) {
3700 Py_INCREF(self);
3701 return (PyObject*) self;
3702 }
3703
3704 return (PyObject*) pad(self, 0, width - self->length, ' ');
3705}
3706
3707static char lower__doc__[] =
3708"S.lower() -> unicode\n\
3709\n\
3710Return a copy of the string S converted to lowercase.";
3711
3712static PyObject*
3713unicode_lower(PyUnicodeObject *self, PyObject *args)
3714{
3715 if (!PyArg_NoArgs(args))
3716 return NULL;
3717 return fixup(self, fixlower);
3718}
3719
3720static char lstrip__doc__[] =
3721"S.lstrip() -> unicode\n\
3722\n\
3723Return a copy of the string S with leading whitespace removed.";
3724
3725static PyObject *
3726unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3727{
3728 if (!PyArg_NoArgs(args))
3729 return NULL;
3730 return strip(self, 1, 0);
3731}
3732
3733static PyObject*
3734unicode_repeat(PyUnicodeObject *str, int len)
3735{
3736 PyUnicodeObject *u;
3737 Py_UNICODE *p;
3738
3739 if (len < 0)
3740 len = 0;
3741
3742 if (len == 1) {
3743 /* no repeat, return original string */
3744 Py_INCREF(str);
3745 return (PyObject*) str;
3746 }
3747
3748 u = _PyUnicode_New(len * str->length);
3749 if (!u)
3750 return NULL;
3751
3752 p = u->str;
3753
3754 while (len-- > 0) {
3755 Py_UNICODE_COPY(p, str->str, str->length);
3756 p += str->length;
3757 }
3758
3759 return (PyObject*) u;
3760}
3761
3762PyObject *PyUnicode_Replace(PyObject *obj,
3763 PyObject *subobj,
3764 PyObject *replobj,
3765 int maxcount)
3766{
3767 PyObject *self;
3768 PyObject *str1;
3769 PyObject *str2;
3770 PyObject *result;
3771
3772 self = PyUnicode_FromObject(obj);
3773 if (self == NULL)
3774 return NULL;
3775 str1 = PyUnicode_FromObject(subobj);
3776 if (str1 == NULL) {
3777 Py_DECREF(self);
3778 return NULL;
3779 }
3780 str2 = PyUnicode_FromObject(replobj);
3781 if (str2 == NULL) {
3782 Py_DECREF(self);
3783 Py_DECREF(str1);
3784 return NULL;
3785 }
3786 result = replace((PyUnicodeObject *)self,
3787 (PyUnicodeObject *)str1,
3788 (PyUnicodeObject *)str2,
3789 maxcount);
3790 Py_DECREF(self);
3791 Py_DECREF(str1);
3792 Py_DECREF(str2);
3793 return result;
3794}
3795
3796static char replace__doc__[] =
3797"S.replace (old, new[, maxsplit]) -> unicode\n\
3798\n\
3799Return a copy of S with all occurrences of substring\n\
3800old replaced by new. If the optional argument maxsplit is\n\
3801given, only the first maxsplit occurrences are replaced.";
3802
3803static PyObject*
3804unicode_replace(PyUnicodeObject *self, PyObject *args)
3805{
3806 PyUnicodeObject *str1;
3807 PyUnicodeObject *str2;
3808 int maxcount = -1;
3809 PyObject *result;
3810
3811 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3812 return NULL;
3813 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3814 if (str1 == NULL)
3815 return NULL;
3816 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3817 if (str2 == NULL)
3818 return NULL;
3819
3820 result = replace(self, str1, str2, maxcount);
3821
3822 Py_DECREF(str1);
3823 Py_DECREF(str2);
3824 return result;
3825}
3826
3827static
3828PyObject *unicode_repr(PyObject *unicode)
3829{
3830 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3831 PyUnicode_GET_SIZE(unicode),
3832 1);
3833}
3834
3835static char rfind__doc__[] =
3836"S.rfind(sub [,start [,end]]) -> int\n\
3837\n\
3838Return the highest index in S where substring sub is found,\n\
3839such that sub is contained within s[start,end]. Optional\n\
3840arguments start and end are interpreted as in slice notation.\n\
3841\n\
3842Return -1 on failure.";
3843
3844static PyObject *
3845unicode_rfind(PyUnicodeObject *self, PyObject *args)
3846{
3847 PyUnicodeObject *substring;
3848 int start = 0;
3849 int end = INT_MAX;
3850 PyObject *result;
3851
Guido van Rossumb8872e62000-05-09 14:14:27 +00003852 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
3853 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 return NULL;
3855 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3856 (PyObject *)substring);
3857 if (substring == NULL)
3858 return NULL;
3859
3860 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3861
3862 Py_DECREF(substring);
3863 return result;
3864}
3865
3866static char rindex__doc__[] =
3867"S.rindex(sub [,start [,end]]) -> int\n\
3868\n\
3869Like S.rfind() but raise ValueError when the substring is not found.";
3870
3871static PyObject *
3872unicode_rindex(PyUnicodeObject *self, PyObject *args)
3873{
3874 int result;
3875 PyUnicodeObject *substring;
3876 int start = 0;
3877 int end = INT_MAX;
3878
Guido van Rossumb8872e62000-05-09 14:14:27 +00003879 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
3880 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 return NULL;
3882 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3883 (PyObject *)substring);
3884 if (substring == NULL)
3885 return NULL;
3886
3887 result = findstring(self, substring, start, end, -1);
3888
3889 Py_DECREF(substring);
3890 if (result < 0) {
3891 PyErr_SetString(PyExc_ValueError, "substring not found");
3892 return NULL;
3893 }
3894 return PyInt_FromLong(result);
3895}
3896
3897static char rjust__doc__[] =
3898"S.rjust(width) -> unicode\n\
3899\n\
3900Return S right justified in a Unicode string of length width. Padding is\n\
3901done using spaces.";
3902
3903static PyObject *
3904unicode_rjust(PyUnicodeObject *self, PyObject *args)
3905{
3906 int width;
3907 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3908 return NULL;
3909
3910 if (self->length >= width) {
3911 Py_INCREF(self);
3912 return (PyObject*) self;
3913 }
3914
3915 return (PyObject*) pad(self, width - self->length, 0, ' ');
3916}
3917
3918static char rstrip__doc__[] =
3919"S.rstrip() -> unicode\n\
3920\n\
3921Return a copy of the string S with trailing whitespace removed.";
3922
3923static PyObject *
3924unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3925{
3926 if (!PyArg_NoArgs(args))
3927 return NULL;
3928 return strip(self, 0, 1);
3929}
3930
3931static PyObject*
3932unicode_slice(PyUnicodeObject *self, int start, int end)
3933{
3934 /* standard clamping */
3935 if (start < 0)
3936 start = 0;
3937 if (end < 0)
3938 end = 0;
3939 if (end > self->length)
3940 end = self->length;
3941 if (start == 0 && end == self->length) {
3942 /* full slice, return original string */
3943 Py_INCREF(self);
3944 return (PyObject*) self;
3945 }
3946 if (start > end)
3947 start = end;
3948 /* copy slice */
3949 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3950 end - start);
3951}
3952
3953PyObject *PyUnicode_Split(PyObject *s,
3954 PyObject *sep,
3955 int maxsplit)
3956{
3957 PyObject *result;
3958
3959 s = PyUnicode_FromObject(s);
3960 if (s == NULL)
3961 return NULL;
3962 if (sep != NULL) {
3963 sep = PyUnicode_FromObject(sep);
3964 if (sep == NULL) {
3965 Py_DECREF(s);
3966 return NULL;
3967 }
3968 }
3969
3970 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3971
3972 Py_DECREF(s);
3973 Py_XDECREF(sep);
3974 return result;
3975}
3976
3977static char split__doc__[] =
3978"S.split([sep [,maxsplit]]) -> list of strings\n\
3979\n\
3980Return a list of the words in S, using sep as the\n\
3981delimiter string. If maxsplit is given, at most maxsplit\n\
3982splits are done. If sep is not specified, any whitespace string\n\
3983is a separator.";
3984
3985static PyObject*
3986unicode_split(PyUnicodeObject *self, PyObject *args)
3987{
3988 PyObject *substring = Py_None;
3989 int maxcount = -1;
3990
3991 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3992 return NULL;
3993
3994 if (substring == Py_None)
3995 return split(self, NULL, maxcount);
3996 else if (PyUnicode_Check(substring))
3997 return split(self, (PyUnicodeObject *)substring, maxcount);
3998 else
3999 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4000}
4001
4002static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004003"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004\n\
4005Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004006Line breaks are not included in the resulting list unless keepends\n\
4007is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008
4009static PyObject*
4010unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4011{
Guido van Rossum86662912000-04-11 15:38:46 +00004012 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013
Guido van Rossum86662912000-04-11 15:38:46 +00004014 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 return NULL;
4016
Guido van Rossum86662912000-04-11 15:38:46 +00004017 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018}
4019
4020static
4021PyObject *unicode_str(PyUnicodeObject *self)
4022{
Fred Drakee4315f52000-05-09 19:53:39 +00004023 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024}
4025
4026static char strip__doc__[] =
4027"S.strip() -> unicode\n\
4028\n\
4029Return a copy of S with leading and trailing whitespace removed.";
4030
4031static PyObject *
4032unicode_strip(PyUnicodeObject *self, PyObject *args)
4033{
4034 if (!PyArg_NoArgs(args))
4035 return NULL;
4036 return strip(self, 1, 1);
4037}
4038
4039static char swapcase__doc__[] =
4040"S.swapcase() -> unicode\n\
4041\n\
4042Return a copy of S with uppercase characters converted to lowercase\n\
4043and vice versa.";
4044
4045static PyObject*
4046unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4047{
4048 if (!PyArg_NoArgs(args))
4049 return NULL;
4050 return fixup(self, fixswapcase);
4051}
4052
4053static char translate__doc__[] =
4054"S.translate(table) -> unicode\n\
4055\n\
4056Return a copy of the string S, where all characters have been mapped\n\
4057through the given translation table, which must be a mapping of\n\
4058Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4059are left untouched. Characters mapped to None are deleted.";
4060
4061static PyObject*
4062unicode_translate(PyUnicodeObject *self, PyObject *args)
4063{
4064 PyObject *table;
4065
4066 if (!PyArg_ParseTuple(args, "O:translate", &table))
4067 return NULL;
4068 return PyUnicode_TranslateCharmap(self->str,
4069 self->length,
4070 table,
4071 "ignore");
4072}
4073
4074static char upper__doc__[] =
4075"S.upper() -> unicode\n\
4076\n\
4077Return a copy of S converted to uppercase.";
4078
4079static PyObject*
4080unicode_upper(PyUnicodeObject *self, PyObject *args)
4081{
4082 if (!PyArg_NoArgs(args))
4083 return NULL;
4084 return fixup(self, fixupper);
4085}
4086
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implements S.zfill(width) (currently compiled out).

   Returns self with a new reference when it is already at least width
   characters long.  Otherwise left-pads with '0' via pad() and, if the
   original string started with '+' or '-', moves that sign to the
   front of the padded result.  Returns NULL when argument parsing
   fails or pad() yields no object. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* Guard the dereference below; presumably pad() reports failure
       by returning NULL -- confirm against its definition. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4122
#if 0
/* Debugging helper (currently compiled out): returns the value of
   unicode_freelist_size as a new int object, or NULL when arguments
   were passed. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4132
4133static char startswith__doc__[] =
4134"S.startswith(prefix[, start[, end]]) -> int\n\
4135\n\
4136Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4137optional start, test S beginning at that position. With optional end, stop\n\
4138comparing S at that position.";
4139
4140static PyObject *
4141unicode_startswith(PyUnicodeObject *self,
4142 PyObject *args)
4143{
4144 PyUnicodeObject *substring;
4145 int start = 0;
4146 int end = INT_MAX;
4147 PyObject *result;
4148
Guido van Rossumb8872e62000-05-09 14:14:27 +00004149 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4150 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 return NULL;
4152 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4153 (PyObject *)substring);
4154 if (substring == NULL)
4155 return NULL;
4156
4157 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4158
4159 Py_DECREF(substring);
4160 return result;
4161}
4162
4163
4164static char endswith__doc__[] =
4165"S.endswith(suffix[, start[, end]]) -> int\n\
4166\n\
4167Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4168optional start, test S beginning at that position. With optional end, stop\n\
4169comparing S at that position.";
4170
4171static PyObject *
4172unicode_endswith(PyUnicodeObject *self,
4173 PyObject *args)
4174{
4175 PyUnicodeObject *substring;
4176 int start = 0;
4177 int end = INT_MAX;
4178 PyObject *result;
4179
Guido van Rossumb8872e62000-05-09 14:14:27 +00004180 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4181 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 return NULL;
4183 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4184 (PyObject *)substring);
4185 if (substring == NULL)
4186 return NULL;
4187
4188 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4189
4190 Py_DECREF(substring);
4191 return result;
4192}
4193
4194
4195static PyMethodDef unicode_methods[] = {
4196
4197 /* Order is according to common usage: often used methods should
4198 appear first, since lookup is done sequentially. */
4199
4200 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4201 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4202 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4203 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4204 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4205 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4206 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4207 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4208 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4209 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4210 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4211 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4212 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4213 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4214/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4215 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4216 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4217 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4218 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4219 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4220 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4221 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4222 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4223 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4224 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4225 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4226 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4227 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4228 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4229 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4230 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4231 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4232 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4233#if 0
4234 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4235 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4236#endif
4237
4238#if 0
4239 /* This one is just used for debugging the implementation. */
4240 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4241#endif
4242
4243 {NULL, NULL}
4244};
4245
4246static PyObject *
4247unicode_getattr(PyUnicodeObject *self, char *name)
4248{
4249 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4250}
4251
4252static PySequenceMethods unicode_as_sequence = {
4253 (inquiry) unicode_length, /* sq_length */
4254 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4255 (intargfunc) unicode_repeat, /* sq_repeat */
4256 (intargfunc) unicode_getitem, /* sq_item */
4257 (intintargfunc) unicode_slice, /* sq_slice */
4258 0, /* sq_ass_item */
4259 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004260 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261};
4262
4263static int
4264unicode_buffer_getreadbuf(PyUnicodeObject *self,
4265 int index,
4266 const void **ptr)
4267{
4268 if (index != 0) {
4269 PyErr_SetString(PyExc_SystemError,
4270 "accessing non-existent unicode segment");
4271 return -1;
4272 }
4273 *ptr = (void *) self->str;
4274 return PyUnicode_GET_DATA_SIZE(self);
4275}
4276
4277static int
4278unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4279 const void **ptr)
4280{
4281 PyErr_SetString(PyExc_TypeError,
4282 "cannot use unicode as modifyable buffer");
4283 return -1;
4284}
4285
4286static int
4287unicode_buffer_getsegcount(PyUnicodeObject *self,
4288 int *lenp)
4289{
4290 if (lenp)
4291 *lenp = PyUnicode_GET_DATA_SIZE(self);
4292 return 1;
4293}
4294
4295static int
4296unicode_buffer_getcharbuf(PyUnicodeObject *self,
4297 int index,
4298 const void **ptr)
4299{
4300 PyObject *str;
4301
4302 if (index != 0) {
4303 PyErr_SetString(PyExc_SystemError,
4304 "accessing non-existent unicode segment");
4305 return -1;
4306 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004307 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308 if (str == NULL)
4309 return -1;
4310 *ptr = (void *) PyString_AS_STRING(str);
4311 return PyString_GET_SIZE(str);
4312}
4313
4314/* Helpers for PyUnicode_Format() */
4315
4316static PyObject *
4317getnextarg(args, arglen, p_argidx)
4318 PyObject *args;
4319int arglen;
4320int *p_argidx;
4321{
4322 int argidx = *p_argidx;
4323 if (argidx < arglen) {
4324 (*p_argidx)++;
4325 if (arglen < 0)
4326 return args;
4327 else
4328 return PyTuple_GetItem(args, argidx);
4329 }
4330 PyErr_SetString(PyExc_TypeError,
4331 "not enough arguments for format string");
4332 return NULL;
4333}
4334
/* Bit flags for format specifiers; each occupies its own bit so they
   can be OR-ed together.  F_ALT selects the '#' form in formatfloat()
   and formatint().  NOTE(review): the other names presumably mirror
   the printf-style '-', '+', ' ' and '0' flags -- confirm against the
   parser in PyUnicode_Format(). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4340
4341static
4342#ifdef HAVE_STDARG_PROTOTYPES
4343int usprintf(register Py_UNICODE *buffer, char *format, ...)
4344#else
4345int usprintf(va_alist) va_dcl
4346#endif
4347{
4348 register int i;
4349 int len;
4350 va_list va;
4351 char *charbuffer;
4352#ifdef HAVE_STDARG_PROTOTYPES
4353 va_start(va, format);
4354#else
4355 Py_UNICODE *args;
4356 char *format;
4357
4358 va_start(va);
4359 buffer = va_arg(va, Py_UNICODE *);
4360 format = va_arg(va, char *);
4361#endif
4362
4363 /* First, format the string as char array, then expand to Py_UNICODE
4364 array. */
4365 charbuffer = (char *)buffer;
4366 len = vsprintf(charbuffer, format, va);
4367 for (i = len - 1; i >= 0; i--)
4368 buffer[i] = (Py_UNICODE) charbuffer[i];
4369
4370 va_end(va);
4371 return len;
4372}
4373
4374static int
4375formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004376 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377 int flags,
4378 int prec,
4379 int type,
4380 PyObject *v)
4381{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004382 /* fmt = '%#.' + `prec` + `type`
4383 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384 char fmt[20];
4385 double x;
4386
4387 x = PyFloat_AsDouble(v);
4388 if (x == -1.0 && PyErr_Occurred())
4389 return -1;
4390 if (prec < 0)
4391 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4393 type = 'g';
4394 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004395 /* worst case length calc to ensure no buffer overrun:
4396 fmt = %#.<prec>g
4397 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4398 for any double rep.)
4399 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4400 If prec=0 the effective precision is 1 (the leading digit is
4401 always given), therefore increase by one to 10+prec. */
4402 if (buflen <= (size_t)10 + (size_t)prec) {
4403 PyErr_SetString(PyExc_OverflowError,
4404 "formatted float is too long (precision too long?)");
4405 return -1;
4406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407 return usprintf(buf, fmt, x);
4408}
4409
4410static int
4411formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004412 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 int flags,
4414 int prec,
4415 int type,
4416 PyObject *v)
4417{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004418 /* fmt = '%#.' + `prec` + 'l' + `type`
4419 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 char fmt[20];
4421 long x;
4422
4423 x = PyInt_AsLong(v);
4424 if (x == -1 && PyErr_Occurred())
4425 return -1;
4426 if (prec < 0)
4427 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004428 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4429 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4430 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4431 PyErr_SetString(PyExc_OverflowError,
4432 "formatted integer is too long (precision too long?)");
4433 return -1;
4434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4436 return usprintf(buf, fmt, x);
4437}
4438
4439static int
4440formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004441 size_t buflen,
4442 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004444 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004445 if (PyUnicode_Check(v)) {
4446 if (PyUnicode_GET_SIZE(v) != 1)
4447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004449 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004451 else if (PyString_Check(v)) {
4452 if (PyString_GET_SIZE(v) != 1)
4453 goto onError;
4454 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456
4457 else {
4458 /* Integer input truncated to a character */
4459 long x;
4460 x = PyInt_AsLong(v);
4461 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 buf[0] = (char) x;
4464 }
4465 buf[1] = '\0';
4466 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004467
4468 onError:
4469 PyErr_SetString(PyExc_TypeError,
4470 "%c requires int or char");
4471 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472}
4473
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004474/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4475
4476 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4477 chars are formatted. XXX This is a magic number. Each formatting
4478 routine does bounds checking to ensure no overflow, but a better
4479 solution may be to malloc a buffer of appropriate size for each
4480 format. For now, the current solution is sufficient.
4481*/
#define FORMATBUFLEN ((size_t)120)
4483
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484PyObject *PyUnicode_Format(PyObject *format,
4485 PyObject *args)
4486{
4487 Py_UNICODE *fmt, *res;
4488 int fmtcnt, rescnt, reslen, arglen, argidx;
4489 int args_owned = 0;
4490 PyUnicodeObject *result = NULL;
4491 PyObject *dict = NULL;
4492 PyObject *uformat;
4493
4494 if (format == NULL || args == NULL) {
4495 PyErr_BadInternalCall();
4496 return NULL;
4497 }
4498 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004499 if (uformat == NULL)
4500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 fmt = PyUnicode_AS_UNICODE(uformat);
4502 fmtcnt = PyUnicode_GET_SIZE(uformat);
4503
4504 reslen = rescnt = fmtcnt + 100;
4505 result = _PyUnicode_New(reslen);
4506 if (result == NULL)
4507 goto onError;
4508 res = PyUnicode_AS_UNICODE(result);
4509
4510 if (PyTuple_Check(args)) {
4511 arglen = PyTuple_Size(args);
4512 argidx = 0;
4513 }
4514 else {
4515 arglen = -1;
4516 argidx = -2;
4517 }
4518 if (args->ob_type->tp_as_mapping)
4519 dict = args;
4520
4521 while (--fmtcnt >= 0) {
4522 if (*fmt != '%') {
4523 if (--rescnt < 0) {
4524 rescnt = fmtcnt + 100;
4525 reslen += rescnt;
4526 if (_PyUnicode_Resize(result, reslen) < 0)
4527 return NULL;
4528 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4529 --rescnt;
4530 }
4531 *res++ = *fmt++;
4532 }
4533 else {
4534 /* Got a format specifier */
4535 int flags = 0;
4536 int width = -1;
4537 int prec = -1;
4538 int size = 0;
4539 Py_UNICODE c = '\0';
4540 Py_UNICODE fill;
4541 PyObject *v = NULL;
4542 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004543 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 Py_UNICODE sign;
4545 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004546 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547
4548 fmt++;
4549 if (*fmt == '(') {
4550 Py_UNICODE *keystart;
4551 int keylen;
4552 PyObject *key;
4553 int pcount = 1;
4554
4555 if (dict == NULL) {
4556 PyErr_SetString(PyExc_TypeError,
4557 "format requires a mapping");
4558 goto onError;
4559 }
4560 ++fmt;
4561 --fmtcnt;
4562 keystart = fmt;
4563 /* Skip over balanced parentheses */
4564 while (pcount > 0 && --fmtcnt >= 0) {
4565 if (*fmt == ')')
4566 --pcount;
4567 else if (*fmt == '(')
4568 ++pcount;
4569 fmt++;
4570 }
4571 keylen = fmt - keystart - 1;
4572 if (fmtcnt < 0 || pcount > 0) {
4573 PyErr_SetString(PyExc_ValueError,
4574 "incomplete format key");
4575 goto onError;
4576 }
Fred Drakee4315f52000-05-09 19:53:39 +00004577 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 then looked up since Python uses strings to hold
4579 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004580 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581 key = PyUnicode_EncodeUTF8(keystart,
4582 keylen,
4583 NULL);
4584 if (key == NULL)
4585 goto onError;
4586 if (args_owned) {
4587 Py_DECREF(args);
4588 args_owned = 0;
4589 }
4590 args = PyObject_GetItem(dict, key);
4591 Py_DECREF(key);
4592 if (args == NULL) {
4593 goto onError;
4594 }
4595 args_owned = 1;
4596 arglen = -1;
4597 argidx = -2;
4598 }
4599 while (--fmtcnt >= 0) {
4600 switch (c = *fmt++) {
4601 case '-': flags |= F_LJUST; continue;
4602 case '+': flags |= F_SIGN; continue;
4603 case ' ': flags |= F_BLANK; continue;
4604 case '#': flags |= F_ALT; continue;
4605 case '0': flags |= F_ZERO; continue;
4606 }
4607 break;
4608 }
4609 if (c == '*') {
4610 v = getnextarg(args, arglen, &argidx);
4611 if (v == NULL)
4612 goto onError;
4613 if (!PyInt_Check(v)) {
4614 PyErr_SetString(PyExc_TypeError,
4615 "* wants int");
4616 goto onError;
4617 }
4618 width = PyInt_AsLong(v);
4619 if (width < 0) {
4620 flags |= F_LJUST;
4621 width = -width;
4622 }
4623 if (--fmtcnt >= 0)
4624 c = *fmt++;
4625 }
4626 else if (c >= '0' && c <= '9') {
4627 width = c - '0';
4628 while (--fmtcnt >= 0) {
4629 c = *fmt++;
4630 if (c < '0' || c > '9')
4631 break;
4632 if ((width*10) / 10 != width) {
4633 PyErr_SetString(PyExc_ValueError,
4634 "width too big");
4635 goto onError;
4636 }
4637 width = width*10 + (c - '0');
4638 }
4639 }
4640 if (c == '.') {
4641 prec = 0;
4642 if (--fmtcnt >= 0)
4643 c = *fmt++;
4644 if (c == '*') {
4645 v = getnextarg(args, arglen, &argidx);
4646 if (v == NULL)
4647 goto onError;
4648 if (!PyInt_Check(v)) {
4649 PyErr_SetString(PyExc_TypeError,
4650 "* wants int");
4651 goto onError;
4652 }
4653 prec = PyInt_AsLong(v);
4654 if (prec < 0)
4655 prec = 0;
4656 if (--fmtcnt >= 0)
4657 c = *fmt++;
4658 }
4659 else if (c >= '0' && c <= '9') {
4660 prec = c - '0';
4661 while (--fmtcnt >= 0) {
4662 c = Py_CHARMASK(*fmt++);
4663 if (c < '0' || c > '9')
4664 break;
4665 if ((prec*10) / 10 != prec) {
4666 PyErr_SetString(PyExc_ValueError,
4667 "prec too big");
4668 goto onError;
4669 }
4670 prec = prec*10 + (c - '0');
4671 }
4672 }
4673 } /* prec */
4674 if (fmtcnt >= 0) {
4675 if (c == 'h' || c == 'l' || c == 'L') {
4676 size = c;
4677 if (--fmtcnt >= 0)
4678 c = *fmt++;
4679 }
4680 }
4681 if (fmtcnt < 0) {
4682 PyErr_SetString(PyExc_ValueError,
4683 "incomplete format");
4684 goto onError;
4685 }
4686 if (c != '%') {
4687 v = getnextarg(args, arglen, &argidx);
4688 if (v == NULL)
4689 goto onError;
4690 }
4691 sign = 0;
4692 fill = ' ';
4693 switch (c) {
4694
4695 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004696 pbuf = formatbuf;
4697 /* presume that buffer length is at least 1 */
4698 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699 len = 1;
4700 break;
4701
4702 case 's':
4703 case 'r':
4704 if (PyUnicode_Check(v) && c == 's') {
4705 temp = v;
4706 Py_INCREF(temp);
4707 }
4708 else {
4709 PyObject *unicode;
4710 if (c == 's')
4711 temp = PyObject_Str(v);
4712 else
4713 temp = PyObject_Repr(v);
4714 if (temp == NULL)
4715 goto onError;
4716 if (!PyString_Check(temp)) {
4717 /* XXX Note: this should never happen, since
4718 PyObject_Repr() and PyObject_Str() assure
4719 this */
4720 Py_DECREF(temp);
4721 PyErr_SetString(PyExc_TypeError,
4722 "%s argument has non-string str()");
4723 goto onError;
4724 }
Fred Drakee4315f52000-05-09 19:53:39 +00004725 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004727 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 "strict");
4729 Py_DECREF(temp);
4730 temp = unicode;
4731 if (temp == NULL)
4732 goto onError;
4733 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004734 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 len = PyUnicode_GET_SIZE(temp);
4736 if (prec >= 0 && len > prec)
4737 len = prec;
4738 break;
4739
4740 case 'i':
4741 case 'd':
4742 case 'u':
4743 case 'o':
4744 case 'x':
4745 case 'X':
4746 if (c == 'i')
4747 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004748 pbuf = formatbuf;
4749 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4750 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751 if (len < 0)
4752 goto onError;
4753 sign = (c == 'd');
4754 if (flags & F_ZERO) {
4755 fill = '0';
4756 if ((flags&F_ALT) &&
4757 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004758 pbuf[0] == '0' && pbuf[1] == c) {
4759 *res++ = *pbuf++;
4760 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 rescnt -= 2;
4762 len -= 2;
4763 width -= 2;
4764 if (width < 0)
4765 width = 0;
4766 }
4767 }
4768 break;
4769
4770 case 'e':
4771 case 'E':
4772 case 'f':
4773 case 'g':
4774 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004775 pbuf = formatbuf;
4776 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4777 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 if (len < 0)
4779 goto onError;
4780 sign = 1;
4781 if (flags&F_ZERO)
4782 fill = '0';
4783 break;
4784
4785 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004786 pbuf = formatbuf;
4787 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 if (len < 0)
4789 goto onError;
4790 break;
4791
4792 default:
4793 PyErr_Format(PyExc_ValueError,
4794 "unsupported format character '%c' (0x%x)",
4795 c, c);
4796 goto onError;
4797 }
4798 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004799 if (*pbuf == '-' || *pbuf == '+') {
4800 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 len--;
4802 }
4803 else if (flags & F_SIGN)
4804 sign = '+';
4805 else if (flags & F_BLANK)
4806 sign = ' ';
4807 else
4808 sign = 0;
4809 }
4810 if (width < len)
4811 width = len;
4812 if (rescnt < width + (sign != 0)) {
4813 reslen -= rescnt;
4814 rescnt = width + fmtcnt + 100;
4815 reslen += rescnt;
4816 if (_PyUnicode_Resize(result, reslen) < 0)
4817 return NULL;
4818 res = PyUnicode_AS_UNICODE(result)
4819 + reslen - rescnt;
4820 }
4821 if (sign) {
4822 if (fill != ' ')
4823 *res++ = sign;
4824 rescnt--;
4825 if (width > len)
4826 width--;
4827 }
4828 if (width > len && !(flags & F_LJUST)) {
4829 do {
4830 --rescnt;
4831 *res++ = fill;
4832 } while (--width > len);
4833 }
4834 if (sign && fill == ' ')
4835 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004836 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 res += len;
4838 rescnt -= len;
4839 while (--width >= len) {
4840 --rescnt;
4841 *res++ = ' ';
4842 }
4843 if (dict && (argidx < arglen) && c != '%') {
4844 PyErr_SetString(PyExc_TypeError,
4845 "not all arguments converted");
4846 goto onError;
4847 }
4848 Py_XDECREF(temp);
4849 } /* '%' */
4850 } /* until end */
4851 if (argidx < arglen && !dict) {
4852 PyErr_SetString(PyExc_TypeError,
4853 "not all arguments converted");
4854 goto onError;
4855 }
4856
4857 if (args_owned) {
4858 Py_DECREF(args);
4859 }
4860 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004861 if (_PyUnicode_Resize(result, reslen - rescnt))
4862 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 return (PyObject *)result;
4864
4865 onError:
4866 Py_XDECREF(result);
4867 Py_DECREF(uformat);
4868 if (args_owned) {
4869 Py_DECREF(args);
4870 }
4871 return NULL;
4872}
4873
/* Buffer interface slots of the unicode type; this table is installed
   in the tp_as_buffer slot of PyUnicode_Type.  The initializer is
   positional, so the order of the entries must not be changed. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,   /* read buffer */
    (getwritebufferproc) unicode_buffer_getwritebuf, /* write buffer */
    (getsegcountproc) unicode_buffer_getsegcount,    /* segment count */
    (getcharbufferproc) unicode_buffer_getcharbuf,   /* character buffer */
};
4880
/* Type object of the unicode type.  The initializer is positional: each
   entry fills the slot named in the comment next to it, and slots set
   to 0 / NULL are not provided by this type. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_size */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
4905
4906/* Initialize the Unicode implementation */
4907
4908void _PyUnicode_Init()
4909{
4910 /* Doublecheck the configuration... */
4911 if (sizeof(Py_UNICODE) != 2)
4912 Py_FatalError("Unicode configuration error: "
4913 "sizeof(Py_UNICODE) != 2 bytes");
4914
Fred Drakee4315f52000-05-09 19:53:39 +00004915 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004916 unicode_freelist = NULL;
4917 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00004919 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920}
4921
4922/* Finalize the Unicode implementation */
4923
4924void
4925_PyUnicode_Fini()
4926{
4927 PyUnicodeObject *u = unicode_freelist;
4928
4929 while (u != NULL) {
4930 PyUnicodeObject *v = u;
4931 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004932 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00004933 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004934 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00004935 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004937 unicode_freelist = NULL;
4938 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004940 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941}