blob: 7c35f1c98f710f16ffada0b249d7ac9b8c759495 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000069#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71#if defined(HAVE_LIMITS_H)
72#include <limits.h>
73#else
74#define INT_MAX 2147483647
75#endif
76
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000077#ifdef MS_WIN32
78#include <windows.h>
79#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000080
Guido van Rossumd57fd912000-03-10 22:53:23 +000081/* Limit for the Unicode object free list */
82
83#define MAX_UNICODE_FREELIST_SIZE 1024
84
85/* Limit for the Unicode object free list stay alive optimization.
86
87 The implementation will keep allocated Unicode memory intact for
88 all objects on the free list having a size less than this
89 limit. This reduces malloc() overhead for small Unicode objects.
90
Barry Warsaw51ac5802000-03-20 16:36:48 +000091 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000092 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000093 malloc()-overhead) bytes of unused garbage.
94
95 Setting the limit to 0 effectively turns the feature off.
96
Guido van Rossumfd4b9572000-04-10 13:51:10 +000097 Note: This is an experimental feature ! If you get core dumps when
98 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
100*/
101
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000102#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103
104/* Endianness switches; defaults to little endian */
105
106#ifdef WORDS_BIGENDIAN
107# define BYTEORDER_IS_BIG_ENDIAN
108#else
109# define BYTEORDER_IS_LITTLE_ENDIAN
110#endif
111
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000112/* --- Globals ------------------------------------------------------------
113
114 The globals are initialized by the _PyUnicode_Init() API and should
115 not be used before calling that API.
116
117*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
119/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000120static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000121
122/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000123static PyUnicodeObject *unicode_freelist;
124static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125
Fred Drakee4315f52000-05-09 19:53:39 +0000126/* Default encoding to use and assume when NULL is passed as encoding
127 parameter; it is initialized by _PyUnicode_Init().
128
129 Always use the PyUnicode_SetDefaultEncoding() and
130 PyUnicode_GetDefaultEncoding() APIs to access this global.
131
132*/
133
134static char unicode_default_encoding[100];
135
Guido van Rossumd57fd912000-03-10 22:53:23 +0000136/* --- Unicode Object ----------------------------------------------------- */
137
138static
139int _PyUnicode_Resize(register PyUnicodeObject *unicode,
140 int length)
141{
142 void *oldstr;
143
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000144 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000146 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147
148 /* Resizing unicode_empty is not allowed. */
149 if (unicode == unicode_empty) {
150 PyErr_SetString(PyExc_SystemError,
151 "can't resize empty unicode object");
152 return -1;
153 }
154
155 /* We allocate one more byte to make sure the string is
156 Ux0000 terminated -- XXX is this needed ? */
157 oldstr = unicode->str;
158 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
159 if (!unicode->str) {
160 unicode->str = oldstr;
161 PyErr_NoMemory();
162 return -1;
163 }
164 unicode->str[length] = 0;
165 unicode->length = length;
166
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000167 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 /* Reset the object caches */
169 if (unicode->utf8str) {
170 Py_DECREF(unicode->utf8str);
171 unicode->utf8str = NULL;
172 }
173 unicode->hash = -1;
174
175 return 0;
176}
177
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178int PyUnicode_Resize(PyObject **unicode,
179 int length)
180{
181 PyUnicodeObject *v;
182
183 if (unicode == NULL) {
184 PyErr_BadInternalCall();
185 return -1;
186 }
187 v = (PyUnicodeObject *)*unicode;
188 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
189 PyErr_BadInternalCall();
190 return -1;
191 }
192 return _PyUnicode_Resize(v, length);
193}
194
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195/* We allocate one more byte to make sure the string is
196 Ux0000 terminated -- XXX is this needed ?
197
198 XXX This allocator could further be enhanced by assuring that the
199 free list never reduces its size below 1.
200
201*/
202
203static
204PyUnicodeObject *_PyUnicode_New(int length)
205{
206 register PyUnicodeObject *unicode;
207
208 /* Optimization for empty strings */
209 if (length == 0 && unicode_empty != NULL) {
210 Py_INCREF(unicode_empty);
211 return unicode_empty;
212 }
213
214 /* Unicode freelist & memory allocation */
215 if (unicode_freelist) {
216 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000217 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 /* Keep-Alive optimization: we only upsize the buffer,
221 never downsize it. */
222 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000224 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 }
227 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000228 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000230 }
231 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
234 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
235 if (unicode == NULL)
236 return NULL;
237 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
238 }
239
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000240 if (!unicode->str) {
241 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 unicode->str[length] = 0;
245 unicode->length = length;
246 unicode->hash = -1;
247 unicode->utf8str = NULL;
248 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000249
250 onError:
251 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254}
255
256static
257void _PyUnicode_Free(register PyUnicodeObject *unicode)
258{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 /* Keep-Alive optimization */
261 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000262 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = NULL;
264 unicode->length = 0;
265 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 if (unicode->utf8str) {
267 Py_DECREF(unicode->utf8str);
268 unicode->utf8str = NULL;
269 }
270 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 *(PyUnicodeObject **)unicode = unicode_freelist;
272 unicode_freelist = unicode;
273 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000276 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000277 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000278 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280}
281
282PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
283 int size)
284{
285 PyUnicodeObject *unicode;
286
287 unicode = _PyUnicode_New(size);
288 if (!unicode)
289 return NULL;
290
291 /* Copy the Unicode data into the new object */
292 if (u != NULL)
293 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
294
295 return (PyObject *)unicode;
296}
297
298#ifdef HAVE_WCHAR_H
299
300PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
301 int size)
302{
303 PyUnicodeObject *unicode;
304
305 if (w == NULL) {
306 PyErr_BadInternalCall();
307 return NULL;
308 }
309
310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the wchar_t data into the new object */
315#ifdef HAVE_USABLE_WCHAR_T
316 memcpy(unicode->str, w, size * sizeof(wchar_t));
317#else
318 {
319 register Py_UNICODE *u;
320 register int i;
321 u = PyUnicode_AS_UNICODE(unicode);
322 for (i = size; i >= 0; i--)
323 *u++ = *w++;
324 }
325#endif
326
327 return (PyObject *)unicode;
328}
329
330int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
331 register wchar_t *w,
332 int size)
333{
334 if (unicode == NULL) {
335 PyErr_BadInternalCall();
336 return -1;
337 }
338 if (size > PyUnicode_GET_SIZE(unicode))
339 size = PyUnicode_GET_SIZE(unicode);
340#ifdef HAVE_USABLE_WCHAR_T
341 memcpy(w, unicode->str, size * sizeof(wchar_t));
342#else
343 {
344 register Py_UNICODE *u;
345 register int i;
346 u = PyUnicode_AS_UNICODE(unicode);
347 for (i = size; i >= 0; i--)
348 *w++ = *u++;
349 }
350#endif
351
352 return size;
353}
354
355#endif
356
357PyObject *PyUnicode_FromObject(register PyObject *obj)
358{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000359 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
360}
361
362PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
363 const char *encoding,
364 const char *errors)
365{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 const char *s;
367 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368 int owned = 0;
369 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370
371 if (obj == NULL) {
372 PyErr_BadInternalCall();
373 return NULL;
374 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000375
376 /* Coerce object */
377 if (PyInstance_Check(obj)) {
378 PyObject *func;
379 func = PyObject_GetAttrString(obj, "__str__");
380 if (func == NULL) {
381 PyErr_SetString(PyExc_TypeError,
382 "coercing to Unicode: instance doesn't define __str__");
383 return NULL;
384 }
385 obj = PyEval_CallObject(func, NULL);
386 Py_DECREF(func);
387 if (obj == NULL)
388 return NULL;
389 owned = 1;
390 }
391 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 v = obj;
394 if (encoding) {
395 PyErr_SetString(PyExc_TypeError,
396 "decoding Unicode is not supported");
397 return NULL;
398 }
399 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 }
401 else if (PyString_Check(obj)) {
402 s = PyString_AS_STRING(obj);
403 len = PyString_GET_SIZE(obj);
404 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000405 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
406 /* Overwrite the error message with something more useful in
407 case of a TypeError. */
408 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000409 PyErr_Format(PyExc_TypeError,
410 "coercing to Unicode: need string or buffer, "
411 "%.80s found",
412 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000413 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000414 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000415
416 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000417 if (len == 0) {
418 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000419 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000421 else
422 v = PyUnicode_Decode(s, len, encoding, errors);
423 done:
424 if (owned)
425 Py_DECREF(obj);
426 return v;
427
428 onError:
429 if (owned)
430 Py_DECREF(obj);
431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432}
433
434PyObject *PyUnicode_Decode(const char *s,
435 int size,
436 const char *encoding,
437 const char *errors)
438{
439 PyObject *buffer = NULL, *unicode;
440
Fred Drakee4315f52000-05-09 19:53:39 +0000441 if (encoding == NULL)
442 encoding = PyUnicode_GetDefaultEncoding();
443
444 /* Shortcuts for common default encodings */
445 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000447 else if (strcmp(encoding, "latin-1") == 0)
448 return PyUnicode_DecodeLatin1(s, size, errors);
449 else if (strcmp(encoding, "ascii") == 0)
450 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451
452 /* Decode via the codec registry */
453 buffer = PyBuffer_FromMemory((void *)s, size);
454 if (buffer == NULL)
455 goto onError;
456 unicode = PyCodec_Decode(buffer, encoding, errors);
457 if (unicode == NULL)
458 goto onError;
459 if (!PyUnicode_Check(unicode)) {
460 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000461 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462 unicode->ob_type->tp_name);
463 Py_DECREF(unicode);
464 goto onError;
465 }
466 Py_DECREF(buffer);
467 return unicode;
468
469 onError:
470 Py_XDECREF(buffer);
471 return NULL;
472}
473
474PyObject *PyUnicode_Encode(const Py_UNICODE *s,
475 int size,
476 const char *encoding,
477 const char *errors)
478{
479 PyObject *v, *unicode;
480
481 unicode = PyUnicode_FromUnicode(s, size);
482 if (unicode == NULL)
483 return NULL;
484 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
485 Py_DECREF(unicode);
486 return v;
487}
488
489PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
490 const char *encoding,
491 const char *errors)
492{
493 PyObject *v;
494
495 if (!PyUnicode_Check(unicode)) {
496 PyErr_BadArgument();
497 goto onError;
498 }
Fred Drakee4315f52000-05-09 19:53:39 +0000499
500 if (encoding == NULL)
501 encoding = PyUnicode_GetDefaultEncoding();
502
503 /* Shortcuts for common default encodings */
504 if (errors == NULL) {
505 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000507 else if (strcmp(encoding, "latin-1") == 0)
508 return PyUnicode_AsLatin1String(unicode);
509 else if (strcmp(encoding, "ascii") == 0)
510 return PyUnicode_AsASCIIString(unicode);
511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512
513 /* Encode via the codec registry */
514 v = PyCodec_Encode(unicode, encoding, errors);
515 if (v == NULL)
516 goto onError;
517 /* XXX Should we really enforce this ? */
518 if (!PyString_Check(v)) {
519 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000520 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 v->ob_type->tp_name);
522 Py_DECREF(v);
523 goto onError;
524 }
525 return v;
526
527 onError:
528 return NULL;
529}
530
531Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
532{
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
536 }
537 return PyUnicode_AS_UNICODE(unicode);
538
539 onError:
540 return NULL;
541}
542
543int PyUnicode_GetSize(PyObject *unicode)
544{
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_BadArgument();
547 goto onError;
548 }
549 return PyUnicode_GET_SIZE(unicode);
550
551 onError:
552 return -1;
553}
554
Fred Drakee4315f52000-05-09 19:53:39 +0000555const char *PyUnicode_GetDefaultEncoding()
556{
557 return unicode_default_encoding;
558}
559
560int PyUnicode_SetDefaultEncoding(const char *encoding)
561{
562 PyObject *v;
563
564 /* Make sure the encoding is valid. As side effect, this also
565 loads the encoding into the codec registry cache. */
566 v = _PyCodec_Lookup(encoding);
567 if (v == NULL)
568 goto onError;
569 Py_DECREF(v);
570 strncpy(unicode_default_encoding,
571 encoding,
572 sizeof(unicode_default_encoding));
573 return 0;
574
575 onError:
576 return -1;
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579/* --- UTF-8 Codec -------------------------------------------------------- */
580
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero marks a byte that cannot start a sequence (continuation bytes
   0x80..0xBF and the bytes 0xFE/0xFF).  See RFC 2279 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
602
603static
604int utf8_decoding_error(const char **source,
605 Py_UNICODE **dest,
606 const char *errors,
607 const char *details)
608{
609 if ((errors == NULL) ||
610 (strcmp(errors,"strict") == 0)) {
611 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000612 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613 details);
614 return -1;
615 }
616 else if (strcmp(errors,"ignore") == 0) {
617 (*source)++;
618 return 0;
619 }
620 else if (strcmp(errors,"replace") == 0) {
621 (*source)++;
622 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
623 (*dest)++;
624 return 0;
625 }
626 else {
627 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000628 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 errors);
630 return -1;
631 }
632}
633
634#define UTF8_ERROR(details) do { \
635 if (utf8_decoding_error(&s, &p, errors, details)) \
636 goto onError; \
Marc-André Lemburgfb625842000-07-16 13:29:13 +0000637 goto nextChar; \
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638} while (0)
639
640PyObject *PyUnicode_DecodeUTF8(const char *s,
641 int size,
642 const char *errors)
643{
644 int n;
645 const char *e;
646 PyUnicodeObject *unicode;
647 Py_UNICODE *p;
648
649 /* Note: size will always be longer than the resulting Unicode
650 character count */
651 unicode = _PyUnicode_New(size);
652 if (!unicode)
653 return NULL;
654 if (size == 0)
655 return (PyObject *)unicode;
656
657 /* Unpack UTF-8 encoded data */
658 p = unicode->str;
659 e = s + size;
660
661 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000662 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663
664 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000665 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 s++;
667 continue;
668 }
669
670 n = utf8_code_length[ch];
671
672 if (s + n > e)
673 UTF8_ERROR("unexpected end of data");
674
675 switch (n) {
676
677 case 0:
678 UTF8_ERROR("unexpected code byte");
679 break;
680
681 case 1:
682 UTF8_ERROR("internal error");
683 break;
684
685 case 2:
686 if ((s[1] & 0xc0) != 0x80)
687 UTF8_ERROR("invalid data");
688 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
689 if (ch < 0x80)
690 UTF8_ERROR("illegal encoding");
691 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 break;
694
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
697 (s[2] & 0xc0) != 0x80)
698 UTF8_ERROR("invalid data");
699 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
700 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
701 UTF8_ERROR("illegal encoding");
702 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000703 *p++ = (Py_UNICODE)ch;
704 break;
705
706 case 4:
707 if ((s[1] & 0xc0) != 0x80 ||
708 (s[2] & 0xc0) != 0x80 ||
709 (s[3] & 0xc0) != 0x80)
710 UTF8_ERROR("invalid data");
711 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
712 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
713 /* validate and convert to UTF-16 */
714 if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
715 (ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
716 UTF8_ERROR("illegal encoding");
717 /* compute and append the two surrogates: */
718
719 /* translate from 10000..10FFFF to 0..FFFF */
720 ch -= 0x10000;
721
722 /* high surrogate = top 10 bits added to D800 */
723 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
724
725 /* low surrogate = bottom 10 bits added to DC00 */
726 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 break;
728
729 default:
730 /* Other sizes are only needed for UCS-4 */
731 UTF8_ERROR("unsupported Unicode code range");
732 }
733 s += n;
Marc-André Lemburgfb625842000-07-16 13:29:13 +0000734nextChar:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 }
736
737 /* Adjust length */
738 if (_PyUnicode_Resize(unicode, p - unicode->str))
739 goto onError;
740
741 return (PyObject *)unicode;
742
743onError:
744 Py_DECREF(unicode);
745 return NULL;
746}
747
748#undef UTF8_ERROR
749
750static
751int utf8_encoding_error(const Py_UNICODE **source,
752 char **dest,
753 const char *errors,
754 const char *details)
755{
756 if ((errors == NULL) ||
757 (strcmp(errors,"strict") == 0)) {
758 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000759 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 details);
761 return -1;
762 }
763 else if (strcmp(errors,"ignore") == 0) {
764 return 0;
765 }
766 else if (strcmp(errors,"replace") == 0) {
767 **dest = '?';
768 (*dest)++;
769 return 0;
770 }
771 else {
772 PyErr_Format(PyExc_ValueError,
773 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000774 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775 errors);
776 return -1;
777 }
778}
779
780PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
781 int size,
782 const char *errors)
783{
784 PyObject *v;
785 char *p;
786 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000787 Py_UCS4 ch2;
788 unsigned int cbAllocated = 3 * size;
789 unsigned int cbWritten = 0;
790 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000792 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793 if (v == NULL)
794 return NULL;
795 if (size == 0)
796 goto done;
797
798 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000799 while (i < size) {
800 Py_UCS4 ch = s[i++];
801 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000803 cbWritten++;
804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805 else if (ch < 0x0800) {
806 *p++ = 0xc0 | (ch >> 6);
807 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000808 cbWritten += 2;
809 }
810 else {
811 /* Check for high surrogate */
812 if (0xD800 <= ch && ch <= 0xDBFF) {
813 if (i != size) {
814 ch2 = s[i];
815 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
816
817 if (cbWritten >= (cbAllocated - 4)) {
818 /* Provide enough room for some more
819 surrogates */
820 cbAllocated += 4*10;
821 if (_PyString_Resize(&v, cbAllocated))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000823 }
824
825 /* combine the two values */
826 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
827
828 *p++ = (char)((ch >> 18) | 0xf0);
829 *p++ = (char)(0x80 | (ch >> 12) & 0x3f);
830 i++;
831 cbWritten += 4;
832 }
833 }
834 }
835 else {
836 *p++ = (char)(0xe0 | (ch >> 12));
837 cbWritten += 3;
838 }
839 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
840 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000841 }
842 }
843 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000844 if (_PyString_Resize(&v, p - q))
845 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000846
847 done:
848 return v;
849
850 onError:
851 Py_DECREF(v);
852 return NULL;
853}
854
855/* Return a Python string holding the UTF-8 encoded value of the
856 Unicode object.
857
858 The resulting string is cached in the Unicode object for subsequent
859 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000860 the character buffer interface and will live (at least) as long as
861 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862
863 The refcount of the string is *not* incremented.
864
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000865 *** Exported for internal use by the interpreter only !!! ***
866
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867*/
868
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000869PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +0000870 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000872 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873
874 if (v)
875 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000876 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
877 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878 errors);
879 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000880 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881 return v;
882}
883
884PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
885{
886 PyObject *str;
887
888 if (!PyUnicode_Check(unicode)) {
889 PyErr_BadArgument();
890 return NULL;
891 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000892 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 if (str == NULL)
894 return NULL;
895 Py_INCREF(str);
896 return str;
897}
898
899/* --- UTF-16 Codec ------------------------------------------------------- */
900
901static
902int utf16_decoding_error(const Py_UNICODE **source,
903 Py_UNICODE **dest,
904 const char *errors,
905 const char *details)
906{
907 if ((errors == NULL) ||
908 (strcmp(errors,"strict") == 0)) {
909 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000910 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 details);
912 return -1;
913 }
914 else if (strcmp(errors,"ignore") == 0) {
915 return 0;
916 }
917 else if (strcmp(errors,"replace") == 0) {
918 if (dest) {
919 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
920 (*dest)++;
921 }
922 return 0;
923 }
924 else {
925 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000926 "UTF-16 decoding error; "
927 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000928 errors);
929 return -1;
930 }
931}
932
933#define UTF16_ERROR(details) do { \
934 if (utf16_decoding_error(&q, &p, errors, details)) \
935 goto onError; \
936 continue; \
937} while(0)
938
939PyObject *PyUnicode_DecodeUTF16(const char *s,
940 int size,
941 const char *errors,
942 int *byteorder)
943{
944 PyUnicodeObject *unicode;
945 Py_UNICODE *p;
946 const Py_UNICODE *q, *e;
947 int bo = 0;
948
949 /* size should be an even number */
950 if (size % sizeof(Py_UNICODE) != 0) {
951 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
952 return NULL;
953 /* The remaining input chars are ignored if we fall through
954 here... */
955 }
956
957 /* Note: size will always be longer than the resulting Unicode
958 character count */
959 unicode = _PyUnicode_New(size);
960 if (!unicode)
961 return NULL;
962 if (size == 0)
963 return (PyObject *)unicode;
964
965 /* Unpack UTF-16 encoded data */
966 p = unicode->str;
967 q = (Py_UNICODE *)s;
968 e = q + (size / sizeof(Py_UNICODE));
969
970 if (byteorder)
971 bo = *byteorder;
972
973 while (q < e) {
974 register Py_UNICODE ch = *q++;
975
976 /* Check for BOM marks (U+FEFF) in the input and adjust
977 current byte order setting accordingly. Swap input
978 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
979 !) */
980#ifdef BYTEORDER_IS_LITTLE_ENDIAN
981 if (ch == 0xFEFF) {
982 bo = -1;
983 continue;
984 } else if (ch == 0xFFFE) {
985 bo = 1;
986 continue;
987 }
988 if (bo == 1)
989 ch = (ch >> 8) | (ch << 8);
990#else
991 if (ch == 0xFEFF) {
992 bo = 1;
993 continue;
994 } else if (ch == 0xFFFE) {
995 bo = -1;
996 continue;
997 }
998 if (bo == -1)
999 ch = (ch >> 8) | (ch << 8);
1000#endif
1001 if (ch < 0xD800 || ch > 0xDFFF) {
1002 *p++ = ch;
1003 continue;
1004 }
1005
1006 /* UTF-16 code pair: */
1007 if (q >= e)
1008 UTF16_ERROR("unexpected end of data");
1009 if (0xDC00 <= *q && *q <= 0xDFFF) {
1010 q++;
1011 if (0xD800 <= *q && *q <= 0xDBFF)
1012 /* This is valid data (a UTF-16 surrogate pair), but
1013 we are not able to store this information since our
1014 Py_UNICODE type only has 16 bits... this might
1015 change someday, even though it's unlikely. */
1016 UTF16_ERROR("code pairs are not supported");
1017 else
1018 continue;
1019 }
1020 UTF16_ERROR("illegal encoding");
1021 }
1022
1023 if (byteorder)
1024 *byteorder = bo;
1025
1026 /* Adjust length */
1027 if (_PyUnicode_Resize(unicode, p - unicode->str))
1028 goto onError;
1029
1030 return (PyObject *)unicode;
1031
1032onError:
1033 Py_DECREF(unicode);
1034 return NULL;
1035}
1036
1037#undef UTF16_ERROR
1038
1039PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1040 int size,
1041 const char *errors,
1042 int byteorder)
1043{
1044 PyObject *v;
1045 Py_UNICODE *p;
1046 char *q;
1047
1048 /* We don't create UTF-16 pairs... */
1049 v = PyString_FromStringAndSize(NULL,
1050 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1051 if (v == NULL)
1052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
1054 q = PyString_AS_STRING(v);
1055 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 if (byteorder == 0)
1057 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001058 if (size == 0)
1059 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 if (byteorder == 0 ||
1061#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1062 byteorder == -1
1063#else
1064 byteorder == 1
1065#endif
1066 )
1067 memcpy(p, s, size * sizeof(Py_UNICODE));
1068 else
1069 while (size-- > 0) {
1070 Py_UNICODE ch = *s++;
1071 *p++ = (ch >> 8) | (ch << 8);
1072 }
1073 done:
1074 return v;
1075}
1076
1077PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1078{
1079 if (!PyUnicode_Check(unicode)) {
1080 PyErr_BadArgument();
1081 return NULL;
1082 }
1083 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1084 PyUnicode_GET_SIZE(unicode),
1085 NULL,
1086 0);
1087}
1088
1089/* --- Unicode Escape Codec ----------------------------------------------- */
1090
1091static
1092int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001093 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 const char *errors,
1095 const char *details)
1096{
1097 if ((errors == NULL) ||
1098 (strcmp(errors,"strict") == 0)) {
1099 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001100 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 details);
1102 return -1;
1103 }
1104 else if (strcmp(errors,"ignore") == 0) {
1105 return 0;
1106 }
1107 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001108 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 return 0;
1110 }
1111 else {
1112 PyErr_Format(PyExc_ValueError,
1113 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001114 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 errors);
1116 return -1;
1117 }
1118}
1119
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001120static _Py_UCNHashAPI *pucnHash = NULL;
1121
/* Case-insensitive comparison of at most `count` characters of s1 and s2.

   Returns 0 if the compared prefixes are equal ignoring case, otherwise
   the difference of the first pair of lower-cased characters that differ
   (as unsigned char values).  Comparison stops at the first NUL in either
   string, so neither string is read beyond its terminator. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    unsigned char c1 = 0, c2 = 0;

    while (count-- > 0) {
        /* tolower() is only defined for unsigned char values and EOF. */
        c1 = (unsigned char)tolower((unsigned char)*s1++);
        c2 = (unsigned char)tolower((unsigned char)*s2++);
        if (c1 != c2 || c1 == '\0')
            break;
    }

    return (int)c1 - (int)c2;
}
1141
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1143 int size,
1144 const char *errors)
1145{
1146 PyUnicodeObject *v;
1147 Py_UNICODE *p = NULL, *buf = NULL;
1148 const char *end;
1149
1150 /* Escaped strings will always be longer than the resulting
1151 Unicode string, so we start with size here and then reduce the
1152 length after conversion to the true value. */
1153 v = _PyUnicode_New(size);
1154 if (v == NULL)
1155 goto onError;
1156 if (size == 0)
1157 return (PyObject *)v;
1158 p = buf = PyUnicode_AS_UNICODE(v);
1159 end = s + size;
1160 while (s < end) {
1161 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001162 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 int i;
1164
1165 /* Non-escape characters are interpreted as Unicode ordinals */
1166 if (*s != '\\') {
1167 *p++ = (unsigned char)*s++;
1168 continue;
1169 }
1170
1171 /* \ - Escapes */
1172 s++;
1173 switch (*s++) {
1174
1175 /* \x escapes */
1176 case '\n': break;
1177 case '\\': *p++ = '\\'; break;
1178 case '\'': *p++ = '\''; break;
1179 case '\"': *p++ = '\"'; break;
1180 case 'b': *p++ = '\b'; break;
1181 case 'f': *p++ = '\014'; break; /* FF */
1182 case 't': *p++ = '\t'; break;
1183 case 'n': *p++ = '\n'; break;
1184 case 'r': *p++ = '\r'; break;
1185 case 'v': *p++ = '\013'; break; /* VT */
1186 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1187
1188 /* \OOO (octal) escapes */
1189 case '0': case '1': case '2': case '3':
1190 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001191 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001193 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001195 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001197 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 break;
1199
1200 /* \xXXXX escape with 0-4 hex digits */
1201 case 'x':
1202 x = 0;
1203 c = (unsigned char)*s;
1204 if (isxdigit(c)) {
1205 do {
1206 x = (x<<4) & ~0xF;
1207 if ('0' <= c && c <= '9')
1208 x += c - '0';
1209 else if ('a' <= c && c <= 'f')
1210 x += 10 + c - 'a';
1211 else
1212 x += 10 + c - 'A';
1213 c = (unsigned char)*++s;
1214 } while (isxdigit(c));
1215 *p++ = x;
1216 } else {
1217 *p++ = '\\';
1218 *p++ = (unsigned char)s[-1];
1219 }
1220 break;
1221
1222 /* \uXXXX with 4 hex digits */
1223 case 'u':
1224 for (x = 0, i = 0; i < 4; i++) {
1225 c = (unsigned char)s[i];
1226 if (!isxdigit(c)) {
1227 if (unicodeescape_decoding_error(&s, &x, errors,
1228 "truncated \\uXXXX"))
1229 goto onError;
1230 i++;
1231 break;
1232 }
1233 x = (x<<4) & ~0xF;
1234 if (c >= '0' && c <= '9')
1235 x += c - '0';
1236 else if (c >= 'a' && c <= 'f')
1237 x += 10 + c - 'a';
1238 else
1239 x += 10 + c - 'A';
1240 }
1241 s += i;
1242 *p++ = x;
1243 break;
1244
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001245 case 'N':
1246 /* Ok, we need to deal with Unicode Character Names now,
1247 * make sure we've imported the hash table data...
1248 */
1249 if (pucnHash == NULL)
1250 {
1251 PyObject *mod = 0, *v = 0;
1252
1253 mod = PyImport_ImportModule("ucnhash");
1254 if (mod == NULL)
1255 goto onError;
1256 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1257 Py_DECREF(mod);
1258 if (v == NULL)
1259 {
1260 goto onError;
1261 }
1262 pucnHash = PyCObject_AsVoidPtr(v);
1263 Py_DECREF(v);
1264 if (pucnHash == NULL)
1265 {
1266 goto onError;
1267 }
1268 }
1269
1270 if (*s == '{')
1271 {
1272 const char *start = s + 1;
1273 const char *endBrace = start;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001274 Py_UCS4 value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001275 unsigned long j;
1276
1277 /* look for either the closing brace, or we
1278 * exceed the maximum length of the unicode character names
1279 */
1280 while (*endBrace != '}' &&
1281 (unsigned int)(endBrace - start) <=
1282 pucnHash->cchMax &&
1283 endBrace < end)
1284 {
1285 endBrace++;
1286 }
1287 if (endBrace != end && *endBrace == '}')
1288 {
1289 j = pucnHash->hash(start, endBrace - start);
1290 if (j > pucnHash->cKeys ||
1291 mystrnicmp(
1292 start,
1293 ((_Py_UnicodeCharacterName *)
1294 (pucnHash->getValue(j)))->pszUCN,
1295 (int)(endBrace - start)) != 0)
1296 {
1297 if (unicodeescape_decoding_error(
1298 &s, &x, errors,
1299 "Invalid Unicode Character Name"))
1300 {
1301 goto onError;
1302 }
1303 goto ucnFallthrough;
1304 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001305 value = ((_Py_UnicodeCharacterName *)
1306 (pucnHash->getValue(j)))->value;
1307 if (value < 1<<16)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001308 {
1309 /* In UCS-2 range, easy solution.. */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001310 *p++ = value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001311 }
1312 else
1313 {
1314 /* Oops, its in UCS-4 space, */
1315 /* compute and append the two surrogates: */
1316 /* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001317 value -= 0x10000;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001318
1319 /* high surrogate = top 10 bits added to D800 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001320 *p++ = 0xD800 + (value >> 10);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001321
1322 /* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 *p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001324 }
1325 s = endBrace + 1;
1326 }
1327 else
1328 {
1329 if (unicodeescape_decoding_error(
1330 &s, &x, errors,
1331 "Unicode name missing closing brace"))
1332 goto onError;
1333 goto ucnFallthrough;
1334 }
1335 break;
1336 }
1337 if (unicodeescape_decoding_error(
1338 &s, &x, errors,
1339 "Missing opening brace for Unicode Character Name escape"))
1340 goto onError;
1341ucnFallthrough:
1342 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001343 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344 *p++ = '\\';
1345 *p++ = (unsigned char)s[-1];
1346 break;
1347 }
1348 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001349 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001350 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351 return (PyObject *)v;
1352
1353 onError:
1354 Py_XDECREF(v);
1355 return NULL;
1356}
1357
1358/* Return a Unicode-Escape string version of the Unicode object.
1359
1360 If quotes is true, the string is enclosed in u"" or u'' quotes as
1361 appropriate.
1362
1363*/
1364
Barry Warsaw51ac5802000-03-20 16:36:48 +00001365static const Py_UNICODE *findchar(const Py_UNICODE *s,
1366 int size,
1367 Py_UNICODE ch);
1368
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369static
1370PyObject *unicodeescape_string(const Py_UNICODE *s,
1371 int size,
1372 int quotes)
1373{
1374 PyObject *repr;
1375 char *p;
1376 char *q;
1377
1378 static const char *hexdigit = "0123456789ABCDEF";
1379
1380 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1381 if (repr == NULL)
1382 return NULL;
1383
1384 p = q = PyString_AS_STRING(repr);
1385
1386 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 *p++ = 'u';
1388 *p++ = (findchar(s, size, '\'') &&
1389 !findchar(s, size, '"')) ? '"' : '\'';
1390 }
1391 while (size-- > 0) {
1392 Py_UNICODE ch = *s++;
1393 /* Escape quotes */
1394 if (quotes && (ch == q[1] || ch == '\\')) {
1395 *p++ = '\\';
1396 *p++ = (char) ch;
1397 }
1398 /* Map 16-bit characters to '\uxxxx' */
1399 else if (ch >= 256) {
1400 *p++ = '\\';
1401 *p++ = 'u';
1402 *p++ = hexdigit[(ch >> 12) & 0xf];
1403 *p++ = hexdigit[(ch >> 8) & 0xf];
1404 *p++ = hexdigit[(ch >> 4) & 0xf];
1405 *p++ = hexdigit[ch & 15];
1406 }
1407 /* Map non-printable US ASCII to '\ooo' */
1408 else if (ch < ' ' || ch >= 128) {
1409 *p++ = '\\';
1410 *p++ = hexdigit[(ch >> 6) & 7];
1411 *p++ = hexdigit[(ch >> 3) & 7];
1412 *p++ = hexdigit[ch & 7];
1413 }
1414 /* Copy everything else as-is */
1415 else
1416 *p++ = (char) ch;
1417 }
1418 if (quotes)
1419 *p++ = q[1];
1420
1421 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001422 if (_PyString_Resize(&repr, p - q))
1423 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424
1425 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001426
1427 onError:
1428 Py_DECREF(repr);
1429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430}
1431
1432PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1433 int size)
1434{
1435 return unicodeescape_string(s, size, 0);
1436}
1437
1438PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1439{
1440 if (!PyUnicode_Check(unicode)) {
1441 PyErr_BadArgument();
1442 return NULL;
1443 }
1444 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1445 PyUnicode_GET_SIZE(unicode));
1446}
1447
1448/* --- Raw Unicode Escape Codec ------------------------------------------- */
1449
1450PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1451 int size,
1452 const char *errors)
1453{
1454 PyUnicodeObject *v;
1455 Py_UNICODE *p, *buf;
1456 const char *end;
1457 const char *bs;
1458
1459 /* Escaped strings will always be longer than the resulting
1460 Unicode string, so we start with size here and then reduce the
1461 length after conversion to the true value. */
1462 v = _PyUnicode_New(size);
1463 if (v == NULL)
1464 goto onError;
1465 if (size == 0)
1466 return (PyObject *)v;
1467 p = buf = PyUnicode_AS_UNICODE(v);
1468 end = s + size;
1469 while (s < end) {
1470 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001471 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 int i;
1473
1474 /* Non-escape characters are interpreted as Unicode ordinals */
1475 if (*s != '\\') {
1476 *p++ = (unsigned char)*s++;
1477 continue;
1478 }
1479
1480 /* \u-escapes are only interpreted iff the number of leading
1481 backslashes if odd */
1482 bs = s;
1483 for (;s < end;) {
1484 if (*s != '\\')
1485 break;
1486 *p++ = (unsigned char)*s++;
1487 }
1488 if (((s - bs) & 1) == 0 ||
1489 s >= end ||
1490 *s != 'u') {
1491 continue;
1492 }
1493 p--;
1494 s++;
1495
1496 /* \uXXXX with 4 hex digits */
1497 for (x = 0, i = 0; i < 4; i++) {
1498 c = (unsigned char)s[i];
1499 if (!isxdigit(c)) {
1500 if (unicodeescape_decoding_error(&s, &x, errors,
1501 "truncated \\uXXXX"))
1502 goto onError;
1503 i++;
1504 break;
1505 }
1506 x = (x<<4) & ~0xF;
1507 if (c >= '0' && c <= '9')
1508 x += c - '0';
1509 else if (c >= 'a' && c <= 'f')
1510 x += 10 + c - 'a';
1511 else
1512 x += 10 + c - 'A';
1513 }
1514 s += i;
1515 *p++ = x;
1516 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001517 if (_PyUnicode_Resize(v, (int)(p - buf)))
1518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 return (PyObject *)v;
1520
1521 onError:
1522 Py_XDECREF(v);
1523 return NULL;
1524}
1525
1526PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1527 int size)
1528{
1529 PyObject *repr;
1530 char *p;
1531 char *q;
1532
1533 static const char *hexdigit = "0123456789ABCDEF";
1534
1535 repr = PyString_FromStringAndSize(NULL, 6 * size);
1536 if (repr == NULL)
1537 return NULL;
1538
1539 p = q = PyString_AS_STRING(repr);
1540 while (size-- > 0) {
1541 Py_UNICODE ch = *s++;
1542 /* Map 16-bit characters to '\uxxxx' */
1543 if (ch >= 256) {
1544 *p++ = '\\';
1545 *p++ = 'u';
1546 *p++ = hexdigit[(ch >> 12) & 0xf];
1547 *p++ = hexdigit[(ch >> 8) & 0xf];
1548 *p++ = hexdigit[(ch >> 4) & 0xf];
1549 *p++ = hexdigit[ch & 15];
1550 }
1551 /* Copy everything else as-is */
1552 else
1553 *p++ = (char) ch;
1554 }
1555 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001556 if (_PyString_Resize(&repr, p - q))
1557 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
1559 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001560
1561 onError:
1562 Py_DECREF(repr);
1563 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564}
1565
1566PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1567{
1568 if (!PyUnicode_Check(unicode)) {
1569 PyErr_BadArgument();
1570 return NULL;
1571 }
1572 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1573 PyUnicode_GET_SIZE(unicode));
1574}
1575
1576/* --- Latin-1 Codec ------------------------------------------------------ */
1577
1578PyObject *PyUnicode_DecodeLatin1(const char *s,
1579 int size,
1580 const char *errors)
1581{
1582 PyUnicodeObject *v;
1583 Py_UNICODE *p;
1584
1585 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1586 v = _PyUnicode_New(size);
1587 if (v == NULL)
1588 goto onError;
1589 if (size == 0)
1590 return (PyObject *)v;
1591 p = PyUnicode_AS_UNICODE(v);
1592 while (size-- > 0)
1593 *p++ = (unsigned char)*s++;
1594 return (PyObject *)v;
1595
1596 onError:
1597 Py_XDECREF(v);
1598 return NULL;
1599}
1600
1601static
1602int latin1_encoding_error(const Py_UNICODE **source,
1603 char **dest,
1604 const char *errors,
1605 const char *details)
1606{
1607 if ((errors == NULL) ||
1608 (strcmp(errors,"strict") == 0)) {
1609 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001610 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 details);
1612 return -1;
1613 }
1614 else if (strcmp(errors,"ignore") == 0) {
1615 return 0;
1616 }
1617 else if (strcmp(errors,"replace") == 0) {
1618 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001619 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 return 0;
1621 }
1622 else {
1623 PyErr_Format(PyExc_ValueError,
1624 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001625 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 errors);
1627 return -1;
1628 }
1629}
1630
1631PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1632 int size,
1633 const char *errors)
1634{
1635 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001636 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637 repr = PyString_FromStringAndSize(NULL, size);
1638 if (repr == NULL)
1639 return NULL;
1640
1641 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001642 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 while (size-- > 0) {
1644 Py_UNICODE ch = *p++;
1645 if (ch >= 256) {
1646 if (latin1_encoding_error(&p, &s, errors,
1647 "ordinal not in range(256)"))
1648 goto onError;
1649 }
1650 else
1651 *s++ = (char)ch;
1652 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001653 /* Resize if error handling skipped some characters */
1654 if (s - start < PyString_GET_SIZE(repr))
1655 if (_PyString_Resize(&repr, s - start))
1656 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 return repr;
1658
1659 onError:
1660 Py_DECREF(repr);
1661 return NULL;
1662}
1663
1664PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1665{
1666 if (!PyUnicode_Check(unicode)) {
1667 PyErr_BadArgument();
1668 return NULL;
1669 }
1670 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1671 PyUnicode_GET_SIZE(unicode),
1672 NULL);
1673}
1674
1675/* --- 7-bit ASCII Codec -------------------------------------------------- */
1676
1677static
1678int ascii_decoding_error(const char **source,
1679 Py_UNICODE **dest,
1680 const char *errors,
1681 const char *details)
1682{
1683 if ((errors == NULL) ||
1684 (strcmp(errors,"strict") == 0)) {
1685 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001686 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 details);
1688 return -1;
1689 }
1690 else if (strcmp(errors,"ignore") == 0) {
1691 return 0;
1692 }
1693 else if (strcmp(errors,"replace") == 0) {
1694 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1695 (*dest)++;
1696 return 0;
1697 }
1698 else {
1699 PyErr_Format(PyExc_ValueError,
1700 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001701 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702 errors);
1703 return -1;
1704 }
1705}
1706
1707PyObject *PyUnicode_DecodeASCII(const char *s,
1708 int size,
1709 const char *errors)
1710{
1711 PyUnicodeObject *v;
1712 Py_UNICODE *p;
1713
1714 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1715 v = _PyUnicode_New(size);
1716 if (v == NULL)
1717 goto onError;
1718 if (size == 0)
1719 return (PyObject *)v;
1720 p = PyUnicode_AS_UNICODE(v);
1721 while (size-- > 0) {
1722 register unsigned char c;
1723
1724 c = (unsigned char)*s++;
1725 if (c < 128)
1726 *p++ = c;
1727 else if (ascii_decoding_error(&s, &p, errors,
1728 "ordinal not in range(128)"))
1729 goto onError;
1730 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001731 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1732 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 return (PyObject *)v;
1735
1736 onError:
1737 Py_XDECREF(v);
1738 return NULL;
1739}
1740
1741static
1742int ascii_encoding_error(const Py_UNICODE **source,
1743 char **dest,
1744 const char *errors,
1745 const char *details)
1746{
1747 if ((errors == NULL) ||
1748 (strcmp(errors,"strict") == 0)) {
1749 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001750 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 details);
1752 return -1;
1753 }
1754 else if (strcmp(errors,"ignore") == 0) {
1755 return 0;
1756 }
1757 else if (strcmp(errors,"replace") == 0) {
1758 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001759 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760 return 0;
1761 }
1762 else {
1763 PyErr_Format(PyExc_ValueError,
1764 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001765 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 errors);
1767 return -1;
1768 }
1769}
1770
1771PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1772 int size,
1773 const char *errors)
1774{
1775 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001776 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 repr = PyString_FromStringAndSize(NULL, size);
1778 if (repr == NULL)
1779 return NULL;
1780
1781 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001782 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 while (size-- > 0) {
1784 Py_UNICODE ch = *p++;
1785 if (ch >= 128) {
1786 if (ascii_encoding_error(&p, &s, errors,
1787 "ordinal not in range(128)"))
1788 goto onError;
1789 }
1790 else
1791 *s++ = (char)ch;
1792 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001793 /* Resize if error handling skipped some characters */
1794 if (s - start < PyString_GET_SIZE(repr))
1795 if (_PyString_Resize(&repr, s - start))
1796 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 return repr;
1798
1799 onError:
1800 Py_DECREF(repr);
1801 return NULL;
1802}
1803
1804PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1805{
1806 if (!PyUnicode_Check(unicode)) {
1807 PyErr_BadArgument();
1808 return NULL;
1809 }
1810 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1811 PyUnicode_GET_SIZE(unicode),
1812 NULL);
1813}
1814
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001815#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001816
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001817/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001818
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001819PyObject *PyUnicode_DecodeMBCS(const char *s,
1820 int size,
1821 const char *errors)
1822{
1823 PyUnicodeObject *v;
1824 Py_UNICODE *p;
1825
1826 /* First get the size of the result */
1827 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001828 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001829 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1830
1831 v = _PyUnicode_New(usize);
1832 if (v == NULL)
1833 return NULL;
1834 if (usize == 0)
1835 return (PyObject *)v;
1836 p = PyUnicode_AS_UNICODE(v);
1837 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1838 Py_DECREF(v);
1839 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1840 }
1841
1842 return (PyObject *)v;
1843}
1844
1845PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1846 int size,
1847 const char *errors)
1848{
1849 PyObject *repr;
1850 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001851 DWORD mbcssize;
1852
1853 /* If there are no characters, bail now! */
1854 if (size==0)
1855 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001856
1857 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001858 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001859 if (mbcssize==0)
1860 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1861
1862 repr = PyString_FromStringAndSize(NULL, mbcssize);
1863 if (repr == NULL)
1864 return NULL;
1865 if (mbcssize==0)
1866 return repr;
1867
1868 /* Do the conversion */
1869 s = PyString_AS_STRING(repr);
1870 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1871 Py_DECREF(repr);
1872 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1873 }
1874 return repr;
1875}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001876
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001877#endif /* MS_WIN32 */
1878
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879/* --- Character Mapping Codec -------------------------------------------- */
1880
1881static
1882int charmap_decoding_error(const char **source,
1883 Py_UNICODE **dest,
1884 const char *errors,
1885 const char *details)
1886{
1887 if ((errors == NULL) ||
1888 (strcmp(errors,"strict") == 0)) {
1889 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001890 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 details);
1892 return -1;
1893 }
1894 else if (strcmp(errors,"ignore") == 0) {
1895 return 0;
1896 }
1897 else if (strcmp(errors,"replace") == 0) {
1898 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1899 (*dest)++;
1900 return 0;
1901 }
1902 else {
1903 PyErr_Format(PyExc_ValueError,
1904 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001905 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906 errors);
1907 return -1;
1908 }
1909}
1910
1911PyObject *PyUnicode_DecodeCharmap(const char *s,
1912 int size,
1913 PyObject *mapping,
1914 const char *errors)
1915{
1916 PyUnicodeObject *v;
1917 Py_UNICODE *p;
1918
1919 /* Default to Latin-1 */
1920 if (mapping == NULL)
1921 return PyUnicode_DecodeLatin1(s, size, errors);
1922
1923 v = _PyUnicode_New(size);
1924 if (v == NULL)
1925 goto onError;
1926 if (size == 0)
1927 return (PyObject *)v;
1928 p = PyUnicode_AS_UNICODE(v);
1929 while (size-- > 0) {
1930 unsigned char ch = *s++;
1931 PyObject *w, *x;
1932
1933 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1934 w = PyInt_FromLong((long)ch);
1935 if (w == NULL)
1936 goto onError;
1937 x = PyObject_GetItem(mapping, w);
1938 Py_DECREF(w);
1939 if (x == NULL) {
1940 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1941 /* No mapping found: default to Latin-1 mapping */
1942 PyErr_Clear();
1943 *p++ = (Py_UNICODE)ch;
1944 continue;
1945 }
1946 goto onError;
1947 }
1948
1949 /* Apply mapping */
1950 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001951 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 if (value < 0 || value > 65535) {
1953 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001954 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 Py_DECREF(x);
1956 goto onError;
1957 }
1958 *p++ = (Py_UNICODE)value;
1959 }
1960 else if (x == Py_None) {
1961 /* undefined mapping */
1962 if (charmap_decoding_error(&s, &p, errors,
1963 "character maps to <undefined>")) {
1964 Py_DECREF(x);
1965 goto onError;
1966 }
1967 }
1968 else if (PyUnicode_Check(x)) {
1969 if (PyUnicode_GET_SIZE(x) != 1) {
1970 /* 1-n mapping */
1971 PyErr_SetString(PyExc_NotImplementedError,
1972 "1-n mappings are currently not implemented");
1973 Py_DECREF(x);
1974 goto onError;
1975 }
1976 *p++ = *PyUnicode_AS_UNICODE(x);
1977 }
1978 else {
1979 /* wrong return value */
1980 PyErr_SetString(PyExc_TypeError,
1981 "character mapping must return integer, None or unicode");
1982 Py_DECREF(x);
1983 goto onError;
1984 }
1985 Py_DECREF(x);
1986 }
1987 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1988 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1989 goto onError;
1990 return (PyObject *)v;
1991
1992 onError:
1993 Py_XDECREF(v);
1994 return NULL;
1995}
1996
1997static
1998int charmap_encoding_error(const Py_UNICODE **source,
1999 char **dest,
2000 const char *errors,
2001 const char *details)
2002{
2003 if ((errors == NULL) ||
2004 (strcmp(errors,"strict") == 0)) {
2005 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002006 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 details);
2008 return -1;
2009 }
2010 else if (strcmp(errors,"ignore") == 0) {
2011 return 0;
2012 }
2013 else if (strcmp(errors,"replace") == 0) {
2014 **dest = '?';
2015 (*dest)++;
2016 return 0;
2017 }
2018 else {
2019 PyErr_Format(PyExc_ValueError,
2020 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002021 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 errors);
2023 return -1;
2024 }
2025}
2026
2027PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2028 int size,
2029 PyObject *mapping,
2030 const char *errors)
2031{
2032 PyObject *v;
2033 char *s;
2034
2035 /* Default to Latin-1 */
2036 if (mapping == NULL)
2037 return PyUnicode_EncodeLatin1(p, size, errors);
2038
2039 v = PyString_FromStringAndSize(NULL, size);
2040 if (v == NULL)
2041 return NULL;
2042 s = PyString_AS_STRING(v);
2043 while (size-- > 0) {
2044 Py_UNICODE ch = *p++;
2045 PyObject *w, *x;
2046
2047 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2048 w = PyInt_FromLong((long)ch);
2049 if (w == NULL)
2050 goto onError;
2051 x = PyObject_GetItem(mapping, w);
2052 Py_DECREF(w);
2053 if (x == NULL) {
2054 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2055 /* No mapping found: default to Latin-1 mapping if possible */
2056 PyErr_Clear();
2057 if (ch < 256) {
2058 *s++ = (char)ch;
2059 continue;
2060 }
2061 else if (!charmap_encoding_error(&p, &s, errors,
2062 "missing character mapping"))
2063 continue;
2064 }
2065 goto onError;
2066 }
2067
2068 /* Apply mapping */
2069 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002070 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 if (value < 0 || value > 255) {
2072 PyErr_SetString(PyExc_TypeError,
2073 "character mapping must be in range(256)");
2074 Py_DECREF(x);
2075 goto onError;
2076 }
2077 *s++ = (char)value;
2078 }
2079 else if (x == Py_None) {
2080 /* undefined mapping */
2081 if (charmap_encoding_error(&p, &s, errors,
2082 "character maps to <undefined>")) {
2083 Py_DECREF(x);
2084 goto onError;
2085 }
2086 }
2087 else if (PyString_Check(x)) {
2088 if (PyString_GET_SIZE(x) != 1) {
2089 /* 1-n mapping */
2090 PyErr_SetString(PyExc_NotImplementedError,
2091 "1-n mappings are currently not implemented");
2092 Py_DECREF(x);
2093 goto onError;
2094 }
2095 *s++ = *PyString_AS_STRING(x);
2096 }
2097 else {
2098 /* wrong return value */
2099 PyErr_SetString(PyExc_TypeError,
2100 "character mapping must return integer, None or unicode");
2101 Py_DECREF(x);
2102 goto onError;
2103 }
2104 Py_DECREF(x);
2105 }
2106 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2107 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2108 goto onError;
2109 return v;
2110
2111 onError:
2112 Py_DECREF(v);
2113 return NULL;
2114}
2115
2116PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2117 PyObject *mapping)
2118{
2119 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2120 PyErr_BadArgument();
2121 return NULL;
2122 }
2123 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2124 PyUnicode_GET_SIZE(unicode),
2125 mapping,
2126 NULL);
2127}
2128
2129static
2130int translate_error(const Py_UNICODE **source,
2131 Py_UNICODE **dest,
2132 const char *errors,
2133 const char *details)
2134{
2135 if ((errors == NULL) ||
2136 (strcmp(errors,"strict") == 0)) {
2137 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002138 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 details);
2140 return -1;
2141 }
2142 else if (strcmp(errors,"ignore") == 0) {
2143 return 0;
2144 }
2145 else if (strcmp(errors,"replace") == 0) {
2146 **dest = '?';
2147 (*dest)++;
2148 return 0;
2149 }
2150 else {
2151 PyErr_Format(PyExc_ValueError,
2152 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002153 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 errors);
2155 return -1;
2156 }
2157}
2158
2159PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2160 int size,
2161 PyObject *mapping,
2162 const char *errors)
2163{
2164 PyUnicodeObject *v;
2165 Py_UNICODE *p;
2166
2167 if (mapping == NULL) {
2168 PyErr_BadArgument();
2169 return NULL;
2170 }
2171
2172 /* Output will never be longer than input */
2173 v = _PyUnicode_New(size);
2174 if (v == NULL)
2175 goto onError;
2176 if (size == 0)
2177 goto done;
2178 p = PyUnicode_AS_UNICODE(v);
2179 while (size-- > 0) {
2180 Py_UNICODE ch = *s++;
2181 PyObject *w, *x;
2182
2183 /* Get mapping */
2184 w = PyInt_FromLong(ch);
2185 if (w == NULL)
2186 goto onError;
2187 x = PyObject_GetItem(mapping, w);
2188 Py_DECREF(w);
2189 if (x == NULL) {
2190 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2191 /* No mapping found: default to 1-1 mapping */
2192 PyErr_Clear();
2193 *p++ = ch;
2194 continue;
2195 }
2196 goto onError;
2197 }
2198
2199 /* Apply mapping */
2200 if (PyInt_Check(x))
2201 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2202 else if (x == Py_None) {
2203 /* undefined mapping */
2204 if (translate_error(&s, &p, errors,
2205 "character maps to <undefined>")) {
2206 Py_DECREF(x);
2207 goto onError;
2208 }
2209 }
2210 else if (PyUnicode_Check(x)) {
2211 if (PyUnicode_GET_SIZE(x) != 1) {
2212 /* 1-n mapping */
2213 PyErr_SetString(PyExc_NotImplementedError,
2214 "1-n mappings are currently not implemented");
2215 Py_DECREF(x);
2216 goto onError;
2217 }
2218 *p++ = *PyUnicode_AS_UNICODE(x);
2219 }
2220 else {
2221 /* wrong return value */
2222 PyErr_SetString(PyExc_TypeError,
2223 "translate mapping must return integer, None or unicode");
2224 Py_DECREF(x);
2225 goto onError;
2226 }
2227 Py_DECREF(x);
2228 }
2229 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002230 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2231 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232
2233 done:
2234 return (PyObject *)v;
2235
2236 onError:
2237 Py_XDECREF(v);
2238 return NULL;
2239}
2240
2241PyObject *PyUnicode_Translate(PyObject *str,
2242 PyObject *mapping,
2243 const char *errors)
2244{
2245 PyObject *result;
2246
2247 str = PyUnicode_FromObject(str);
2248 if (str == NULL)
2249 goto onError;
2250 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2251 PyUnicode_GET_SIZE(str),
2252 mapping,
2253 errors);
2254 Py_DECREF(str);
2255 return result;
2256
2257 onError:
2258 Py_XDECREF(str);
2259 return NULL;
2260}
2261
Guido van Rossum9e896b32000-04-05 20:11:21 +00002262/* --- Decimal Encoder ---------------------------------------------------- */
2263
2264int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2265 int length,
2266 char *output,
2267 const char *errors)
2268{
2269 Py_UNICODE *p, *end;
2270
2271 if (output == NULL) {
2272 PyErr_BadArgument();
2273 return -1;
2274 }
2275
2276 p = s;
2277 end = s + length;
2278 while (p < end) {
2279 register Py_UNICODE ch = *p++;
2280 int decimal;
2281
2282 if (Py_UNICODE_ISSPACE(ch)) {
2283 *output++ = ' ';
2284 continue;
2285 }
2286 decimal = Py_UNICODE_TODECIMAL(ch);
2287 if (decimal >= 0) {
2288 *output++ = '0' + decimal;
2289 continue;
2290 }
Guido van Rossumba477042000-04-06 18:18:10 +00002291 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002292 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002293 continue;
2294 }
2295 /* All other characters are considered invalid */
2296 if (errors == NULL || strcmp(errors, "strict") == 0) {
2297 PyErr_SetString(PyExc_ValueError,
2298 "invalid decimal Unicode string");
2299 goto onError;
2300 }
2301 else if (strcmp(errors, "ignore") == 0)
2302 continue;
2303 else if (strcmp(errors, "replace") == 0) {
2304 *output++ = '?';
2305 continue;
2306 }
2307 }
2308 /* 0-terminate the output string */
2309 *output++ = '\0';
2310 return 0;
2311
2312 onError:
2313 return -1;
2314}
2315
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316/* --- Helpers ------------------------------------------------------------ */
2317
2318static
2319int count(PyUnicodeObject *self,
2320 int start,
2321 int end,
2322 PyUnicodeObject *substring)
2323{
2324 int count = 0;
2325
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002326 if (substring->length == 0)
2327 return (end - start + 1);
2328
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329 end -= substring->length;
2330
2331 while (start <= end)
2332 if (Py_UNICODE_MATCH(self, start, substring)) {
2333 count++;
2334 start += substring->length;
2335 } else
2336 start++;
2337
2338 return count;
2339}
2340
2341int PyUnicode_Count(PyObject *str,
2342 PyObject *substr,
2343 int start,
2344 int end)
2345{
2346 int result;
2347
2348 str = PyUnicode_FromObject(str);
2349 if (str == NULL)
2350 return -1;
2351 substr = PyUnicode_FromObject(substr);
2352 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002353 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 return -1;
2355 }
2356
2357 result = count((PyUnicodeObject *)str,
2358 start, end,
2359 (PyUnicodeObject *)substr);
2360
2361 Py_DECREF(str);
2362 Py_DECREF(substr);
2363 return result;
2364}
2365
2366static
2367int findstring(PyUnicodeObject *self,
2368 PyUnicodeObject *substring,
2369 int start,
2370 int end,
2371 int direction)
2372{
2373 if (start < 0)
2374 start += self->length;
2375 if (start < 0)
2376 start = 0;
2377
2378 if (substring->length == 0)
2379 return start;
2380
2381 if (end > self->length)
2382 end = self->length;
2383 if (end < 0)
2384 end += self->length;
2385 if (end < 0)
2386 end = 0;
2387
2388 end -= substring->length;
2389
2390 if (direction < 0) {
2391 for (; end >= start; end--)
2392 if (Py_UNICODE_MATCH(self, end, substring))
2393 return end;
2394 } else {
2395 for (; start <= end; start++)
2396 if (Py_UNICODE_MATCH(self, start, substring))
2397 return start;
2398 }
2399
2400 return -1;
2401}
2402
2403int PyUnicode_Find(PyObject *str,
2404 PyObject *substr,
2405 int start,
2406 int end,
2407 int direction)
2408{
2409 int result;
2410
2411 str = PyUnicode_FromObject(str);
2412 if (str == NULL)
2413 return -1;
2414 substr = PyUnicode_FromObject(substr);
2415 if (substr == NULL) {
2416 Py_DECREF(substr);
2417 return -1;
2418 }
2419
2420 result = findstring((PyUnicodeObject *)str,
2421 (PyUnicodeObject *)substr,
2422 start, end, direction);
2423 Py_DECREF(str);
2424 Py_DECREF(substr);
2425 return result;
2426}
2427
2428static
2429int tailmatch(PyUnicodeObject *self,
2430 PyUnicodeObject *substring,
2431 int start,
2432 int end,
2433 int direction)
2434{
2435 if (start < 0)
2436 start += self->length;
2437 if (start < 0)
2438 start = 0;
2439
2440 if (substring->length == 0)
2441 return 1;
2442
2443 if (end > self->length)
2444 end = self->length;
2445 if (end < 0)
2446 end += self->length;
2447 if (end < 0)
2448 end = 0;
2449
2450 end -= substring->length;
2451 if (end < start)
2452 return 0;
2453
2454 if (direction > 0) {
2455 if (Py_UNICODE_MATCH(self, end, substring))
2456 return 1;
2457 } else {
2458 if (Py_UNICODE_MATCH(self, start, substring))
2459 return 1;
2460 }
2461
2462 return 0;
2463}
2464
2465int PyUnicode_Tailmatch(PyObject *str,
2466 PyObject *substr,
2467 int start,
2468 int end,
2469 int direction)
2470{
2471 int result;
2472
2473 str = PyUnicode_FromObject(str);
2474 if (str == NULL)
2475 return -1;
2476 substr = PyUnicode_FromObject(substr);
2477 if (substr == NULL) {
2478 Py_DECREF(substr);
2479 return -1;
2480 }
2481
2482 result = tailmatch((PyUnicodeObject *)str,
2483 (PyUnicodeObject *)substr,
2484 start, end, direction);
2485 Py_DECREF(str);
2486 Py_DECREF(substr);
2487 return result;
2488}
2489
2490static
2491const Py_UNICODE *findchar(const Py_UNICODE *s,
2492 int size,
2493 Py_UNICODE ch)
2494{
2495 /* like wcschr, but doesn't stop at NULL characters */
2496
2497 while (size-- > 0) {
2498 if (*s == ch)
2499 return s;
2500 s++;
2501 }
2502
2503 return NULL;
2504}
2505
2506/* Apply fixfct filter to the Unicode object self and return a
2507 reference to the modified object */
2508
2509static
2510PyObject *fixup(PyUnicodeObject *self,
2511 int (*fixfct)(PyUnicodeObject *s))
2512{
2513
2514 PyUnicodeObject *u;
2515
2516 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2517 self->length);
2518 if (u == NULL)
2519 return NULL;
2520 if (!fixfct(u)) {
2521 /* fixfct should return TRUE if it modified the buffer. If
2522 FALSE, return a reference to the original buffer instead
2523 (to save space, not time) */
2524 Py_INCREF(self);
2525 Py_DECREF(u);
2526 return (PyObject*) self;
2527 }
2528 return (PyObject*) u;
2529}
2530
2531static
2532int fixupper(PyUnicodeObject *self)
2533{
2534 int len = self->length;
2535 Py_UNICODE *s = self->str;
2536 int status = 0;
2537
2538 while (len-- > 0) {
2539 register Py_UNICODE ch;
2540
2541 ch = Py_UNICODE_TOUPPER(*s);
2542 if (ch != *s) {
2543 status = 1;
2544 *s = ch;
2545 }
2546 s++;
2547 }
2548
2549 return status;
2550}
2551
2552static
2553int fixlower(PyUnicodeObject *self)
2554{
2555 int len = self->length;
2556 Py_UNICODE *s = self->str;
2557 int status = 0;
2558
2559 while (len-- > 0) {
2560 register Py_UNICODE ch;
2561
2562 ch = Py_UNICODE_TOLOWER(*s);
2563 if (ch != *s) {
2564 status = 1;
2565 *s = ch;
2566 }
2567 s++;
2568 }
2569
2570 return status;
2571}
2572
2573static
2574int fixswapcase(PyUnicodeObject *self)
2575{
2576 int len = self->length;
2577 Py_UNICODE *s = self->str;
2578 int status = 0;
2579
2580 while (len-- > 0) {
2581 if (Py_UNICODE_ISUPPER(*s)) {
2582 *s = Py_UNICODE_TOLOWER(*s);
2583 status = 1;
2584 } else if (Py_UNICODE_ISLOWER(*s)) {
2585 *s = Py_UNICODE_TOUPPER(*s);
2586 status = 1;
2587 }
2588 s++;
2589 }
2590
2591 return status;
2592}
2593
2594static
2595int fixcapitalize(PyUnicodeObject *self)
2596{
2597 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2598 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2599 return 1;
2600 }
2601 return 0;
2602}
2603
2604static
2605int fixtitle(PyUnicodeObject *self)
2606{
2607 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2608 register Py_UNICODE *e;
2609 int previous_is_cased;
2610
2611 /* Shortcut for single character strings */
2612 if (PyUnicode_GET_SIZE(self) == 1) {
2613 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2614 if (*p != ch) {
2615 *p = ch;
2616 return 1;
2617 }
2618 else
2619 return 0;
2620 }
2621
2622 e = p + PyUnicode_GET_SIZE(self);
2623 previous_is_cased = 0;
2624 for (; p < e; p++) {
2625 register const Py_UNICODE ch = *p;
2626
2627 if (previous_is_cased)
2628 *p = Py_UNICODE_TOLOWER(ch);
2629 else
2630 *p = Py_UNICODE_TOTITLE(ch);
2631
2632 if (Py_UNICODE_ISLOWER(ch) ||
2633 Py_UNICODE_ISUPPER(ch) ||
2634 Py_UNICODE_ISTITLE(ch))
2635 previous_is_cased = 1;
2636 else
2637 previous_is_cased = 0;
2638 }
2639 return 1;
2640}
2641
2642PyObject *PyUnicode_Join(PyObject *separator,
2643 PyObject *seq)
2644{
2645 Py_UNICODE *sep;
2646 int seplen;
2647 PyUnicodeObject *res = NULL;
2648 int reslen = 0;
2649 Py_UNICODE *p;
2650 int seqlen = 0;
2651 int sz = 100;
2652 int i;
2653
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002654 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655 if (seqlen < 0 && PyErr_Occurred())
2656 return NULL;
2657
2658 if (separator == NULL) {
2659 Py_UNICODE blank = ' ';
2660 sep = &blank;
2661 seplen = 1;
2662 }
2663 else {
2664 separator = PyUnicode_FromObject(separator);
2665 if (separator == NULL)
2666 return NULL;
2667 sep = PyUnicode_AS_UNICODE(separator);
2668 seplen = PyUnicode_GET_SIZE(separator);
2669 }
2670
2671 res = _PyUnicode_New(sz);
2672 if (res == NULL)
2673 goto onError;
2674 p = PyUnicode_AS_UNICODE(res);
2675 reslen = 0;
2676
2677 for (i = 0; i < seqlen; i++) {
2678 int itemlen;
2679 PyObject *item;
2680
2681 item = PySequence_GetItem(seq, i);
2682 if (item == NULL)
2683 goto onError;
2684 if (!PyUnicode_Check(item)) {
2685 PyObject *v;
2686 v = PyUnicode_FromObject(item);
2687 Py_DECREF(item);
2688 item = v;
2689 if (item == NULL)
2690 goto onError;
2691 }
2692 itemlen = PyUnicode_GET_SIZE(item);
2693 while (reslen + itemlen + seplen >= sz) {
2694 if (_PyUnicode_Resize(res, sz*2))
2695 goto onError;
2696 sz *= 2;
2697 p = PyUnicode_AS_UNICODE(res) + reslen;
2698 }
2699 if (i > 0) {
2700 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2701 p += seplen;
2702 reslen += seplen;
2703 }
2704 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2705 p += itemlen;
2706 reslen += itemlen;
2707 Py_DECREF(item);
2708 }
2709 if (_PyUnicode_Resize(res, reslen))
2710 goto onError;
2711
2712 Py_XDECREF(separator);
2713 return (PyObject *)res;
2714
2715 onError:
2716 Py_XDECREF(separator);
2717 Py_DECREF(res);
2718 return NULL;
2719}
2720
2721static
2722PyUnicodeObject *pad(PyUnicodeObject *self,
2723 int left,
2724 int right,
2725 Py_UNICODE fill)
2726{
2727 PyUnicodeObject *u;
2728
2729 if (left < 0)
2730 left = 0;
2731 if (right < 0)
2732 right = 0;
2733
2734 if (left == 0 && right == 0) {
2735 Py_INCREF(self);
2736 return self;
2737 }
2738
2739 u = _PyUnicode_New(left + self->length + right);
2740 if (u) {
2741 if (left)
2742 Py_UNICODE_FILL(u->str, fill, left);
2743 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2744 if (right)
2745 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2746 }
2747
2748 return u;
2749}
2750
/* Append the slice data[left:right] to list as a new Unicode object.

   Relies on the expanding function to provide a PyObject *str scratch
   variable, the PyObject *list being filled and an onError label that
   is jumped to if creating or appending the slice fails.

   Wrapped in do { } while (0) so that the expansion behaves like a
   single statement; arguments are parenthesised and evaluated once. */

#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2761
2762static
2763PyObject *split_whitespace(PyUnicodeObject *self,
2764 PyObject *list,
2765 int maxcount)
2766{
2767 register int i;
2768 register int j;
2769 int len = self->length;
2770 PyObject *str;
2771
2772 for (i = j = 0; i < len; ) {
2773 /* find a token */
2774 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2775 i++;
2776 j = i;
2777 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2778 i++;
2779 if (j < i) {
2780 if (maxcount-- <= 0)
2781 break;
2782 SPLIT_APPEND(self->str, j, i);
2783 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2784 i++;
2785 j = i;
2786 }
2787 }
2788 if (j < len) {
2789 SPLIT_APPEND(self->str, j, len);
2790 }
2791 return list;
2792
2793 onError:
2794 Py_DECREF(list);
2795 return NULL;
2796}
2797
2798PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002799 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800{
2801 register int i;
2802 register int j;
2803 int len;
2804 PyObject *list;
2805 PyObject *str;
2806 Py_UNICODE *data;
2807
2808 string = PyUnicode_FromObject(string);
2809 if (string == NULL)
2810 return NULL;
2811 data = PyUnicode_AS_UNICODE(string);
2812 len = PyUnicode_GET_SIZE(string);
2813
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 list = PyList_New(0);
2815 if (!list)
2816 goto onError;
2817
2818 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002819 int eol;
2820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 /* Find a line and append it */
2822 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2823 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824
2825 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002826 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 if (i < len) {
2828 if (data[i] == '\r' && i + 1 < len &&
2829 data[i+1] == '\n')
2830 i += 2;
2831 else
2832 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002833 if (keepends)
2834 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 }
Guido van Rossum86662912000-04-11 15:38:46 +00002836 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 j = i;
2838 }
2839 if (j < len) {
2840 SPLIT_APPEND(data, j, len);
2841 }
2842
2843 Py_DECREF(string);
2844 return list;
2845
2846 onError:
2847 Py_DECREF(list);
2848 Py_DECREF(string);
2849 return NULL;
2850}
2851
2852static
2853PyObject *split_char(PyUnicodeObject *self,
2854 PyObject *list,
2855 Py_UNICODE ch,
2856 int maxcount)
2857{
2858 register int i;
2859 register int j;
2860 int len = self->length;
2861 PyObject *str;
2862
2863 for (i = j = 0; i < len; ) {
2864 if (self->str[i] == ch) {
2865 if (maxcount-- <= 0)
2866 break;
2867 SPLIT_APPEND(self->str, j, i);
2868 i = j = i + 1;
2869 } else
2870 i++;
2871 }
2872 if (j <= len) {
2873 SPLIT_APPEND(self->str, j, len);
2874 }
2875 return list;
2876
2877 onError:
2878 Py_DECREF(list);
2879 return NULL;
2880}
2881
2882static
2883PyObject *split_substring(PyUnicodeObject *self,
2884 PyObject *list,
2885 PyUnicodeObject *substring,
2886 int maxcount)
2887{
2888 register int i;
2889 register int j;
2890 int len = self->length;
2891 int sublen = substring->length;
2892 PyObject *str;
2893
2894 for (i = j = 0; i < len - sublen; ) {
2895 if (Py_UNICODE_MATCH(self, i, substring)) {
2896 if (maxcount-- <= 0)
2897 break;
2898 SPLIT_APPEND(self->str, j, i);
2899 i = j = i + sublen;
2900 } else
2901 i++;
2902 }
2903 if (j <= len) {
2904 SPLIT_APPEND(self->str, j, len);
2905 }
2906 return list;
2907
2908 onError:
2909 Py_DECREF(list);
2910 return NULL;
2911}
2912
2913#undef SPLIT_APPEND
2914
2915static
2916PyObject *split(PyUnicodeObject *self,
2917 PyUnicodeObject *substring,
2918 int maxcount)
2919{
2920 PyObject *list;
2921
2922 if (maxcount < 0)
2923 maxcount = INT_MAX;
2924
2925 list = PyList_New(0);
2926 if (!list)
2927 return NULL;
2928
2929 if (substring == NULL)
2930 return split_whitespace(self,list,maxcount);
2931
2932 else if (substring->length == 1)
2933 return split_char(self,list,substring->str[0],maxcount);
2934
2935 else if (substring->length == 0) {
2936 Py_DECREF(list);
2937 PyErr_SetString(PyExc_ValueError, "empty separator");
2938 return NULL;
2939 }
2940 else
2941 return split_substring(self,list,substring,maxcount);
2942}
2943
2944static
2945PyObject *strip(PyUnicodeObject *self,
2946 int left,
2947 int right)
2948{
2949 Py_UNICODE *p = self->str;
2950 int start = 0;
2951 int end = self->length;
2952
2953 if (left)
2954 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2955 start++;
2956
2957 if (right)
2958 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2959 end--;
2960
2961 if (start == 0 && end == self->length) {
2962 /* couldn't strip anything off, return original string */
2963 Py_INCREF(self);
2964 return (PyObject*) self;
2965 }
2966
2967 return (PyObject*) PyUnicode_FromUnicode(
2968 self->str + start,
2969 end - start
2970 );
2971}
2972
2973static
2974PyObject *replace(PyUnicodeObject *self,
2975 PyUnicodeObject *str1,
2976 PyUnicodeObject *str2,
2977 int maxcount)
2978{
2979 PyUnicodeObject *u;
2980
2981 if (maxcount < 0)
2982 maxcount = INT_MAX;
2983
2984 if (str1->length == 1 && str2->length == 1) {
2985 int i;
2986
2987 /* replace characters */
2988 if (!findchar(self->str, self->length, str1->str[0])) {
2989 /* nothing to replace, return original string */
2990 Py_INCREF(self);
2991 u = self;
2992 } else {
2993 Py_UNICODE u1 = str1->str[0];
2994 Py_UNICODE u2 = str2->str[0];
2995
2996 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2997 self->str,
2998 self->length
2999 );
3000 if (u)
3001 for (i = 0; i < u->length; i++)
3002 if (u->str[i] == u1) {
3003 if (--maxcount < 0)
3004 break;
3005 u->str[i] = u2;
3006 }
3007 }
3008
3009 } else {
3010 int n, i;
3011 Py_UNICODE *p;
3012
3013 /* replace strings */
3014 n = count(self, 0, self->length, str1);
3015 if (n > maxcount)
3016 n = maxcount;
3017 if (n == 0) {
3018 /* nothing to replace, return original string */
3019 Py_INCREF(self);
3020 u = self;
3021 } else {
3022 u = _PyUnicode_New(
3023 self->length + n * (str2->length - str1->length));
3024 if (u) {
3025 i = 0;
3026 p = u->str;
3027 while (i <= self->length - str1->length)
3028 if (Py_UNICODE_MATCH(self, i, str1)) {
3029 /* replace string segment */
3030 Py_UNICODE_COPY(p, str2->str, str2->length);
3031 p += str2->length;
3032 i += str1->length;
3033 if (--n <= 0) {
3034 /* copy remaining part */
3035 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3036 break;
3037 }
3038 } else
3039 *p++ = self->str[i++];
3040 }
3041 }
3042 }
3043
3044 return (PyObject *) u;
3045}
3046
3047/* --- Unicode Object Methods --------------------------------------------- */
3048
3049static char title__doc__[] =
3050"S.title() -> unicode\n\
3051\n\
3052Return a titlecased version of S, i.e. words start with title case\n\
3053characters, all remaining cased characters have lower case.";
3054
3055static PyObject*
3056unicode_title(PyUnicodeObject *self, PyObject *args)
3057{
3058 if (!PyArg_NoArgs(args))
3059 return NULL;
3060 return fixup(self, fixtitle);
3061}
3062
3063static char capitalize__doc__[] =
3064"S.capitalize() -> unicode\n\
3065\n\
3066Return a capitalized version of S, i.e. make the first character\n\
3067have upper case.";
3068
3069static PyObject*
3070unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3071{
3072 if (!PyArg_NoArgs(args))
3073 return NULL;
3074 return fixup(self, fixcapitalize);
3075}
3076
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method implementation of S.capwords() (currently compiled out): split
   at whitespace, capitalize every word, then join the words with the
   default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *item;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here if capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
3117
3118static char center__doc__[] =
3119"S.center(width) -> unicode\n\
3120\n\
3121Return S centered in a Unicode string of length width. Padding is done\n\
3122using spaces.";
3123
3124static PyObject *
3125unicode_center(PyUnicodeObject *self, PyObject *args)
3126{
3127 int marg, left;
3128 int width;
3129
3130 if (!PyArg_ParseTuple(args, "i:center", &width))
3131 return NULL;
3132
3133 if (self->length >= width) {
3134 Py_INCREF(self);
3135 return (PyObject*) self;
3136 }
3137
3138 marg = width - self->length;
3139 left = marg / 2 + (marg & width & 1);
3140
3141 return (PyObject*) pad(self, left, marg - left, ' ');
3142}
3143
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003144/* speedy UTF-16 code point order comparison */
3145/* gleaned from: */
3146/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3147
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003148static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003149{
3150 0, 0, 0, 0, 0, 0, 0, 0,
3151 0, 0, 0, 0, 0, 0, 0, 0,
3152 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003153 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003154};
3155
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156static int
3157unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3158{
3159 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003160
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 Py_UNICODE *s1 = str1->str;
3162 Py_UNICODE *s2 = str2->str;
3163
3164 len1 = str1->length;
3165 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003166
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003168 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003169 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003170
3171 c1 = *s1++;
3172 c2 = *s2++;
3173 if (c1 > (1<<11) * 26)
3174 c1 += utf16Fixup[c1>>11];
3175 if (c2 > (1<<11) * 26)
3176 c2 += utf16Fixup[c2>>11];
3177
3178 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003179 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003180 if (diff)
3181 return (diff < 0) ? -1 : (diff != 0);
3182 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 }
3184
3185 return (len1 < len2) ? -1 : (len1 != len2);
3186}
3187
3188int PyUnicode_Compare(PyObject *left,
3189 PyObject *right)
3190{
3191 PyUnicodeObject *u = NULL, *v = NULL;
3192 int result;
3193
3194 /* Coerce the two arguments */
3195 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3196 if (u == NULL)
3197 goto onError;
3198 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3199 if (v == NULL)
3200 goto onError;
3201
Thomas Wouters7e474022000-07-16 12:04:32 +00003202 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 if (v == u) {
3204 Py_DECREF(u);
3205 Py_DECREF(v);
3206 return 0;
3207 }
3208
3209 result = unicode_compare(u, v);
3210
3211 Py_DECREF(u);
3212 Py_DECREF(v);
3213 return result;
3214
3215onError:
3216 Py_XDECREF(u);
3217 Py_XDECREF(v);
3218 return -1;
3219}
3220
Guido van Rossum403d68b2000-03-13 15:55:09 +00003221int PyUnicode_Contains(PyObject *container,
3222 PyObject *element)
3223{
3224 PyUnicodeObject *u = NULL, *v = NULL;
3225 int result;
3226 register const Py_UNICODE *p, *e;
3227 register Py_UNICODE ch;
3228
3229 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003230 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003231 if (v == NULL) {
3232 PyErr_SetString(PyExc_TypeError,
3233 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003234 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003235 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003236 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3237 if (u == NULL) {
3238 Py_DECREF(v);
3239 goto onError;
3240 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003241
3242 /* Check v in u */
3243 if (PyUnicode_GET_SIZE(v) != 1) {
3244 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003245 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003246 goto onError;
3247 }
3248 ch = *PyUnicode_AS_UNICODE(v);
3249 p = PyUnicode_AS_UNICODE(u);
3250 e = p + PyUnicode_GET_SIZE(u);
3251 result = 0;
3252 while (p < e) {
3253 if (*p++ == ch) {
3254 result = 1;
3255 break;
3256 }
3257 }
3258
3259 Py_DECREF(u);
3260 Py_DECREF(v);
3261 return result;
3262
3263onError:
3264 Py_XDECREF(u);
3265 Py_XDECREF(v);
3266 return -1;
3267}
3268
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269/* Concat to string or Unicode object giving a new Unicode object. */
3270
3271PyObject *PyUnicode_Concat(PyObject *left,
3272 PyObject *right)
3273{
3274 PyUnicodeObject *u = NULL, *v = NULL, *w;
3275
3276 /* Coerce the two arguments */
3277 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3278 if (u == NULL)
3279 goto onError;
3280 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3281 if (v == NULL)
3282 goto onError;
3283
3284 /* Shortcuts */
3285 if (v == unicode_empty) {
3286 Py_DECREF(v);
3287 return (PyObject *)u;
3288 }
3289 if (u == unicode_empty) {
3290 Py_DECREF(u);
3291 return (PyObject *)v;
3292 }
3293
3294 /* Concat the two Unicode strings */
3295 w = _PyUnicode_New(u->length + v->length);
3296 if (w == NULL)
3297 goto onError;
3298 Py_UNICODE_COPY(w->str, u->str, u->length);
3299 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3300
3301 Py_DECREF(u);
3302 Py_DECREF(v);
3303 return (PyObject *)w;
3304
3305onError:
3306 Py_XDECREF(u);
3307 Py_XDECREF(v);
3308 return NULL;
3309}
3310
3311static char count__doc__[] =
3312"S.count(sub[, start[, end]]) -> int\n\
3313\n\
3314Return the number of occurrences of substring sub in Unicode string\n\
3315S[start:end]. Optional arguments start and end are\n\
3316interpreted as in slice notation.";
3317
3318static PyObject *
3319unicode_count(PyUnicodeObject *self, PyObject *args)
3320{
3321 PyUnicodeObject *substring;
3322 int start = 0;
3323 int end = INT_MAX;
3324 PyObject *result;
3325
Guido van Rossumb8872e62000-05-09 14:14:27 +00003326 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3327 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 return NULL;
3329
3330 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3331 (PyObject *)substring);
3332 if (substring == NULL)
3333 return NULL;
3334
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 if (start < 0)
3336 start += self->length;
3337 if (start < 0)
3338 start = 0;
3339 if (end > self->length)
3340 end = self->length;
3341 if (end < 0)
3342 end += self->length;
3343 if (end < 0)
3344 end = 0;
3345
3346 result = PyInt_FromLong((long) count(self, start, end, substring));
3347
3348 Py_DECREF(substring);
3349 return result;
3350}
3351
3352static char encode__doc__[] =
3353"S.encode([encoding[,errors]]) -> string\n\
3354\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003355Return an encoded string version of S. Default encoding is the current\n\
3356default string encoding. errors may be given to set a different error\n\
3357handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3358a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
3360static PyObject *
3361unicode_encode(PyUnicodeObject *self, PyObject *args)
3362{
3363 char *encoding = NULL;
3364 char *errors = NULL;
3365 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3366 return NULL;
3367 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3368}
3369
3370static char expandtabs__doc__[] =
3371"S.expandtabs([tabsize]) -> unicode\n\
3372\n\
3373Return a copy of S where all tab characters are expanded using spaces.\n\
3374If tabsize is not given, a tab size of 8 characters is assumed.";
3375
3376static PyObject*
3377unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3378{
3379 Py_UNICODE *e;
3380 Py_UNICODE *p;
3381 Py_UNICODE *q;
3382 int i, j;
3383 PyUnicodeObject *u;
3384 int tabsize = 8;
3385
3386 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3387 return NULL;
3388
Thomas Wouters7e474022000-07-16 12:04:32 +00003389 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 i = j = 0;
3391 e = self->str + self->length;
3392 for (p = self->str; p < e; p++)
3393 if (*p == '\t') {
3394 if (tabsize > 0)
3395 j += tabsize - (j % tabsize);
3396 }
3397 else {
3398 j++;
3399 if (*p == '\n' || *p == '\r') {
3400 i += j;
3401 j = 0;
3402 }
3403 }
3404
3405 /* Second pass: create output string and fill it */
3406 u = _PyUnicode_New(i + j);
3407 if (!u)
3408 return NULL;
3409
3410 j = 0;
3411 q = u->str;
3412
3413 for (p = self->str; p < e; p++)
3414 if (*p == '\t') {
3415 if (tabsize > 0) {
3416 i = tabsize - (j % tabsize);
3417 j += i;
3418 while (i--)
3419 *q++ = ' ';
3420 }
3421 }
3422 else {
3423 j++;
3424 *q++ = *p;
3425 if (*p == '\n' || *p == '\r')
3426 j = 0;
3427 }
3428
3429 return (PyObject*) u;
3430}
3431
3432static char find__doc__[] =
3433"S.find(sub [,start [,end]]) -> int\n\
3434\n\
3435Return the lowest index in S where substring sub is found,\n\
3436such that sub is contained within s[start,end]. Optional\n\
3437arguments start and end are interpreted as in slice notation.\n\
3438\n\
3439Return -1 on failure.";
3440
3441static PyObject *
3442unicode_find(PyUnicodeObject *self, PyObject *args)
3443{
3444 PyUnicodeObject *substring;
3445 int start = 0;
3446 int end = INT_MAX;
3447 PyObject *result;
3448
Guido van Rossumb8872e62000-05-09 14:14:27 +00003449 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3450 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 return NULL;
3452 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3453 (PyObject *)substring);
3454 if (substring == NULL)
3455 return NULL;
3456
3457 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3458
3459 Py_DECREF(substring);
3460 return result;
3461}
3462
3463static PyObject *
3464unicode_getitem(PyUnicodeObject *self, int index)
3465{
3466 if (index < 0 || index >= self->length) {
3467 PyErr_SetString(PyExc_IndexError, "string index out of range");
3468 return NULL;
3469 }
3470
3471 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3472}
3473
3474static long
3475unicode_hash(PyUnicodeObject *self)
3476{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003477 /* Since Unicode objects compare equal to their ASCII string
3478 counterparts, they should use the individual character values
3479 as basis for their hash value. This is needed to assure that
3480 strings and Unicode objects behave in the same way as
3481 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482
Fredrik Lundhdde61642000-07-10 18:27:47 +00003483 register int len;
3484 register Py_UNICODE *p;
3485 register long x;
3486
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 if (self->hash != -1)
3488 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003489 len = PyUnicode_GET_SIZE(self);
3490 p = PyUnicode_AS_UNICODE(self);
3491 x = *p << 7;
3492 while (--len >= 0)
3493 x = (1000003*x) ^ *p++;
3494 x ^= PyUnicode_GET_SIZE(self);
3495 if (x == -1)
3496 x = -2;
3497 self->hash = x;
3498 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499}
3500
3501static char index__doc__[] =
3502"S.index(sub [,start [,end]]) -> int\n\
3503\n\
3504Like S.find() but raise ValueError when the substring is not found.";
3505
3506static PyObject *
3507unicode_index(PyUnicodeObject *self, PyObject *args)
3508{
3509 int result;
3510 PyUnicodeObject *substring;
3511 int start = 0;
3512 int end = INT_MAX;
3513
Guido van Rossumb8872e62000-05-09 14:14:27 +00003514 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3515 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 return NULL;
3517
3518 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3519 (PyObject *)substring);
3520 if (substring == NULL)
3521 return NULL;
3522
3523 result = findstring(self, substring, start, end, 1);
3524
3525 Py_DECREF(substring);
3526 if (result < 0) {
3527 PyErr_SetString(PyExc_ValueError, "substring not found");
3528 return NULL;
3529 }
3530 return PyInt_FromLong(result);
3531}
3532
3533static char islower__doc__[] =
3534"S.islower() -> int\n\
3535\n\
3536Return 1 if all cased characters in S are lowercase and there is\n\
3537at least one cased character in S, 0 otherwise.";
3538
3539static PyObject*
3540unicode_islower(PyUnicodeObject *self, PyObject *args)
3541{
3542 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3543 register const Py_UNICODE *e;
3544 int cased;
3545
3546 if (!PyArg_NoArgs(args))
3547 return NULL;
3548
3549 /* Shortcut for single character strings */
3550 if (PyUnicode_GET_SIZE(self) == 1)
3551 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3552
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003553 /* Special case for empty strings */
3554 if (PyString_GET_SIZE(self) == 0)
3555 return PyInt_FromLong(0);
3556
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 e = p + PyUnicode_GET_SIZE(self);
3558 cased = 0;
3559 for (; p < e; p++) {
3560 register const Py_UNICODE ch = *p;
3561
3562 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3563 return PyInt_FromLong(0);
3564 else if (!cased && Py_UNICODE_ISLOWER(ch))
3565 cased = 1;
3566 }
3567 return PyInt_FromLong(cased);
3568}
3569
3570static char isupper__doc__[] =
3571"S.isupper() -> int\n\
3572\n\
3573Return 1 if all cased characters in S are uppercase and there is\n\
3574at least one cased character in S, 0 otherwise.";
3575
3576static PyObject*
3577unicode_isupper(PyUnicodeObject *self, PyObject *args)
3578{
3579 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3580 register const Py_UNICODE *e;
3581 int cased;
3582
3583 if (!PyArg_NoArgs(args))
3584 return NULL;
3585
3586 /* Shortcut for single character strings */
3587 if (PyUnicode_GET_SIZE(self) == 1)
3588 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3589
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003590 /* Special case for empty strings */
3591 if (PyString_GET_SIZE(self) == 0)
3592 return PyInt_FromLong(0);
3593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 e = p + PyUnicode_GET_SIZE(self);
3595 cased = 0;
3596 for (; p < e; p++) {
3597 register const Py_UNICODE ch = *p;
3598
3599 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3600 return PyInt_FromLong(0);
3601 else if (!cased && Py_UNICODE_ISUPPER(ch))
3602 cased = 1;
3603 }
3604 return PyInt_FromLong(cased);
3605}
3606
3607static char istitle__doc__[] =
3608"S.istitle() -> int\n\
3609\n\
3610Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3611may only follow uncased characters and lowercase characters only cased\n\
3612ones. Return 0 otherwise.";
3613
3614static PyObject*
3615unicode_istitle(PyUnicodeObject *self, PyObject *args)
3616{
3617 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3618 register const Py_UNICODE *e;
3619 int cased, previous_is_cased;
3620
3621 if (!PyArg_NoArgs(args))
3622 return NULL;
3623
3624 /* Shortcut for single character strings */
3625 if (PyUnicode_GET_SIZE(self) == 1)
3626 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3627 (Py_UNICODE_ISUPPER(*p) != 0));
3628
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003629 /* Special case for empty strings */
3630 if (PyString_GET_SIZE(self) == 0)
3631 return PyInt_FromLong(0);
3632
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633 e = p + PyUnicode_GET_SIZE(self);
3634 cased = 0;
3635 previous_is_cased = 0;
3636 for (; p < e; p++) {
3637 register const Py_UNICODE ch = *p;
3638
3639 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3640 if (previous_is_cased)
3641 return PyInt_FromLong(0);
3642 previous_is_cased = 1;
3643 cased = 1;
3644 }
3645 else if (Py_UNICODE_ISLOWER(ch)) {
3646 if (!previous_is_cased)
3647 return PyInt_FromLong(0);
3648 previous_is_cased = 1;
3649 cased = 1;
3650 }
3651 else
3652 previous_is_cased = 0;
3653 }
3654 return PyInt_FromLong(cased);
3655}
3656
3657static char isspace__doc__[] =
3658"S.isspace() -> int\n\
3659\n\
3660Return 1 if there are only whitespace characters in S,\n\
36610 otherwise.";
3662
3663static PyObject*
3664unicode_isspace(PyUnicodeObject *self, PyObject *args)
3665{
3666 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3667 register const Py_UNICODE *e;
3668
3669 if (!PyArg_NoArgs(args))
3670 return NULL;
3671
3672 /* Shortcut for single character strings */
3673 if (PyUnicode_GET_SIZE(self) == 1 &&
3674 Py_UNICODE_ISSPACE(*p))
3675 return PyInt_FromLong(1);
3676
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003677 /* Special case for empty strings */
3678 if (PyString_GET_SIZE(self) == 0)
3679 return PyInt_FromLong(0);
3680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 e = p + PyUnicode_GET_SIZE(self);
3682 for (; p < e; p++) {
3683 if (!Py_UNICODE_ISSPACE(*p))
3684 return PyInt_FromLong(0);
3685 }
3686 return PyInt_FromLong(1);
3687}
3688
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003689static char isalpha__doc__[] =
3690"S.isalpha() -> int\n\
3691\n\
3692Return 1 if all characters in S are alphabetic\n\
3693and there is at least one character in S, 0 otherwise.";
3694
3695static PyObject*
3696unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3697{
3698 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3699 register const Py_UNICODE *e;
3700
3701 if (!PyArg_NoArgs(args))
3702 return NULL;
3703
3704 /* Shortcut for single character strings */
3705 if (PyUnicode_GET_SIZE(self) == 1 &&
3706 Py_UNICODE_ISALPHA(*p))
3707 return PyInt_FromLong(1);
3708
3709 /* Special case for empty strings */
3710 if (PyString_GET_SIZE(self) == 0)
3711 return PyInt_FromLong(0);
3712
3713 e = p + PyUnicode_GET_SIZE(self);
3714 for (; p < e; p++) {
3715 if (!Py_UNICODE_ISALPHA(*p))
3716 return PyInt_FromLong(0);
3717 }
3718 return PyInt_FromLong(1);
3719}
3720
3721static char isalnum__doc__[] =
3722"S.isalnum() -> int\n\
3723\n\
3724Return 1 if all characters in S are alphanumeric\n\
3725and there is at least one character in S, 0 otherwise.";
3726
3727static PyObject*
3728unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3729{
3730 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3731 register const Py_UNICODE *e;
3732
3733 if (!PyArg_NoArgs(args))
3734 return NULL;
3735
3736 /* Shortcut for single character strings */
3737 if (PyUnicode_GET_SIZE(self) == 1 &&
3738 Py_UNICODE_ISALNUM(*p))
3739 return PyInt_FromLong(1);
3740
3741 /* Special case for empty strings */
3742 if (PyString_GET_SIZE(self) == 0)
3743 return PyInt_FromLong(0);
3744
3745 e = p + PyUnicode_GET_SIZE(self);
3746 for (; p < e; p++) {
3747 if (!Py_UNICODE_ISALNUM(*p))
3748 return PyInt_FromLong(0);
3749 }
3750 return PyInt_FromLong(1);
3751}
3752
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753static char isdecimal__doc__[] =
3754"S.isdecimal() -> int\n\
3755\n\
3756Return 1 if there are only decimal characters in S,\n\
37570 otherwise.";
3758
3759static PyObject*
3760unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3761{
3762 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3763 register const Py_UNICODE *e;
3764
3765 if (!PyArg_NoArgs(args))
3766 return NULL;
3767
3768 /* Shortcut for single character strings */
3769 if (PyUnicode_GET_SIZE(self) == 1 &&
3770 Py_UNICODE_ISDECIMAL(*p))
3771 return PyInt_FromLong(1);
3772
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003773 /* Special case for empty strings */
3774 if (PyString_GET_SIZE(self) == 0)
3775 return PyInt_FromLong(0);
3776
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 e = p + PyUnicode_GET_SIZE(self);
3778 for (; p < e; p++) {
3779 if (!Py_UNICODE_ISDECIMAL(*p))
3780 return PyInt_FromLong(0);
3781 }
3782 return PyInt_FromLong(1);
3783}
3784
3785static char isdigit__doc__[] =
3786"S.isdigit() -> int\n\
3787\n\
3788Return 1 if there are only digit characters in S,\n\
37890 otherwise.";
3790
3791static PyObject*
3792unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3793{
3794 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3795 register const Py_UNICODE *e;
3796
3797 if (!PyArg_NoArgs(args))
3798 return NULL;
3799
3800 /* Shortcut for single character strings */
3801 if (PyUnicode_GET_SIZE(self) == 1 &&
3802 Py_UNICODE_ISDIGIT(*p))
3803 return PyInt_FromLong(1);
3804
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003805 /* Special case for empty strings */
3806 if (PyString_GET_SIZE(self) == 0)
3807 return PyInt_FromLong(0);
3808
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 e = p + PyUnicode_GET_SIZE(self);
3810 for (; p < e; p++) {
3811 if (!Py_UNICODE_ISDIGIT(*p))
3812 return PyInt_FromLong(0);
3813 }
3814 return PyInt_FromLong(1);
3815}
3816
3817static char isnumeric__doc__[] =
3818"S.isnumeric() -> int\n\
3819\n\
3820Return 1 if there are only numeric characters in S,\n\
38210 otherwise.";
3822
3823static PyObject*
3824unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3825{
3826 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3827 register const Py_UNICODE *e;
3828
3829 if (!PyArg_NoArgs(args))
3830 return NULL;
3831
3832 /* Shortcut for single character strings */
3833 if (PyUnicode_GET_SIZE(self) == 1 &&
3834 Py_UNICODE_ISNUMERIC(*p))
3835 return PyInt_FromLong(1);
3836
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003837 /* Special case for empty strings */
3838 if (PyString_GET_SIZE(self) == 0)
3839 return PyInt_FromLong(0);
3840
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 e = p + PyUnicode_GET_SIZE(self);
3842 for (; p < e; p++) {
3843 if (!Py_UNICODE_ISNUMERIC(*p))
3844 return PyInt_FromLong(0);
3845 }
3846 return PyInt_FromLong(1);
3847}
3848
3849static char join__doc__[] =
3850"S.join(sequence) -> unicode\n\
3851\n\
3852Return a string which is the concatenation of the strings in the\n\
3853sequence. The separator between elements is S.";
3854
3855static PyObject*
3856unicode_join(PyUnicodeObject *self, PyObject *args)
3857{
3858 PyObject *data;
3859 if (!PyArg_ParseTuple(args, "O:join", &data))
3860 return NULL;
3861
3862 return PyUnicode_Join((PyObject *)self, data);
3863}
3864
3865static int
3866unicode_length(PyUnicodeObject *self)
3867{
3868 return self->length;
3869}
3870
3871static char ljust__doc__[] =
3872"S.ljust(width) -> unicode\n\
3873\n\
3874Return S left justified in a Unicode string of length width. Padding is\n\
3875done using spaces.";
3876
3877static PyObject *
3878unicode_ljust(PyUnicodeObject *self, PyObject *args)
3879{
3880 int width;
3881 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3882 return NULL;
3883
3884 if (self->length >= width) {
3885 Py_INCREF(self);
3886 return (PyObject*) self;
3887 }
3888
3889 return (PyObject*) pad(self, 0, width - self->length, ' ');
3890}
3891
3892static char lower__doc__[] =
3893"S.lower() -> unicode\n\
3894\n\
3895Return a copy of the string S converted to lowercase.";
3896
3897static PyObject*
3898unicode_lower(PyUnicodeObject *self, PyObject *args)
3899{
3900 if (!PyArg_NoArgs(args))
3901 return NULL;
3902 return fixup(self, fixlower);
3903}
3904
3905static char lstrip__doc__[] =
3906"S.lstrip() -> unicode\n\
3907\n\
3908Return a copy of the string S with leading whitespace removed.";
3909
3910static PyObject *
3911unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3912{
3913 if (!PyArg_NoArgs(args))
3914 return NULL;
3915 return strip(self, 1, 0);
3916}
3917
3918static PyObject*
3919unicode_repeat(PyUnicodeObject *str, int len)
3920{
3921 PyUnicodeObject *u;
3922 Py_UNICODE *p;
3923
3924 if (len < 0)
3925 len = 0;
3926
3927 if (len == 1) {
3928 /* no repeat, return original string */
3929 Py_INCREF(str);
3930 return (PyObject*) str;
3931 }
3932
3933 u = _PyUnicode_New(len * str->length);
3934 if (!u)
3935 return NULL;
3936
3937 p = u->str;
3938
3939 while (len-- > 0) {
3940 Py_UNICODE_COPY(p, str->str, str->length);
3941 p += str->length;
3942 }
3943
3944 return (PyObject*) u;
3945}
3946
3947PyObject *PyUnicode_Replace(PyObject *obj,
3948 PyObject *subobj,
3949 PyObject *replobj,
3950 int maxcount)
3951{
3952 PyObject *self;
3953 PyObject *str1;
3954 PyObject *str2;
3955 PyObject *result;
3956
3957 self = PyUnicode_FromObject(obj);
3958 if (self == NULL)
3959 return NULL;
3960 str1 = PyUnicode_FromObject(subobj);
3961 if (str1 == NULL) {
3962 Py_DECREF(self);
3963 return NULL;
3964 }
3965 str2 = PyUnicode_FromObject(replobj);
3966 if (str2 == NULL) {
3967 Py_DECREF(self);
3968 Py_DECREF(str1);
3969 return NULL;
3970 }
3971 result = replace((PyUnicodeObject *)self,
3972 (PyUnicodeObject *)str1,
3973 (PyUnicodeObject *)str2,
3974 maxcount);
3975 Py_DECREF(self);
3976 Py_DECREF(str1);
3977 Py_DECREF(str2);
3978 return result;
3979}
3980
3981static char replace__doc__[] =
3982"S.replace (old, new[, maxsplit]) -> unicode\n\
3983\n\
3984Return a copy of S with all occurrences of substring\n\
3985old replaced by new. If the optional argument maxsplit is\n\
3986given, only the first maxsplit occurrences are replaced.";
3987
3988static PyObject*
3989unicode_replace(PyUnicodeObject *self, PyObject *args)
3990{
3991 PyUnicodeObject *str1;
3992 PyUnicodeObject *str2;
3993 int maxcount = -1;
3994 PyObject *result;
3995
3996 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3997 return NULL;
3998 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3999 if (str1 == NULL)
4000 return NULL;
4001 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4002 if (str2 == NULL)
4003 return NULL;
4004
4005 result = replace(self, str1, str2, maxcount);
4006
4007 Py_DECREF(str1);
4008 Py_DECREF(str2);
4009 return result;
4010}
4011
4012static
4013PyObject *unicode_repr(PyObject *unicode)
4014{
4015 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4016 PyUnicode_GET_SIZE(unicode),
4017 1);
4018}
4019
4020static char rfind__doc__[] =
4021"S.rfind(sub [,start [,end]]) -> int\n\
4022\n\
4023Return the highest index in S where substring sub is found,\n\
4024such that sub is contained within s[start,end]. Optional\n\
4025arguments start and end are interpreted as in slice notation.\n\
4026\n\
4027Return -1 on failure.";
4028
4029static PyObject *
4030unicode_rfind(PyUnicodeObject *self, PyObject *args)
4031{
4032 PyUnicodeObject *substring;
4033 int start = 0;
4034 int end = INT_MAX;
4035 PyObject *result;
4036
Guido van Rossumb8872e62000-05-09 14:14:27 +00004037 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4038 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 return NULL;
4040 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4041 (PyObject *)substring);
4042 if (substring == NULL)
4043 return NULL;
4044
4045 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4046
4047 Py_DECREF(substring);
4048 return result;
4049}
4050
4051static char rindex__doc__[] =
4052"S.rindex(sub [,start [,end]]) -> int\n\
4053\n\
4054Like S.rfind() but raise ValueError when the substring is not found.";
4055
4056static PyObject *
4057unicode_rindex(PyUnicodeObject *self, PyObject *args)
4058{
4059 int result;
4060 PyUnicodeObject *substring;
4061 int start = 0;
4062 int end = INT_MAX;
4063
Guido van Rossumb8872e62000-05-09 14:14:27 +00004064 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4065 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 return NULL;
4067 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4068 (PyObject *)substring);
4069 if (substring == NULL)
4070 return NULL;
4071
4072 result = findstring(self, substring, start, end, -1);
4073
4074 Py_DECREF(substring);
4075 if (result < 0) {
4076 PyErr_SetString(PyExc_ValueError, "substring not found");
4077 return NULL;
4078 }
4079 return PyInt_FromLong(result);
4080}
4081
4082static char rjust__doc__[] =
4083"S.rjust(width) -> unicode\n\
4084\n\
4085Return S right justified in a Unicode string of length width. Padding is\n\
4086done using spaces.";
4087
4088static PyObject *
4089unicode_rjust(PyUnicodeObject *self, PyObject *args)
4090{
4091 int width;
4092 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4093 return NULL;
4094
4095 if (self->length >= width) {
4096 Py_INCREF(self);
4097 return (PyObject*) self;
4098 }
4099
4100 return (PyObject*) pad(self, width - self->length, 0, ' ');
4101}
4102
4103static char rstrip__doc__[] =
4104"S.rstrip() -> unicode\n\
4105\n\
4106Return a copy of the string S with trailing whitespace removed.";
4107
4108static PyObject *
4109unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4110{
4111 if (!PyArg_NoArgs(args))
4112 return NULL;
4113 return strip(self, 0, 1);
4114}
4115
4116static PyObject*
4117unicode_slice(PyUnicodeObject *self, int start, int end)
4118{
4119 /* standard clamping */
4120 if (start < 0)
4121 start = 0;
4122 if (end < 0)
4123 end = 0;
4124 if (end > self->length)
4125 end = self->length;
4126 if (start == 0 && end == self->length) {
4127 /* full slice, return original string */
4128 Py_INCREF(self);
4129 return (PyObject*) self;
4130 }
4131 if (start > end)
4132 start = end;
4133 /* copy slice */
4134 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4135 end - start);
4136}
4137
4138PyObject *PyUnicode_Split(PyObject *s,
4139 PyObject *sep,
4140 int maxsplit)
4141{
4142 PyObject *result;
4143
4144 s = PyUnicode_FromObject(s);
4145 if (s == NULL)
4146 return NULL;
4147 if (sep != NULL) {
4148 sep = PyUnicode_FromObject(sep);
4149 if (sep == NULL) {
4150 Py_DECREF(s);
4151 return NULL;
4152 }
4153 }
4154
4155 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4156
4157 Py_DECREF(s);
4158 Py_XDECREF(sep);
4159 return result;
4160}
4161
4162static char split__doc__[] =
4163"S.split([sep [,maxsplit]]) -> list of strings\n\
4164\n\
4165Return a list of the words in S, using sep as the\n\
4166delimiter string. If maxsplit is given, at most maxsplit\n\
4167splits are done. If sep is not specified, any whitespace string\n\
4168is a separator.";
4169
4170static PyObject*
4171unicode_split(PyUnicodeObject *self, PyObject *args)
4172{
4173 PyObject *substring = Py_None;
4174 int maxcount = -1;
4175
4176 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4177 return NULL;
4178
4179 if (substring == Py_None)
4180 return split(self, NULL, maxcount);
4181 else if (PyUnicode_Check(substring))
4182 return split(self, (PyUnicodeObject *)substring, maxcount);
4183 else
4184 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4185}
4186
4187static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004188"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189\n\
4190Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004191Line breaks are not included in the resulting list unless keepends\n\
4192is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193
4194static PyObject*
4195unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4196{
Guido van Rossum86662912000-04-11 15:38:46 +00004197 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198
Guido van Rossum86662912000-04-11 15:38:46 +00004199 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200 return NULL;
4201
Guido van Rossum86662912000-04-11 15:38:46 +00004202 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203}
4204
4205static
4206PyObject *unicode_str(PyUnicodeObject *self)
4207{
Fred Drakee4315f52000-05-09 19:53:39 +00004208 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209}
4210
4211static char strip__doc__[] =
4212"S.strip() -> unicode\n\
4213\n\
4214Return a copy of S with leading and trailing whitespace removed.";
4215
4216static PyObject *
4217unicode_strip(PyUnicodeObject *self, PyObject *args)
4218{
4219 if (!PyArg_NoArgs(args))
4220 return NULL;
4221 return strip(self, 1, 1);
4222}
4223
4224static char swapcase__doc__[] =
4225"S.swapcase() -> unicode\n\
4226\n\
4227Return a copy of S with uppercase characters converted to lowercase\n\
4228and vice versa.";
4229
4230static PyObject*
4231unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4232{
4233 if (!PyArg_NoArgs(args))
4234 return NULL;
4235 return fixup(self, fixswapcase);
4236}
4237
4238static char translate__doc__[] =
4239"S.translate(table) -> unicode\n\
4240\n\
4241Return a copy of the string S, where all characters have been mapped\n\
4242through the given translation table, which must be a mapping of\n\
4243Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4244are left untouched. Characters mapped to None are deleted.";
4245
4246static PyObject*
4247unicode_translate(PyUnicodeObject *self, PyObject *args)
4248{
4249 PyObject *table;
4250
4251 if (!PyArg_ParseTuple(args, "O:translate", &table))
4252 return NULL;
4253 return PyUnicode_TranslateCharmap(self->str,
4254 self->length,
4255 table,
4256 "ignore");
4257}
4258
4259static char upper__doc__[] =
4260"S.upper() -> unicode\n\
4261\n\
4262Return a copy of S converted to uppercase.";
4263
4264static PyObject*
4265unicode_upper(PyUnicodeObject *self, PyObject *args)
4266{
4267 if (!PyArg_NoArgs(args))
4268 return NULL;
4269 return fixup(self, fixupper);
4270}
4271
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill(width) (currently compiled out).

   If S is already at least width characters long, a new reference to
   S itself is returned.  Otherwise S is left-padded with '0' via
   pad(); if the original string started with '+' or '-', the sign is
   moved in front of the padding.  Returns NULL if argument parsing or
   pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4307
#if 0
/* Debugging helper (compiled out): takes no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4317
4318static char startswith__doc__[] =
4319"S.startswith(prefix[, start[, end]]) -> int\n\
4320\n\
4321Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4322optional start, test S beginning at that position. With optional end, stop\n\
4323comparing S at that position.";
4324
4325static PyObject *
4326unicode_startswith(PyUnicodeObject *self,
4327 PyObject *args)
4328{
4329 PyUnicodeObject *substring;
4330 int start = 0;
4331 int end = INT_MAX;
4332 PyObject *result;
4333
Guido van Rossumb8872e62000-05-09 14:14:27 +00004334 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4335 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 return NULL;
4337 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4338 (PyObject *)substring);
4339 if (substring == NULL)
4340 return NULL;
4341
4342 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4343
4344 Py_DECREF(substring);
4345 return result;
4346}
4347
4348
4349static char endswith__doc__[] =
4350"S.endswith(suffix[, start[, end]]) -> int\n\
4351\n\
4352Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4353optional start, test S beginning at that position. With optional end, stop\n\
4354comparing S at that position.";
4355
4356static PyObject *
4357unicode_endswith(PyUnicodeObject *self,
4358 PyObject *args)
4359{
4360 PyUnicodeObject *substring;
4361 int start = 0;
4362 int end = INT_MAX;
4363 PyObject *result;
4364
Guido van Rossumb8872e62000-05-09 14:14:27 +00004365 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4366 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367 return NULL;
4368 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4369 (PyObject *)substring);
4370 if (substring == NULL)
4371 return NULL;
4372
4373 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4374
4375 Py_DECREF(substring);
4376 return result;
4377}
4378
4379
4380static PyMethodDef unicode_methods[] = {
4381
4382 /* Order is according to common usage: often used methods should
4383 appear first, since lookup is done sequentially. */
4384
4385 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4386 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4387 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4388 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4389 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4390 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4391 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4392 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4393 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4394 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4395 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4396 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4397 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4398 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4399/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4400 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4401 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4402 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4403 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4404 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4405 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4406 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4407 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4408 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4409 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4410 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4411 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4412 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4413 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4414 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4415 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4416 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4417 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004418 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4419 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420#if 0
4421 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4422 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4423#endif
4424
4425#if 0
4426 /* This one is just used for debugging the implementation. */
4427 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4428#endif
4429
4430 {NULL, NULL}
4431};
4432
4433static PyObject *
4434unicode_getattr(PyUnicodeObject *self, char *name)
4435{
4436 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4437}
4438
4439static PySequenceMethods unicode_as_sequence = {
4440 (inquiry) unicode_length, /* sq_length */
4441 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4442 (intargfunc) unicode_repeat, /* sq_repeat */
4443 (intargfunc) unicode_getitem, /* sq_item */
4444 (intintargfunc) unicode_slice, /* sq_slice */
4445 0, /* sq_ass_item */
4446 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004447 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448};
4449
4450static int
4451unicode_buffer_getreadbuf(PyUnicodeObject *self,
4452 int index,
4453 const void **ptr)
4454{
4455 if (index != 0) {
4456 PyErr_SetString(PyExc_SystemError,
4457 "accessing non-existent unicode segment");
4458 return -1;
4459 }
4460 *ptr = (void *) self->str;
4461 return PyUnicode_GET_DATA_SIZE(self);
4462}
4463
4464static int
4465unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4466 const void **ptr)
4467{
4468 PyErr_SetString(PyExc_TypeError,
4469 "cannot use unicode as modifyable buffer");
4470 return -1;
4471}
4472
4473static int
4474unicode_buffer_getsegcount(PyUnicodeObject *self,
4475 int *lenp)
4476{
4477 if (lenp)
4478 *lenp = PyUnicode_GET_DATA_SIZE(self);
4479 return 1;
4480}
4481
4482static int
4483unicode_buffer_getcharbuf(PyUnicodeObject *self,
4484 int index,
4485 const void **ptr)
4486{
4487 PyObject *str;
4488
4489 if (index != 0) {
4490 PyErr_SetString(PyExc_SystemError,
4491 "accessing non-existent unicode segment");
4492 return -1;
4493 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004494 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 if (str == NULL)
4496 return -1;
4497 *ptr = (void *) PyString_AS_STRING(str);
4498 return PyString_GET_SIZE(str);
4499}
4500
4501/* Helpers for PyUnicode_Format() */
4502
4503static PyObject *
4504getnextarg(args, arglen, p_argidx)
4505 PyObject *args;
4506int arglen;
4507int *p_argidx;
4508{
4509 int argidx = *p_argidx;
4510 if (argidx < arglen) {
4511 (*p_argidx)++;
4512 if (arglen < 0)
4513 return args;
4514 else
4515 return PyTuple_GetItem(args, argidx);
4516 }
4517 PyErr_SetString(PyExc_TypeError,
4518 "not enough arguments for format string");
4519 return NULL;
4520}
4521
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' : left-justify within the field */
#define F_SIGN  0x02    /* '+' : always emit a sign */
#define F_BLANK 0x04    /* ' ' : blank in place of a plus sign */
#define F_ALT   0x08    /* '#' : alternate form */
#define F_ZERO  0x10    /* '0' : pad with zeros */
4527
4528static
4529#ifdef HAVE_STDARG_PROTOTYPES
4530int usprintf(register Py_UNICODE *buffer, char *format, ...)
4531#else
4532int usprintf(va_alist) va_dcl
4533#endif
4534{
4535 register int i;
4536 int len;
4537 va_list va;
4538 char *charbuffer;
4539#ifdef HAVE_STDARG_PROTOTYPES
4540 va_start(va, format);
4541#else
4542 Py_UNICODE *args;
4543 char *format;
4544
4545 va_start(va);
4546 buffer = va_arg(va, Py_UNICODE *);
4547 format = va_arg(va, char *);
4548#endif
4549
4550 /* First, format the string as char array, then expand to Py_UNICODE
4551 array. */
4552 charbuffer = (char *)buffer;
4553 len = vsprintf(charbuffer, format, va);
4554 for (i = len - 1; i >= 0; i--)
4555 buffer[i] = (Py_UNICODE) charbuffer[i];
4556
4557 va_end(va);
4558 return len;
4559}
4560
4561static int
4562formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004563 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564 int flags,
4565 int prec,
4566 int type,
4567 PyObject *v)
4568{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004569 /* fmt = '%#.' + `prec` + `type`
4570 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 char fmt[20];
4572 double x;
4573
4574 x = PyFloat_AsDouble(v);
4575 if (x == -1.0 && PyErr_Occurred())
4576 return -1;
4577 if (prec < 0)
4578 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4580 type = 'g';
4581 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004582 /* worst case length calc to ensure no buffer overrun:
4583 fmt = %#.<prec>g
4584 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4585 for any double rep.)
4586 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4587 If prec=0 the effective precision is 1 (the leading digit is
4588 always given), therefore increase by one to 10+prec. */
4589 if (buflen <= (size_t)10 + (size_t)prec) {
4590 PyErr_SetString(PyExc_OverflowError,
4591 "formatted float is too long (precision too long?)");
4592 return -1;
4593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 return usprintf(buf, fmt, x);
4595}
4596
4597static int
4598formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004599 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 int flags,
4601 int prec,
4602 int type,
4603 PyObject *v)
4604{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004605 /* fmt = '%#.' + `prec` + 'l' + `type`
4606 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 char fmt[20];
4608 long x;
4609
4610 x = PyInt_AsLong(v);
4611 if (x == -1 && PyErr_Occurred())
4612 return -1;
4613 if (prec < 0)
4614 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004615 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4616 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4617 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4618 PyErr_SetString(PyExc_OverflowError,
4619 "formatted integer is too long (precision too long?)");
4620 return -1;
4621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4623 return usprintf(buf, fmt, x);
4624}
4625
4626static int
4627formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004628 size_t buflen,
4629 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004631 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004632 if (PyUnicode_Check(v)) {
4633 if (PyUnicode_GET_SIZE(v) != 1)
4634 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004638 else if (PyString_Check(v)) {
4639 if (PyString_GET_SIZE(v) != 1)
4640 goto onError;
4641 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643
4644 else {
4645 /* Integer input truncated to a character */
4646 long x;
4647 x = PyInt_AsLong(v);
4648 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004649 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 buf[0] = (char) x;
4651 }
4652 buf[1] = '\0';
4653 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004654
4655 onError:
4656 PyErr_SetString(PyExc_TypeError,
4657 "%c requires int or char");
4658 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659}
4660
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004661/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4662
4663 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4664 chars are formatted. XXX This is a magic number. Each formatting
4665 routine does bounds checking to ensure no overflow, but a better
4666 solution may be to malloc a buffer of appropriate size for each
4667 format. For now, the current solution is sufficient.
4668*/
4669#define FORMATBUFLEN (size_t)120
4670
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671PyObject *PyUnicode_Format(PyObject *format,
4672 PyObject *args)
4673{
4674 Py_UNICODE *fmt, *res;
4675 int fmtcnt, rescnt, reslen, arglen, argidx;
4676 int args_owned = 0;
4677 PyUnicodeObject *result = NULL;
4678 PyObject *dict = NULL;
4679 PyObject *uformat;
4680
4681 if (format == NULL || args == NULL) {
4682 PyErr_BadInternalCall();
4683 return NULL;
4684 }
4685 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004686 if (uformat == NULL)
4687 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 fmt = PyUnicode_AS_UNICODE(uformat);
4689 fmtcnt = PyUnicode_GET_SIZE(uformat);
4690
4691 reslen = rescnt = fmtcnt + 100;
4692 result = _PyUnicode_New(reslen);
4693 if (result == NULL)
4694 goto onError;
4695 res = PyUnicode_AS_UNICODE(result);
4696
4697 if (PyTuple_Check(args)) {
4698 arglen = PyTuple_Size(args);
4699 argidx = 0;
4700 }
4701 else {
4702 arglen = -1;
4703 argidx = -2;
4704 }
4705 if (args->ob_type->tp_as_mapping)
4706 dict = args;
4707
4708 while (--fmtcnt >= 0) {
4709 if (*fmt != '%') {
4710 if (--rescnt < 0) {
4711 rescnt = fmtcnt + 100;
4712 reslen += rescnt;
4713 if (_PyUnicode_Resize(result, reslen) < 0)
4714 return NULL;
4715 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4716 --rescnt;
4717 }
4718 *res++ = *fmt++;
4719 }
4720 else {
4721 /* Got a format specifier */
4722 int flags = 0;
4723 int width = -1;
4724 int prec = -1;
4725 int size = 0;
4726 Py_UNICODE c = '\0';
4727 Py_UNICODE fill;
4728 PyObject *v = NULL;
4729 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004730 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 Py_UNICODE sign;
4732 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004733 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734
4735 fmt++;
4736 if (*fmt == '(') {
4737 Py_UNICODE *keystart;
4738 int keylen;
4739 PyObject *key;
4740 int pcount = 1;
4741
4742 if (dict == NULL) {
4743 PyErr_SetString(PyExc_TypeError,
4744 "format requires a mapping");
4745 goto onError;
4746 }
4747 ++fmt;
4748 --fmtcnt;
4749 keystart = fmt;
4750 /* Skip over balanced parentheses */
4751 while (pcount > 0 && --fmtcnt >= 0) {
4752 if (*fmt == ')')
4753 --pcount;
4754 else if (*fmt == '(')
4755 ++pcount;
4756 fmt++;
4757 }
4758 keylen = fmt - keystart - 1;
4759 if (fmtcnt < 0 || pcount > 0) {
4760 PyErr_SetString(PyExc_ValueError,
4761 "incomplete format key");
4762 goto onError;
4763 }
Fred Drakee4315f52000-05-09 19:53:39 +00004764 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 then looked up since Python uses strings to hold
4766 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004767 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 key = PyUnicode_EncodeUTF8(keystart,
4769 keylen,
4770 NULL);
4771 if (key == NULL)
4772 goto onError;
4773 if (args_owned) {
4774 Py_DECREF(args);
4775 args_owned = 0;
4776 }
4777 args = PyObject_GetItem(dict, key);
4778 Py_DECREF(key);
4779 if (args == NULL) {
4780 goto onError;
4781 }
4782 args_owned = 1;
4783 arglen = -1;
4784 argidx = -2;
4785 }
4786 while (--fmtcnt >= 0) {
4787 switch (c = *fmt++) {
4788 case '-': flags |= F_LJUST; continue;
4789 case '+': flags |= F_SIGN; continue;
4790 case ' ': flags |= F_BLANK; continue;
4791 case '#': flags |= F_ALT; continue;
4792 case '0': flags |= F_ZERO; continue;
4793 }
4794 break;
4795 }
4796 if (c == '*') {
4797 v = getnextarg(args, arglen, &argidx);
4798 if (v == NULL)
4799 goto onError;
4800 if (!PyInt_Check(v)) {
4801 PyErr_SetString(PyExc_TypeError,
4802 "* wants int");
4803 goto onError;
4804 }
4805 width = PyInt_AsLong(v);
4806 if (width < 0) {
4807 flags |= F_LJUST;
4808 width = -width;
4809 }
4810 if (--fmtcnt >= 0)
4811 c = *fmt++;
4812 }
4813 else if (c >= '0' && c <= '9') {
4814 width = c - '0';
4815 while (--fmtcnt >= 0) {
4816 c = *fmt++;
4817 if (c < '0' || c > '9')
4818 break;
4819 if ((width*10) / 10 != width) {
4820 PyErr_SetString(PyExc_ValueError,
4821 "width too big");
4822 goto onError;
4823 }
4824 width = width*10 + (c - '0');
4825 }
4826 }
4827 if (c == '.') {
4828 prec = 0;
4829 if (--fmtcnt >= 0)
4830 c = *fmt++;
4831 if (c == '*') {
4832 v = getnextarg(args, arglen, &argidx);
4833 if (v == NULL)
4834 goto onError;
4835 if (!PyInt_Check(v)) {
4836 PyErr_SetString(PyExc_TypeError,
4837 "* wants int");
4838 goto onError;
4839 }
4840 prec = PyInt_AsLong(v);
4841 if (prec < 0)
4842 prec = 0;
4843 if (--fmtcnt >= 0)
4844 c = *fmt++;
4845 }
4846 else if (c >= '0' && c <= '9') {
4847 prec = c - '0';
4848 while (--fmtcnt >= 0) {
4849 c = Py_CHARMASK(*fmt++);
4850 if (c < '0' || c > '9')
4851 break;
4852 if ((prec*10) / 10 != prec) {
4853 PyErr_SetString(PyExc_ValueError,
4854 "prec too big");
4855 goto onError;
4856 }
4857 prec = prec*10 + (c - '0');
4858 }
4859 }
4860 } /* prec */
4861 if (fmtcnt >= 0) {
4862 if (c == 'h' || c == 'l' || c == 'L') {
4863 size = c;
4864 if (--fmtcnt >= 0)
4865 c = *fmt++;
4866 }
4867 }
4868 if (fmtcnt < 0) {
4869 PyErr_SetString(PyExc_ValueError,
4870 "incomplete format");
4871 goto onError;
4872 }
4873 if (c != '%') {
4874 v = getnextarg(args, arglen, &argidx);
4875 if (v == NULL)
4876 goto onError;
4877 }
4878 sign = 0;
4879 fill = ' ';
4880 switch (c) {
4881
4882 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004883 pbuf = formatbuf;
4884 /* presume that buffer length is at least 1 */
4885 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 len = 1;
4887 break;
4888
4889 case 's':
4890 case 'r':
4891 if (PyUnicode_Check(v) && c == 's') {
4892 temp = v;
4893 Py_INCREF(temp);
4894 }
4895 else {
4896 PyObject *unicode;
4897 if (c == 's')
4898 temp = PyObject_Str(v);
4899 else
4900 temp = PyObject_Repr(v);
4901 if (temp == NULL)
4902 goto onError;
4903 if (!PyString_Check(temp)) {
4904 /* XXX Note: this should never happen, since
4905 PyObject_Repr() and PyObject_Str() assure
4906 this */
4907 Py_DECREF(temp);
4908 PyErr_SetString(PyExc_TypeError,
4909 "%s argument has non-string str()");
4910 goto onError;
4911 }
Fred Drakee4315f52000-05-09 19:53:39 +00004912 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004914 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 "strict");
4916 Py_DECREF(temp);
4917 temp = unicode;
4918 if (temp == NULL)
4919 goto onError;
4920 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004921 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 len = PyUnicode_GET_SIZE(temp);
4923 if (prec >= 0 && len > prec)
4924 len = prec;
4925 break;
4926
4927 case 'i':
4928 case 'd':
4929 case 'u':
4930 case 'o':
4931 case 'x':
4932 case 'X':
4933 if (c == 'i')
4934 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004935 pbuf = formatbuf;
4936 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4937 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 if (len < 0)
4939 goto onError;
4940 sign = (c == 'd');
4941 if (flags & F_ZERO) {
4942 fill = '0';
4943 if ((flags&F_ALT) &&
4944 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004945 pbuf[0] == '0' && pbuf[1] == c) {
4946 *res++ = *pbuf++;
4947 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 rescnt -= 2;
4949 len -= 2;
4950 width -= 2;
4951 if (width < 0)
4952 width = 0;
4953 }
4954 }
4955 break;
4956
4957 case 'e':
4958 case 'E':
4959 case 'f':
4960 case 'g':
4961 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004962 pbuf = formatbuf;
4963 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4964 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 if (len < 0)
4966 goto onError;
4967 sign = 1;
4968 if (flags&F_ZERO)
4969 fill = '0';
4970 break;
4971
4972 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004973 pbuf = formatbuf;
4974 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 if (len < 0)
4976 goto onError;
4977 break;
4978
4979 default:
4980 PyErr_Format(PyExc_ValueError,
4981 "unsupported format character '%c' (0x%x)",
4982 c, c);
4983 goto onError;
4984 }
4985 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004986 if (*pbuf == '-' || *pbuf == '+') {
4987 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 len--;
4989 }
4990 else if (flags & F_SIGN)
4991 sign = '+';
4992 else if (flags & F_BLANK)
4993 sign = ' ';
4994 else
4995 sign = 0;
4996 }
4997 if (width < len)
4998 width = len;
4999 if (rescnt < width + (sign != 0)) {
5000 reslen -= rescnt;
5001 rescnt = width + fmtcnt + 100;
5002 reslen += rescnt;
5003 if (_PyUnicode_Resize(result, reslen) < 0)
5004 return NULL;
5005 res = PyUnicode_AS_UNICODE(result)
5006 + reslen - rescnt;
5007 }
5008 if (sign) {
5009 if (fill != ' ')
5010 *res++ = sign;
5011 rescnt--;
5012 if (width > len)
5013 width--;
5014 }
5015 if (width > len && !(flags & F_LJUST)) {
5016 do {
5017 --rescnt;
5018 *res++ = fill;
5019 } while (--width > len);
5020 }
5021 if (sign && fill == ' ')
5022 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005023 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024 res += len;
5025 rescnt -= len;
5026 while (--width >= len) {
5027 --rescnt;
5028 *res++ = ' ';
5029 }
5030 if (dict && (argidx < arglen) && c != '%') {
5031 PyErr_SetString(PyExc_TypeError,
5032 "not all arguments converted");
5033 goto onError;
5034 }
5035 Py_XDECREF(temp);
5036 } /* '%' */
5037 } /* until end */
5038 if (argidx < arglen && !dict) {
5039 PyErr_SetString(PyExc_TypeError,
5040 "not all arguments converted");
5041 goto onError;
5042 }
5043
5044 if (args_owned) {
5045 Py_DECREF(args);
5046 }
5047 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005048 if (_PyUnicode_Resize(result, reslen - rescnt))
5049 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 return (PyObject *)result;
5051
5052 onError:
5053 Py_XDECREF(result);
5054 Py_DECREF(uformat);
5055 if (args_owned) {
5056 Py_DECREF(args);
5057 }
5058 return NULL;
5059}
5060
5061static PyBufferProcs unicode_as_buffer = {
5062 (getreadbufferproc) unicode_buffer_getreadbuf,
5063 (getwritebufferproc) unicode_buffer_getwritebuf,
5064 (getsegcountproc) unicode_buffer_getsegcount,
5065 (getcharbufferproc) unicode_buffer_getcharbuf,
5066};
5067
5068PyTypeObject PyUnicode_Type = {
5069 PyObject_HEAD_INIT(&PyType_Type)
5070 0, /* ob_size */
5071 "unicode", /* tp_name */
5072 sizeof(PyUnicodeObject), /* tp_size */
5073 0, /* tp_itemsize */
5074 /* Slots */
5075 (destructor)_PyUnicode_Free, /* tp_dealloc */
5076 0, /* tp_print */
5077 (getattrfunc)unicode_getattr, /* tp_getattr */
5078 0, /* tp_setattr */
5079 (cmpfunc) unicode_compare, /* tp_compare */
5080 (reprfunc) unicode_repr, /* tp_repr */
5081 0, /* tp_as_number */
5082 &unicode_as_sequence, /* tp_as_sequence */
5083 0, /* tp_as_mapping */
5084 (hashfunc) unicode_hash, /* tp_hash*/
5085 0, /* tp_call*/
5086 (reprfunc) unicode_str, /* tp_str */
5087 (getattrofunc) NULL, /* tp_getattro */
5088 (setattrofunc) NULL, /* tp_setattro */
5089 &unicode_as_buffer, /* tp_as_buffer */
5090 Py_TPFLAGS_DEFAULT, /* tp_flags */
5091};
5092
5093/* Initialize the Unicode implementation */
5094
5095void _PyUnicode_Init()
5096{
5097 /* Doublecheck the configuration... */
5098 if (sizeof(Py_UNICODE) != 2)
5099 Py_FatalError("Unicode configuration error: "
5100 "sizeof(Py_UNICODE) != 2 bytes");
5101
Fred Drakee4315f52000-05-09 19:53:39 +00005102 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005103 unicode_freelist = NULL;
5104 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005106 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107}
5108
5109/* Finalize the Unicode implementation */
5110
5111void
5112_PyUnicode_Fini()
5113{
5114 PyUnicodeObject *u = unicode_freelist;
5115
5116 while (u != NULL) {
5117 PyUnicodeObject *v = u;
5118 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005119 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005120 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005121 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005122 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005124 unicode_freelist = NULL;
5125 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005127 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128}