blob: 02d1b0d5d8ab535dbbe3609c614a2253d825189e [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000069#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71#if defined(HAVE_LIMITS_H)
72#include <limits.h>
73#else
74#define INT_MAX 2147483647
75#endif
76
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000077#ifdef MS_WIN32
78#include <windows.h>
79#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000080
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   An object put on the free list keeps its allocated Unicode buffer
   as long as its size is below this limit; bigger buffers are
   released.  This cuts down malloc() overhead for small Unicode
   objects.

   In the worst case MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage are retained.

   A limit of 0 effectively switches the feature off.

   Note: this feature is experimental !  If Unicode objects cause
   core dumps, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
111
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000112/* --- Globals ------------------------------------------------------------
113
114 The globals are initialized by the _PyUnicode_Init() API and should
115 not be used before calling that API.
116
117*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
119/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000120static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000121
122/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000123static PyUnicodeObject *unicode_freelist;
124static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125
Fred Drakee4315f52000-05-09 19:53:39 +0000126/* Default encoding to use and assume when NULL is passed as encoding
127 parameter; it is initialized by _PyUnicode_Init().
128
129 Always use the PyUnicode_SetDefaultEncoding() and
130 PyUnicode_GetDefaultEncoding() APIs to access this global.
131
132*/
133
134static char unicode_default_encoding[100];
135
Guido van Rossumd57fd912000-03-10 22:53:23 +0000136/* --- Unicode Object ----------------------------------------------------- */
137
138static
139int _PyUnicode_Resize(register PyUnicodeObject *unicode,
140 int length)
141{
142 void *oldstr;
143
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000144 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000146 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147
148 /* Resizing unicode_empty is not allowed. */
149 if (unicode == unicode_empty) {
150 PyErr_SetString(PyExc_SystemError,
151 "can't resize empty unicode object");
152 return -1;
153 }
154
155 /* We allocate one more byte to make sure the string is
156 Ux0000 terminated -- XXX is this needed ? */
157 oldstr = unicode->str;
158 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
159 if (!unicode->str) {
160 unicode->str = oldstr;
161 PyErr_NoMemory();
162 return -1;
163 }
164 unicode->str[length] = 0;
165 unicode->length = length;
166
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000167 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 /* Reset the object caches */
169 if (unicode->utf8str) {
170 Py_DECREF(unicode->utf8str);
171 unicode->utf8str = NULL;
172 }
173 unicode->hash = -1;
174
175 return 0;
176}
177
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178int PyUnicode_Resize(PyObject **unicode,
179 int length)
180{
181 PyUnicodeObject *v;
182
183 if (unicode == NULL) {
184 PyErr_BadInternalCall();
185 return -1;
186 }
187 v = (PyUnicodeObject *)*unicode;
188 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
189 PyErr_BadInternalCall();
190 return -1;
191 }
192 return _PyUnicode_Resize(v, length);
193}
194
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
202
203static
204PyUnicodeObject *_PyUnicode_New(int length)
205{
206 register PyUnicodeObject *unicode;
207
208 /* Optimization for empty strings */
209 if (length == 0 && unicode_empty != NULL) {
210 Py_INCREF(unicode_empty);
211 return unicode_empty;
212 }
213
214 /* Unicode freelist & memory allocation */
215 if (unicode_freelist) {
216 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000217 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 /* Keep-Alive optimization: we only upsize the buffer,
221 never downsize it. */
222 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000224 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 }
227 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000228 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000230 }
231 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
234 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
235 if (unicode == NULL)
236 return NULL;
237 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
238 }
239
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000240 if (!unicode->str) {
241 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 unicode->str[length] = 0;
245 unicode->length = length;
246 unicode->hash = -1;
247 unicode->utf8str = NULL;
248 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000249
250 onError:
251 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254}
255
256static
257void _PyUnicode_Free(register PyUnicodeObject *unicode)
258{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 /* Keep-Alive optimization */
261 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000262 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = NULL;
264 unicode->length = 0;
265 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 if (unicode->utf8str) {
267 Py_DECREF(unicode->utf8str);
268 unicode->utf8str = NULL;
269 }
270 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 *(PyUnicodeObject **)unicode = unicode_freelist;
272 unicode_freelist = unicode;
273 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000276 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000277 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000278 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280}
281
282PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
283 int size)
284{
285 PyUnicodeObject *unicode;
286
287 unicode = _PyUnicode_New(size);
288 if (!unicode)
289 return NULL;
290
291 /* Copy the Unicode data into the new object */
292 if (u != NULL)
293 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
294
295 return (PyObject *)unicode;
296}
297
298#ifdef HAVE_WCHAR_H
299
300PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
301 int size)
302{
303 PyUnicodeObject *unicode;
304
305 if (w == NULL) {
306 PyErr_BadInternalCall();
307 return NULL;
308 }
309
310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the wchar_t data into the new object */
315#ifdef HAVE_USABLE_WCHAR_T
316 memcpy(unicode->str, w, size * sizeof(wchar_t));
317#else
318 {
319 register Py_UNICODE *u;
320 register int i;
321 u = PyUnicode_AS_UNICODE(unicode);
322 for (i = size; i >= 0; i--)
323 *u++ = *w++;
324 }
325#endif
326
327 return (PyObject *)unicode;
328}
329
330int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
331 register wchar_t *w,
332 int size)
333{
334 if (unicode == NULL) {
335 PyErr_BadInternalCall();
336 return -1;
337 }
338 if (size > PyUnicode_GET_SIZE(unicode))
339 size = PyUnicode_GET_SIZE(unicode);
340#ifdef HAVE_USABLE_WCHAR_T
341 memcpy(w, unicode->str, size * sizeof(wchar_t));
342#else
343 {
344 register Py_UNICODE *u;
345 register int i;
346 u = PyUnicode_AS_UNICODE(unicode);
347 for (i = size; i >= 0; i--)
348 *w++ = *u++;
349 }
350#endif
351
352 return size;
353}
354
355#endif
356
357PyObject *PyUnicode_FromObject(register PyObject *obj)
358{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000359 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
360}
361
362PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
363 const char *encoding,
364 const char *errors)
365{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 const char *s;
367 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368 int owned = 0;
369 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370
371 if (obj == NULL) {
372 PyErr_BadInternalCall();
373 return NULL;
374 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000375
376 /* Coerce object */
377 if (PyInstance_Check(obj)) {
378 PyObject *func;
379 func = PyObject_GetAttrString(obj, "__str__");
380 if (func == NULL) {
381 PyErr_SetString(PyExc_TypeError,
382 "coercing to Unicode: instance doesn't define __str__");
383 return NULL;
384 }
385 obj = PyEval_CallObject(func, NULL);
386 Py_DECREF(func);
387 if (obj == NULL)
388 return NULL;
389 owned = 1;
390 }
391 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 v = obj;
394 if (encoding) {
395 PyErr_SetString(PyExc_TypeError,
396 "decoding Unicode is not supported");
397 return NULL;
398 }
399 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 }
401 else if (PyString_Check(obj)) {
402 s = PyString_AS_STRING(obj);
403 len = PyString_GET_SIZE(obj);
404 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000405 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
406 /* Overwrite the error message with something more useful in
407 case of a TypeError. */
408 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000409 PyErr_Format(PyExc_TypeError,
410 "coercing to Unicode: need string or buffer, "
411 "%.80s found",
412 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000413 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000414 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000415
416 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000417 if (len == 0) {
418 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000419 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000421 else
422 v = PyUnicode_Decode(s, len, encoding, errors);
423 done:
424 if (owned)
425 Py_DECREF(obj);
426 return v;
427
428 onError:
429 if (owned)
430 Py_DECREF(obj);
431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432}
433
434PyObject *PyUnicode_Decode(const char *s,
435 int size,
436 const char *encoding,
437 const char *errors)
438{
439 PyObject *buffer = NULL, *unicode;
440
Fred Drakee4315f52000-05-09 19:53:39 +0000441 if (encoding == NULL)
442 encoding = PyUnicode_GetDefaultEncoding();
443
444 /* Shortcuts for common default encodings */
445 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000447 else if (strcmp(encoding, "latin-1") == 0)
448 return PyUnicode_DecodeLatin1(s, size, errors);
449 else if (strcmp(encoding, "ascii") == 0)
450 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451
452 /* Decode via the codec registry */
453 buffer = PyBuffer_FromMemory((void *)s, size);
454 if (buffer == NULL)
455 goto onError;
456 unicode = PyCodec_Decode(buffer, encoding, errors);
457 if (unicode == NULL)
458 goto onError;
459 if (!PyUnicode_Check(unicode)) {
460 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000461 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462 unicode->ob_type->tp_name);
463 Py_DECREF(unicode);
464 goto onError;
465 }
466 Py_DECREF(buffer);
467 return unicode;
468
469 onError:
470 Py_XDECREF(buffer);
471 return NULL;
472}
473
474PyObject *PyUnicode_Encode(const Py_UNICODE *s,
475 int size,
476 const char *encoding,
477 const char *errors)
478{
479 PyObject *v, *unicode;
480
481 unicode = PyUnicode_FromUnicode(s, size);
482 if (unicode == NULL)
483 return NULL;
484 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
485 Py_DECREF(unicode);
486 return v;
487}
488
489PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
490 const char *encoding,
491 const char *errors)
492{
493 PyObject *v;
494
495 if (!PyUnicode_Check(unicode)) {
496 PyErr_BadArgument();
497 goto onError;
498 }
Fred Drakee4315f52000-05-09 19:53:39 +0000499
500 if (encoding == NULL)
501 encoding = PyUnicode_GetDefaultEncoding();
502
503 /* Shortcuts for common default encodings */
504 if (errors == NULL) {
505 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000507 else if (strcmp(encoding, "latin-1") == 0)
508 return PyUnicode_AsLatin1String(unicode);
509 else if (strcmp(encoding, "ascii") == 0)
510 return PyUnicode_AsASCIIString(unicode);
511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512
513 /* Encode via the codec registry */
514 v = PyCodec_Encode(unicode, encoding, errors);
515 if (v == NULL)
516 goto onError;
517 /* XXX Should we really enforce this ? */
518 if (!PyString_Check(v)) {
519 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000520 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 v->ob_type->tp_name);
522 Py_DECREF(v);
523 goto onError;
524 }
525 return v;
526
527 onError:
528 return NULL;
529}
530
531Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
532{
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
536 }
537 return PyUnicode_AS_UNICODE(unicode);
538
539 onError:
540 return NULL;
541}
542
543int PyUnicode_GetSize(PyObject *unicode)
544{
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_BadArgument();
547 goto onError;
548 }
549 return PyUnicode_GET_SIZE(unicode);
550
551 onError:
552 return -1;
553}
554
Fred Drakee4315f52000-05-09 19:53:39 +0000555const char *PyUnicode_GetDefaultEncoding()
556{
557 return unicode_default_encoding;
558}
559
560int PyUnicode_SetDefaultEncoding(const char *encoding)
561{
562 PyObject *v;
563
564 /* Make sure the encoding is valid. As side effect, this also
565 loads the encoding into the codec registry cache. */
566 v = _PyCodec_Lookup(encoding);
567 if (v == NULL)
568 goto onError;
569 Py_DECREF(v);
570 strncpy(unicode_default_encoding,
571 encoding,
572 sizeof(unicode_default_encoding));
573 return 0;
574
575 onError:
576 return -1;
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579/* --- UTF-8 Codec -------------------------------------------------------- */
580
/* Length of a UTF-8 sequence, indexed by its first byte.  A value of
   zero marks a byte that is not a legal prefix.  See RFC 2279 for
   details. */

static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four to six byte sequences; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
602
603static
604int utf8_decoding_error(const char **source,
605 Py_UNICODE **dest,
606 const char *errors,
607 const char *details)
608{
609 if ((errors == NULL) ||
610 (strcmp(errors,"strict") == 0)) {
611 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000612 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613 details);
614 return -1;
615 }
616 else if (strcmp(errors,"ignore") == 0) {
617 (*source)++;
618 return 0;
619 }
620 else if (strcmp(errors,"replace") == 0) {
621 (*source)++;
622 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
623 (*dest)++;
624 return 0;
625 }
626 else {
627 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000628 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 errors);
630 return -1;
631 }
632}
633
634#define UTF8_ERROR(details) do { \
635 if (utf8_decoding_error(&s, &p, errors, details)) \
636 goto onError; \
637 continue; \
638} while (0)
639
640PyObject *PyUnicode_DecodeUTF8(const char *s,
641 int size,
642 const char *errors)
643{
644 int n;
645 const char *e;
646 PyUnicodeObject *unicode;
647 Py_UNICODE *p;
648
649 /* Note: size will always be longer than the resulting Unicode
650 character count */
651 unicode = _PyUnicode_New(size);
652 if (!unicode)
653 return NULL;
654 if (size == 0)
655 return (PyObject *)unicode;
656
657 /* Unpack UTF-8 encoded data */
658 p = unicode->str;
659 e = s + size;
660
661 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000662 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663
664 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000665 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 s++;
667 continue;
668 }
669
670 n = utf8_code_length[ch];
671
672 if (s + n > e)
673 UTF8_ERROR("unexpected end of data");
674
675 switch (n) {
676
677 case 0:
678 UTF8_ERROR("unexpected code byte");
679 break;
680
681 case 1:
682 UTF8_ERROR("internal error");
683 break;
684
685 case 2:
686 if ((s[1] & 0xc0) != 0x80)
687 UTF8_ERROR("invalid data");
688 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
689 if (ch < 0x80)
690 UTF8_ERROR("illegal encoding");
691 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 break;
694
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
697 (s[2] & 0xc0) != 0x80)
698 UTF8_ERROR("invalid data");
699 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
700 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
701 UTF8_ERROR("illegal encoding");
702 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000703 *p++ = (Py_UNICODE)ch;
704 break;
705
706 case 4:
707 if ((s[1] & 0xc0) != 0x80 ||
708 (s[2] & 0xc0) != 0x80 ||
709 (s[3] & 0xc0) != 0x80)
710 UTF8_ERROR("invalid data");
711 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
712 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
713 /* validate and convert to UTF-16 */
714 if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
715 (ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
716 UTF8_ERROR("illegal encoding");
717 /* compute and append the two surrogates: */
718
719 /* translate from 10000..10FFFF to 0..FFFF */
720 ch -= 0x10000;
721
722 /* high surrogate = top 10 bits added to D800 */
723 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
724
725 /* low surrogate = bottom 10 bits added to DC00 */
726 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 break;
728
729 default:
730 /* Other sizes are only needed for UCS-4 */
731 UTF8_ERROR("unsupported Unicode code range");
732 }
733 s += n;
734 }
735
736 /* Adjust length */
737 if (_PyUnicode_Resize(unicode, p - unicode->str))
738 goto onError;
739
740 return (PyObject *)unicode;
741
742onError:
743 Py_DECREF(unicode);
744 return NULL;
745}
746
747#undef UTF8_ERROR
748
749static
750int utf8_encoding_error(const Py_UNICODE **source,
751 char **dest,
752 const char *errors,
753 const char *details)
754{
755 if ((errors == NULL) ||
756 (strcmp(errors,"strict") == 0)) {
757 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000758 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000759 details);
760 return -1;
761 }
762 else if (strcmp(errors,"ignore") == 0) {
763 return 0;
764 }
765 else if (strcmp(errors,"replace") == 0) {
766 **dest = '?';
767 (*dest)++;
768 return 0;
769 }
770 else {
771 PyErr_Format(PyExc_ValueError,
772 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000773 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000774 errors);
775 return -1;
776 }
777}
778
779PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
780 int size,
781 const char *errors)
782{
783 PyObject *v;
784 char *p;
785 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000786 Py_UCS4 ch2;
787 unsigned int cbAllocated = 3 * size;
788 unsigned int cbWritten = 0;
789 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000791 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792 if (v == NULL)
793 return NULL;
794 if (size == 0)
795 goto done;
796
797 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000798 while (i < size) {
799 Py_UCS4 ch = s[i++];
800 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000801 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000802 cbWritten++;
803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 else if (ch < 0x0800) {
805 *p++ = 0xc0 | (ch >> 6);
806 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000807 cbWritten += 2;
808 }
809 else {
810 /* Check for high surrogate */
811 if (0xD800 <= ch && ch <= 0xDBFF) {
812 if (i != size) {
813 ch2 = s[i];
814 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
815
816 if (cbWritten >= (cbAllocated - 4)) {
817 /* Provide enough room for some more
818 surrogates */
819 cbAllocated += 4*10;
820 if (_PyString_Resize(&v, cbAllocated))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000822 }
823
824 /* combine the two values */
825 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
826
827 *p++ = (char)((ch >> 18) | 0xf0);
828 *p++ = (char)(0x80 | (ch >> 12) & 0x3f);
829 i++;
830 cbWritten += 4;
831 }
832 }
833 }
834 else {
835 *p++ = (char)(0xe0 | (ch >> 12));
836 cbWritten += 3;
837 }
838 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
839 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840 }
841 }
842 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000843 if (_PyString_Resize(&v, p - q))
844 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845
846 done:
847 return v;
848
849 onError:
850 Py_DECREF(v);
851 return NULL;
852}
853
854/* Return a Python string holding the UTF-8 encoded value of the
855 Unicode object.
856
857 The resulting string is cached in the Unicode object for subsequent
858 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000859 the character buffer interface and will live (at least) as long as
860 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000861
862 The refcount of the string is *not* incremented.
863
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000864 *** Exported for internal use by the interpreter only !!! ***
865
Guido van Rossumd57fd912000-03-10 22:53:23 +0000866*/
867
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000868PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +0000869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000871 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 if (v)
874 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000875 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
876 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877 errors);
878 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000879 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880 return v;
881}
882
883PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
884{
885 PyObject *str;
886
887 if (!PyUnicode_Check(unicode)) {
888 PyErr_BadArgument();
889 return NULL;
890 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000891 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 if (str == NULL)
893 return NULL;
894 Py_INCREF(str);
895 return str;
896}
897
898/* --- UTF-16 Codec ------------------------------------------------------- */
899
900static
901int utf16_decoding_error(const Py_UNICODE **source,
902 Py_UNICODE **dest,
903 const char *errors,
904 const char *details)
905{
906 if ((errors == NULL) ||
907 (strcmp(errors,"strict") == 0)) {
908 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000909 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910 details);
911 return -1;
912 }
913 else if (strcmp(errors,"ignore") == 0) {
914 return 0;
915 }
916 else if (strcmp(errors,"replace") == 0) {
917 if (dest) {
918 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
919 (*dest)++;
920 }
921 return 0;
922 }
923 else {
924 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000925 "UTF-16 decoding error; "
926 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000927 errors);
928 return -1;
929 }
930}
931
932#define UTF16_ERROR(details) do { \
933 if (utf16_decoding_error(&q, &p, errors, details)) \
934 goto onError; \
935 continue; \
936} while(0)
937
938PyObject *PyUnicode_DecodeUTF16(const char *s,
939 int size,
940 const char *errors,
941 int *byteorder)
942{
943 PyUnicodeObject *unicode;
944 Py_UNICODE *p;
945 const Py_UNICODE *q, *e;
946 int bo = 0;
947
948 /* size should be an even number */
949 if (size % sizeof(Py_UNICODE) != 0) {
950 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
951 return NULL;
952 /* The remaining input chars are ignored if we fall through
953 here... */
954 }
955
956 /* Note: size will always be longer than the resulting Unicode
957 character count */
958 unicode = _PyUnicode_New(size);
959 if (!unicode)
960 return NULL;
961 if (size == 0)
962 return (PyObject *)unicode;
963
964 /* Unpack UTF-16 encoded data */
965 p = unicode->str;
966 q = (Py_UNICODE *)s;
967 e = q + (size / sizeof(Py_UNICODE));
968
969 if (byteorder)
970 bo = *byteorder;
971
972 while (q < e) {
973 register Py_UNICODE ch = *q++;
974
975 /* Check for BOM marks (U+FEFF) in the input and adjust
976 current byte order setting accordingly. Swap input
977 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
978 !) */
979#ifdef BYTEORDER_IS_LITTLE_ENDIAN
980 if (ch == 0xFEFF) {
981 bo = -1;
982 continue;
983 } else if (ch == 0xFFFE) {
984 bo = 1;
985 continue;
986 }
987 if (bo == 1)
988 ch = (ch >> 8) | (ch << 8);
989#else
990 if (ch == 0xFEFF) {
991 bo = 1;
992 continue;
993 } else if (ch == 0xFFFE) {
994 bo = -1;
995 continue;
996 }
997 if (bo == -1)
998 ch = (ch >> 8) | (ch << 8);
999#endif
1000 if (ch < 0xD800 || ch > 0xDFFF) {
1001 *p++ = ch;
1002 continue;
1003 }
1004
1005 /* UTF-16 code pair: */
1006 if (q >= e)
1007 UTF16_ERROR("unexpected end of data");
1008 if (0xDC00 <= *q && *q <= 0xDFFF) {
1009 q++;
1010 if (0xD800 <= *q && *q <= 0xDBFF)
1011 /* This is valid data (a UTF-16 surrogate pair), but
1012 we are not able to store this information since our
1013 Py_UNICODE type only has 16 bits... this might
1014 change someday, even though it's unlikely. */
1015 UTF16_ERROR("code pairs are not supported");
1016 else
1017 continue;
1018 }
1019 UTF16_ERROR("illegal encoding");
1020 }
1021
1022 if (byteorder)
1023 *byteorder = bo;
1024
1025 /* Adjust length */
1026 if (_PyUnicode_Resize(unicode, p - unicode->str))
1027 goto onError;
1028
1029 return (PyObject *)unicode;
1030
1031onError:
1032 Py_DECREF(unicode);
1033 return NULL;
1034}
1035
1036#undef UTF16_ERROR
1037
1038PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1039 int size,
1040 const char *errors,
1041 int byteorder)
1042{
1043 PyObject *v;
1044 Py_UNICODE *p;
1045 char *q;
1046
1047 /* We don't create UTF-16 pairs... */
1048 v = PyString_FromStringAndSize(NULL,
1049 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1050 if (v == NULL)
1051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052
1053 q = PyString_AS_STRING(v);
1054 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 if (byteorder == 0)
1056 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001057 if (size == 0)
1058 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 if (byteorder == 0 ||
1060#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1061 byteorder == -1
1062#else
1063 byteorder == 1
1064#endif
1065 )
1066 memcpy(p, s, size * sizeof(Py_UNICODE));
1067 else
1068 while (size-- > 0) {
1069 Py_UNICODE ch = *s++;
1070 *p++ = (ch >> 8) | (ch << 8);
1071 }
1072 done:
1073 return v;
1074}
1075
1076PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1077{
1078 if (!PyUnicode_Check(unicode)) {
1079 PyErr_BadArgument();
1080 return NULL;
1081 }
1082 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1083 PyUnicode_GET_SIZE(unicode),
1084 NULL,
1085 0);
1086}
1087
1088/* --- Unicode Escape Codec ----------------------------------------------- */
1089
1090static
1091int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001092 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 const char *errors,
1094 const char *details)
1095{
1096 if ((errors == NULL) ||
1097 (strcmp(errors,"strict") == 0)) {
1098 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001099 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 details);
1101 return -1;
1102 }
1103 else if (strcmp(errors,"ignore") == 0) {
1104 return 0;
1105 }
1106 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001107 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 return 0;
1109 }
1110 else {
1111 PyErr_Format(PyExc_ValueError,
1112 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001113 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 errors);
1115 return -1;
1116 }
1117}
1118
/* Unicode character name lookup API used for \N{...} escapes.  Stays
   NULL until PyUnicode_DecodeUnicodeEscape() first meets a \N escape
   and fetches it from the "ucnhashAPI" attribute of the "ucnhash"
   module. */
static _Py_UCNHashAPI *pucnHash = NULL;
1120
/* Case-insensitive comparison of at most count characters of s1 and s2.

   Returns 0 if the compared characters are equal ignoring case (or if
   count is 0), otherwise the difference of the first pair of lowercased
   characters that differ.  Comparison stops at a NUL terminator shared
   by both strings, so neither string is read past its end.

   Characters are passed to tolower() as unsigned char, because calling
   it with a negative char value is undefined. */

static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1 = 0, c2 = 0;

    while (count-- > 0) {
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        if (c1 != c2 || c1 == '\0')
            break;
    }

    return c1 - c2;
}
1140
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1142 int size,
1143 const char *errors)
1144{
1145 PyUnicodeObject *v;
1146 Py_UNICODE *p = NULL, *buf = NULL;
1147 const char *end;
1148
1149 /* Escaped strings will always be longer than the resulting
1150 Unicode string, so we start with size here and then reduce the
1151 length after conversion to the true value. */
1152 v = _PyUnicode_New(size);
1153 if (v == NULL)
1154 goto onError;
1155 if (size == 0)
1156 return (PyObject *)v;
1157 p = buf = PyUnicode_AS_UNICODE(v);
1158 end = s + size;
1159 while (s < end) {
1160 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001161 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 int i;
1163
1164 /* Non-escape characters are interpreted as Unicode ordinals */
1165 if (*s != '\\') {
1166 *p++ = (unsigned char)*s++;
1167 continue;
1168 }
1169
1170 /* \ - Escapes */
1171 s++;
1172 switch (*s++) {
1173
1174 /* \x escapes */
1175 case '\n': break;
1176 case '\\': *p++ = '\\'; break;
1177 case '\'': *p++ = '\''; break;
1178 case '\"': *p++ = '\"'; break;
1179 case 'b': *p++ = '\b'; break;
1180 case 'f': *p++ = '\014'; break; /* FF */
1181 case 't': *p++ = '\t'; break;
1182 case 'n': *p++ = '\n'; break;
1183 case 'r': *p++ = '\r'; break;
1184 case 'v': *p++ = '\013'; break; /* VT */
1185 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1186
1187 /* \OOO (octal) escapes */
1188 case '0': case '1': case '2': case '3':
1189 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001190 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001192 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001194 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001196 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 break;
1198
1199 /* \xXXXX escape with 0-4 hex digits */
1200 case 'x':
1201 x = 0;
1202 c = (unsigned char)*s;
1203 if (isxdigit(c)) {
1204 do {
1205 x = (x<<4) & ~0xF;
1206 if ('0' <= c && c <= '9')
1207 x += c - '0';
1208 else if ('a' <= c && c <= 'f')
1209 x += 10 + c - 'a';
1210 else
1211 x += 10 + c - 'A';
1212 c = (unsigned char)*++s;
1213 } while (isxdigit(c));
1214 *p++ = x;
1215 } else {
1216 *p++ = '\\';
1217 *p++ = (unsigned char)s[-1];
1218 }
1219 break;
1220
1221 /* \uXXXX with 4 hex digits */
1222 case 'u':
1223 for (x = 0, i = 0; i < 4; i++) {
1224 c = (unsigned char)s[i];
1225 if (!isxdigit(c)) {
1226 if (unicodeescape_decoding_error(&s, &x, errors,
1227 "truncated \\uXXXX"))
1228 goto onError;
1229 i++;
1230 break;
1231 }
1232 x = (x<<4) & ~0xF;
1233 if (c >= '0' && c <= '9')
1234 x += c - '0';
1235 else if (c >= 'a' && c <= 'f')
1236 x += 10 + c - 'a';
1237 else
1238 x += 10 + c - 'A';
1239 }
1240 s += i;
1241 *p++ = x;
1242 break;
1243
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001244 case 'N':
1245 /* Ok, we need to deal with Unicode Character Names now,
1246 * make sure we've imported the hash table data...
1247 */
1248 if (pucnHash == NULL)
1249 {
1250 PyObject *mod = 0, *v = 0;
1251
1252 mod = PyImport_ImportModule("ucnhash");
1253 if (mod == NULL)
1254 goto onError;
1255 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1256 Py_DECREF(mod);
1257 if (v == NULL)
1258 {
1259 goto onError;
1260 }
1261 pucnHash = PyCObject_AsVoidPtr(v);
1262 Py_DECREF(v);
1263 if (pucnHash == NULL)
1264 {
1265 goto onError;
1266 }
1267 }
1268
1269 if (*s == '{')
1270 {
1271 const char *start = s + 1;
1272 const char *endBrace = start;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001273 Py_UCS4 value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001274 unsigned long j;
1275
1276 /* look for either the closing brace, or we
1277 * exceed the maximum length of the unicode character names
1278 */
1279 while (*endBrace != '}' &&
1280 (unsigned int)(endBrace - start) <=
1281 pucnHash->cchMax &&
1282 endBrace < end)
1283 {
1284 endBrace++;
1285 }
1286 if (endBrace != end && *endBrace == '}')
1287 {
1288 j = pucnHash->hash(start, endBrace - start);
1289 if (j > pucnHash->cKeys ||
1290 mystrnicmp(
1291 start,
1292 ((_Py_UnicodeCharacterName *)
1293 (pucnHash->getValue(j)))->pszUCN,
1294 (int)(endBrace - start)) != 0)
1295 {
1296 if (unicodeescape_decoding_error(
1297 &s, &x, errors,
1298 "Invalid Unicode Character Name"))
1299 {
1300 goto onError;
1301 }
1302 goto ucnFallthrough;
1303 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001304 value = ((_Py_UnicodeCharacterName *)
1305 (pucnHash->getValue(j)))->value;
1306 if (value < 1<<16)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001307 {
1308 /* In UCS-2 range, easy solution.. */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001309 *p++ = value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001310 }
1311 else
1312 {
1313 /* Oops, its in UCS-4 space, */
1314 /* compute and append the two surrogates: */
1315 /* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001316 value -= 0x10000;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001317
1318 /* high surrogate = top 10 bits added to D800 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 *p++ = 0xD800 + (value >> 10);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001320
1321 /* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001322 *p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001323 }
1324 s = endBrace + 1;
1325 }
1326 else
1327 {
1328 if (unicodeescape_decoding_error(
1329 &s, &x, errors,
1330 "Unicode name missing closing brace"))
1331 goto onError;
1332 goto ucnFallthrough;
1333 }
1334 break;
1335 }
1336 if (unicodeescape_decoding_error(
1337 &s, &x, errors,
1338 "Missing opening brace for Unicode Character Name escape"))
1339 goto onError;
1340ucnFallthrough:
1341 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001342 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343 *p++ = '\\';
1344 *p++ = (unsigned char)s[-1];
1345 break;
1346 }
1347 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001348 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001349 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 return (PyObject *)v;
1351
1352 onError:
1353 Py_XDECREF(v);
1354 return NULL;
1355}
1356
1357/* Return a Unicode-Escape string version of the Unicode object.
1358
1359 If quotes is true, the string is enclosed in u"" or u'' quotes as
1360 appropriate.
1361
1362*/
1363
Barry Warsaw51ac5802000-03-20 16:36:48 +00001364static const Py_UNICODE *findchar(const Py_UNICODE *s,
1365 int size,
1366 Py_UNICODE ch);
1367
Guido van Rossumd57fd912000-03-10 22:53:23 +00001368static
1369PyObject *unicodeescape_string(const Py_UNICODE *s,
1370 int size,
1371 int quotes)
1372{
1373 PyObject *repr;
1374 char *p;
1375 char *q;
1376
1377 static const char *hexdigit = "0123456789ABCDEF";
1378
1379 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1380 if (repr == NULL)
1381 return NULL;
1382
1383 p = q = PyString_AS_STRING(repr);
1384
1385 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 *p++ = 'u';
1387 *p++ = (findchar(s, size, '\'') &&
1388 !findchar(s, size, '"')) ? '"' : '\'';
1389 }
1390 while (size-- > 0) {
1391 Py_UNICODE ch = *s++;
1392 /* Escape quotes */
1393 if (quotes && (ch == q[1] || ch == '\\')) {
1394 *p++ = '\\';
1395 *p++ = (char) ch;
1396 }
1397 /* Map 16-bit characters to '\uxxxx' */
1398 else if (ch >= 256) {
1399 *p++ = '\\';
1400 *p++ = 'u';
1401 *p++ = hexdigit[(ch >> 12) & 0xf];
1402 *p++ = hexdigit[(ch >> 8) & 0xf];
1403 *p++ = hexdigit[(ch >> 4) & 0xf];
1404 *p++ = hexdigit[ch & 15];
1405 }
1406 /* Map non-printable US ASCII to '\ooo' */
1407 else if (ch < ' ' || ch >= 128) {
1408 *p++ = '\\';
1409 *p++ = hexdigit[(ch >> 6) & 7];
1410 *p++ = hexdigit[(ch >> 3) & 7];
1411 *p++ = hexdigit[ch & 7];
1412 }
1413 /* Copy everything else as-is */
1414 else
1415 *p++ = (char) ch;
1416 }
1417 if (quotes)
1418 *p++ = q[1];
1419
1420 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001421 if (_PyString_Resize(&repr, p - q))
1422 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423
1424 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001425
1426 onError:
1427 Py_DECREF(repr);
1428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429}
1430
1431PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1432 int size)
1433{
1434 return unicodeescape_string(s, size, 0);
1435}
1436
1437PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1438{
1439 if (!PyUnicode_Check(unicode)) {
1440 PyErr_BadArgument();
1441 return NULL;
1442 }
1443 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1444 PyUnicode_GET_SIZE(unicode));
1445}
1446
1447/* --- Raw Unicode Escape Codec ------------------------------------------- */
1448
1449PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1450 int size,
1451 const char *errors)
1452{
1453 PyUnicodeObject *v;
1454 Py_UNICODE *p, *buf;
1455 const char *end;
1456 const char *bs;
1457
1458 /* Escaped strings will always be longer than the resulting
1459 Unicode string, so we start with size here and then reduce the
1460 length after conversion to the true value. */
1461 v = _PyUnicode_New(size);
1462 if (v == NULL)
1463 goto onError;
1464 if (size == 0)
1465 return (PyObject *)v;
1466 p = buf = PyUnicode_AS_UNICODE(v);
1467 end = s + size;
1468 while (s < end) {
1469 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001470 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001471 int i;
1472
1473 /* Non-escape characters are interpreted as Unicode ordinals */
1474 if (*s != '\\') {
1475 *p++ = (unsigned char)*s++;
1476 continue;
1477 }
1478
1479 /* \u-escapes are only interpreted iff the number of leading
1480 backslashes if odd */
1481 bs = s;
1482 for (;s < end;) {
1483 if (*s != '\\')
1484 break;
1485 *p++ = (unsigned char)*s++;
1486 }
1487 if (((s - bs) & 1) == 0 ||
1488 s >= end ||
1489 *s != 'u') {
1490 continue;
1491 }
1492 p--;
1493 s++;
1494
1495 /* \uXXXX with 4 hex digits */
1496 for (x = 0, i = 0; i < 4; i++) {
1497 c = (unsigned char)s[i];
1498 if (!isxdigit(c)) {
1499 if (unicodeescape_decoding_error(&s, &x, errors,
1500 "truncated \\uXXXX"))
1501 goto onError;
1502 i++;
1503 break;
1504 }
1505 x = (x<<4) & ~0xF;
1506 if (c >= '0' && c <= '9')
1507 x += c - '0';
1508 else if (c >= 'a' && c <= 'f')
1509 x += 10 + c - 'a';
1510 else
1511 x += 10 + c - 'A';
1512 }
1513 s += i;
1514 *p++ = x;
1515 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001516 if (_PyUnicode_Resize(v, (int)(p - buf)))
1517 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 return (PyObject *)v;
1519
1520 onError:
1521 Py_XDECREF(v);
1522 return NULL;
1523}
1524
1525PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1526 int size)
1527{
1528 PyObject *repr;
1529 char *p;
1530 char *q;
1531
1532 static const char *hexdigit = "0123456789ABCDEF";
1533
1534 repr = PyString_FromStringAndSize(NULL, 6 * size);
1535 if (repr == NULL)
1536 return NULL;
1537
1538 p = q = PyString_AS_STRING(repr);
1539 while (size-- > 0) {
1540 Py_UNICODE ch = *s++;
1541 /* Map 16-bit characters to '\uxxxx' */
1542 if (ch >= 256) {
1543 *p++ = '\\';
1544 *p++ = 'u';
1545 *p++ = hexdigit[(ch >> 12) & 0xf];
1546 *p++ = hexdigit[(ch >> 8) & 0xf];
1547 *p++ = hexdigit[(ch >> 4) & 0xf];
1548 *p++ = hexdigit[ch & 15];
1549 }
1550 /* Copy everything else as-is */
1551 else
1552 *p++ = (char) ch;
1553 }
1554 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001555 if (_PyString_Resize(&repr, p - q))
1556 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557
1558 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001559
1560 onError:
1561 Py_DECREF(repr);
1562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563}
1564
1565PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1566{
1567 if (!PyUnicode_Check(unicode)) {
1568 PyErr_BadArgument();
1569 return NULL;
1570 }
1571 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1572 PyUnicode_GET_SIZE(unicode));
1573}
1574
1575/* --- Latin-1 Codec ------------------------------------------------------ */
1576
1577PyObject *PyUnicode_DecodeLatin1(const char *s,
1578 int size,
1579 const char *errors)
1580{
1581 PyUnicodeObject *v;
1582 Py_UNICODE *p;
1583
1584 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1585 v = _PyUnicode_New(size);
1586 if (v == NULL)
1587 goto onError;
1588 if (size == 0)
1589 return (PyObject *)v;
1590 p = PyUnicode_AS_UNICODE(v);
1591 while (size-- > 0)
1592 *p++ = (unsigned char)*s++;
1593 return (PyObject *)v;
1594
1595 onError:
1596 Py_XDECREF(v);
1597 return NULL;
1598}
1599
1600static
1601int latin1_encoding_error(const Py_UNICODE **source,
1602 char **dest,
1603 const char *errors,
1604 const char *details)
1605{
1606 if ((errors == NULL) ||
1607 (strcmp(errors,"strict") == 0)) {
1608 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001609 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 details);
1611 return -1;
1612 }
1613 else if (strcmp(errors,"ignore") == 0) {
1614 return 0;
1615 }
1616 else if (strcmp(errors,"replace") == 0) {
1617 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001618 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619 return 0;
1620 }
1621 else {
1622 PyErr_Format(PyExc_ValueError,
1623 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001624 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001625 errors);
1626 return -1;
1627 }
1628}
1629
1630PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1631 int size,
1632 const char *errors)
1633{
1634 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001635 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636 repr = PyString_FromStringAndSize(NULL, size);
1637 if (repr == NULL)
1638 return NULL;
1639
1640 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001641 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642 while (size-- > 0) {
1643 Py_UNICODE ch = *p++;
1644 if (ch >= 256) {
1645 if (latin1_encoding_error(&p, &s, errors,
1646 "ordinal not in range(256)"))
1647 goto onError;
1648 }
1649 else
1650 *s++ = (char)ch;
1651 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001652 /* Resize if error handling skipped some characters */
1653 if (s - start < PyString_GET_SIZE(repr))
1654 if (_PyString_Resize(&repr, s - start))
1655 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656 return repr;
1657
1658 onError:
1659 Py_DECREF(repr);
1660 return NULL;
1661}
1662
1663PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1664{
1665 if (!PyUnicode_Check(unicode)) {
1666 PyErr_BadArgument();
1667 return NULL;
1668 }
1669 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1670 PyUnicode_GET_SIZE(unicode),
1671 NULL);
1672}
1673
1674/* --- 7-bit ASCII Codec -------------------------------------------------- */
1675
1676static
1677int ascii_decoding_error(const char **source,
1678 Py_UNICODE **dest,
1679 const char *errors,
1680 const char *details)
1681{
1682 if ((errors == NULL) ||
1683 (strcmp(errors,"strict") == 0)) {
1684 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001685 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 details);
1687 return -1;
1688 }
1689 else if (strcmp(errors,"ignore") == 0) {
1690 return 0;
1691 }
1692 else if (strcmp(errors,"replace") == 0) {
1693 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1694 (*dest)++;
1695 return 0;
1696 }
1697 else {
1698 PyErr_Format(PyExc_ValueError,
1699 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001700 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 errors);
1702 return -1;
1703 }
1704}
1705
1706PyObject *PyUnicode_DecodeASCII(const char *s,
1707 int size,
1708 const char *errors)
1709{
1710 PyUnicodeObject *v;
1711 Py_UNICODE *p;
1712
1713 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1714 v = _PyUnicode_New(size);
1715 if (v == NULL)
1716 goto onError;
1717 if (size == 0)
1718 return (PyObject *)v;
1719 p = PyUnicode_AS_UNICODE(v);
1720 while (size-- > 0) {
1721 register unsigned char c;
1722
1723 c = (unsigned char)*s++;
1724 if (c < 128)
1725 *p++ = c;
1726 else if (ascii_decoding_error(&s, &p, errors,
1727 "ordinal not in range(128)"))
1728 goto onError;
1729 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001730 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1731 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1732 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 return (PyObject *)v;
1734
1735 onError:
1736 Py_XDECREF(v);
1737 return NULL;
1738}
1739
1740static
1741int ascii_encoding_error(const Py_UNICODE **source,
1742 char **dest,
1743 const char *errors,
1744 const char *details)
1745{
1746 if ((errors == NULL) ||
1747 (strcmp(errors,"strict") == 0)) {
1748 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001749 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 details);
1751 return -1;
1752 }
1753 else if (strcmp(errors,"ignore") == 0) {
1754 return 0;
1755 }
1756 else if (strcmp(errors,"replace") == 0) {
1757 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001758 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 return 0;
1760 }
1761 else {
1762 PyErr_Format(PyExc_ValueError,
1763 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001764 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765 errors);
1766 return -1;
1767 }
1768}
1769
1770PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1771 int size,
1772 const char *errors)
1773{
1774 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001775 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 repr = PyString_FromStringAndSize(NULL, size);
1777 if (repr == NULL)
1778 return NULL;
1779
1780 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001781 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 while (size-- > 0) {
1783 Py_UNICODE ch = *p++;
1784 if (ch >= 128) {
1785 if (ascii_encoding_error(&p, &s, errors,
1786 "ordinal not in range(128)"))
1787 goto onError;
1788 }
1789 else
1790 *s++ = (char)ch;
1791 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001792 /* Resize if error handling skipped some characters */
1793 if (s - start < PyString_GET_SIZE(repr))
1794 if (_PyString_Resize(&repr, s - start))
1795 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 return repr;
1797
1798 onError:
1799 Py_DECREF(repr);
1800 return NULL;
1801}
1802
1803PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1804{
1805 if (!PyUnicode_Check(unicode)) {
1806 PyErr_BadArgument();
1807 return NULL;
1808 }
1809 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1810 PyUnicode_GET_SIZE(unicode),
1811 NULL);
1812}
1813
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001814#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001815
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001816/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001817
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001818PyObject *PyUnicode_DecodeMBCS(const char *s,
1819 int size,
1820 const char *errors)
1821{
1822 PyUnicodeObject *v;
1823 Py_UNICODE *p;
1824
1825 /* First get the size of the result */
1826 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001827 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001828 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1829
1830 v = _PyUnicode_New(usize);
1831 if (v == NULL)
1832 return NULL;
1833 if (usize == 0)
1834 return (PyObject *)v;
1835 p = PyUnicode_AS_UNICODE(v);
1836 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1837 Py_DECREF(v);
1838 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1839 }
1840
1841 return (PyObject *)v;
1842}
1843
1844PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1845 int size,
1846 const char *errors)
1847{
1848 PyObject *repr;
1849 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001850 DWORD mbcssize;
1851
1852 /* If there are no characters, bail now! */
1853 if (size==0)
1854 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001855
1856 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001857 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001858 if (mbcssize==0)
1859 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1860
1861 repr = PyString_FromStringAndSize(NULL, mbcssize);
1862 if (repr == NULL)
1863 return NULL;
1864 if (mbcssize==0)
1865 return repr;
1866
1867 /* Do the conversion */
1868 s = PyString_AS_STRING(repr);
1869 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1870 Py_DECREF(repr);
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872 }
1873 return repr;
1874}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001875
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001876#endif /* MS_WIN32 */
1877
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878/* --- Character Mapping Codec -------------------------------------------- */
1879
1880static
1881int charmap_decoding_error(const char **source,
1882 Py_UNICODE **dest,
1883 const char *errors,
1884 const char *details)
1885{
1886 if ((errors == NULL) ||
1887 (strcmp(errors,"strict") == 0)) {
1888 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001889 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890 details);
1891 return -1;
1892 }
1893 else if (strcmp(errors,"ignore") == 0) {
1894 return 0;
1895 }
1896 else if (strcmp(errors,"replace") == 0) {
1897 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1898 (*dest)++;
1899 return 0;
1900 }
1901 else {
1902 PyErr_Format(PyExc_ValueError,
1903 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001904 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 errors);
1906 return -1;
1907 }
1908}
1909
1910PyObject *PyUnicode_DecodeCharmap(const char *s,
1911 int size,
1912 PyObject *mapping,
1913 const char *errors)
1914{
1915 PyUnicodeObject *v;
1916 Py_UNICODE *p;
1917
1918 /* Default to Latin-1 */
1919 if (mapping == NULL)
1920 return PyUnicode_DecodeLatin1(s, size, errors);
1921
1922 v = _PyUnicode_New(size);
1923 if (v == NULL)
1924 goto onError;
1925 if (size == 0)
1926 return (PyObject *)v;
1927 p = PyUnicode_AS_UNICODE(v);
1928 while (size-- > 0) {
1929 unsigned char ch = *s++;
1930 PyObject *w, *x;
1931
1932 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1933 w = PyInt_FromLong((long)ch);
1934 if (w == NULL)
1935 goto onError;
1936 x = PyObject_GetItem(mapping, w);
1937 Py_DECREF(w);
1938 if (x == NULL) {
1939 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1940 /* No mapping found: default to Latin-1 mapping */
1941 PyErr_Clear();
1942 *p++ = (Py_UNICODE)ch;
1943 continue;
1944 }
1945 goto onError;
1946 }
1947
1948 /* Apply mapping */
1949 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001950 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 if (value < 0 || value > 65535) {
1952 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001953 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 Py_DECREF(x);
1955 goto onError;
1956 }
1957 *p++ = (Py_UNICODE)value;
1958 }
1959 else if (x == Py_None) {
1960 /* undefined mapping */
1961 if (charmap_decoding_error(&s, &p, errors,
1962 "character maps to <undefined>")) {
1963 Py_DECREF(x);
1964 goto onError;
1965 }
1966 }
1967 else if (PyUnicode_Check(x)) {
1968 if (PyUnicode_GET_SIZE(x) != 1) {
1969 /* 1-n mapping */
1970 PyErr_SetString(PyExc_NotImplementedError,
1971 "1-n mappings are currently not implemented");
1972 Py_DECREF(x);
1973 goto onError;
1974 }
1975 *p++ = *PyUnicode_AS_UNICODE(x);
1976 }
1977 else {
1978 /* wrong return value */
1979 PyErr_SetString(PyExc_TypeError,
1980 "character mapping must return integer, None or unicode");
1981 Py_DECREF(x);
1982 goto onError;
1983 }
1984 Py_DECREF(x);
1985 }
1986 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1987 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1988 goto onError;
1989 return (PyObject *)v;
1990
1991 onError:
1992 Py_XDECREF(v);
1993 return NULL;
1994}
1995
1996static
1997int charmap_encoding_error(const Py_UNICODE **source,
1998 char **dest,
1999 const char *errors,
2000 const char *details)
2001{
2002 if ((errors == NULL) ||
2003 (strcmp(errors,"strict") == 0)) {
2004 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002005 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 details);
2007 return -1;
2008 }
2009 else if (strcmp(errors,"ignore") == 0) {
2010 return 0;
2011 }
2012 else if (strcmp(errors,"replace") == 0) {
2013 **dest = '?';
2014 (*dest)++;
2015 return 0;
2016 }
2017 else {
2018 PyErr_Format(PyExc_ValueError,
2019 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002020 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 errors);
2022 return -1;
2023 }
2024}
2025
2026PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2027 int size,
2028 PyObject *mapping,
2029 const char *errors)
2030{
2031 PyObject *v;
2032 char *s;
2033
2034 /* Default to Latin-1 */
2035 if (mapping == NULL)
2036 return PyUnicode_EncodeLatin1(p, size, errors);
2037
2038 v = PyString_FromStringAndSize(NULL, size);
2039 if (v == NULL)
2040 return NULL;
2041 s = PyString_AS_STRING(v);
2042 while (size-- > 0) {
2043 Py_UNICODE ch = *p++;
2044 PyObject *w, *x;
2045
2046 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2047 w = PyInt_FromLong((long)ch);
2048 if (w == NULL)
2049 goto onError;
2050 x = PyObject_GetItem(mapping, w);
2051 Py_DECREF(w);
2052 if (x == NULL) {
2053 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2054 /* No mapping found: default to Latin-1 mapping if possible */
2055 PyErr_Clear();
2056 if (ch < 256) {
2057 *s++ = (char)ch;
2058 continue;
2059 }
2060 else if (!charmap_encoding_error(&p, &s, errors,
2061 "missing character mapping"))
2062 continue;
2063 }
2064 goto onError;
2065 }
2066
2067 /* Apply mapping */
2068 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002069 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 if (value < 0 || value > 255) {
2071 PyErr_SetString(PyExc_TypeError,
2072 "character mapping must be in range(256)");
2073 Py_DECREF(x);
2074 goto onError;
2075 }
2076 *s++ = (char)value;
2077 }
2078 else if (x == Py_None) {
2079 /* undefined mapping */
2080 if (charmap_encoding_error(&p, &s, errors,
2081 "character maps to <undefined>")) {
2082 Py_DECREF(x);
2083 goto onError;
2084 }
2085 }
2086 else if (PyString_Check(x)) {
2087 if (PyString_GET_SIZE(x) != 1) {
2088 /* 1-n mapping */
2089 PyErr_SetString(PyExc_NotImplementedError,
2090 "1-n mappings are currently not implemented");
2091 Py_DECREF(x);
2092 goto onError;
2093 }
2094 *s++ = *PyString_AS_STRING(x);
2095 }
2096 else {
2097 /* wrong return value */
2098 PyErr_SetString(PyExc_TypeError,
2099 "character mapping must return integer, None or unicode");
2100 Py_DECREF(x);
2101 goto onError;
2102 }
2103 Py_DECREF(x);
2104 }
2105 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2106 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2107 goto onError;
2108 return v;
2109
2110 onError:
2111 Py_DECREF(v);
2112 return NULL;
2113}
2114
2115PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2116 PyObject *mapping)
2117{
2118 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2119 PyErr_BadArgument();
2120 return NULL;
2121 }
2122 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2123 PyUnicode_GET_SIZE(unicode),
2124 mapping,
2125 NULL);
2126}
2127
2128static
2129int translate_error(const Py_UNICODE **source,
2130 Py_UNICODE **dest,
2131 const char *errors,
2132 const char *details)
2133{
2134 if ((errors == NULL) ||
2135 (strcmp(errors,"strict") == 0)) {
2136 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002137 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 details);
2139 return -1;
2140 }
2141 else if (strcmp(errors,"ignore") == 0) {
2142 return 0;
2143 }
2144 else if (strcmp(errors,"replace") == 0) {
2145 **dest = '?';
2146 (*dest)++;
2147 return 0;
2148 }
2149 else {
2150 PyErr_Format(PyExc_ValueError,
2151 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002152 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 errors);
2154 return -1;
2155 }
2156}
2157
2158PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2159 int size,
2160 PyObject *mapping,
2161 const char *errors)
2162{
2163 PyUnicodeObject *v;
2164 Py_UNICODE *p;
2165
2166 if (mapping == NULL) {
2167 PyErr_BadArgument();
2168 return NULL;
2169 }
2170
2171 /* Output will never be longer than input */
2172 v = _PyUnicode_New(size);
2173 if (v == NULL)
2174 goto onError;
2175 if (size == 0)
2176 goto done;
2177 p = PyUnicode_AS_UNICODE(v);
2178 while (size-- > 0) {
2179 Py_UNICODE ch = *s++;
2180 PyObject *w, *x;
2181
2182 /* Get mapping */
2183 w = PyInt_FromLong(ch);
2184 if (w == NULL)
2185 goto onError;
2186 x = PyObject_GetItem(mapping, w);
2187 Py_DECREF(w);
2188 if (x == NULL) {
2189 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2190 /* No mapping found: default to 1-1 mapping */
2191 PyErr_Clear();
2192 *p++ = ch;
2193 continue;
2194 }
2195 goto onError;
2196 }
2197
2198 /* Apply mapping */
2199 if (PyInt_Check(x))
2200 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2201 else if (x == Py_None) {
2202 /* undefined mapping */
2203 if (translate_error(&s, &p, errors,
2204 "character maps to <undefined>")) {
2205 Py_DECREF(x);
2206 goto onError;
2207 }
2208 }
2209 else if (PyUnicode_Check(x)) {
2210 if (PyUnicode_GET_SIZE(x) != 1) {
2211 /* 1-n mapping */
2212 PyErr_SetString(PyExc_NotImplementedError,
2213 "1-n mappings are currently not implemented");
2214 Py_DECREF(x);
2215 goto onError;
2216 }
2217 *p++ = *PyUnicode_AS_UNICODE(x);
2218 }
2219 else {
2220 /* wrong return value */
2221 PyErr_SetString(PyExc_TypeError,
2222 "translate mapping must return integer, None or unicode");
2223 Py_DECREF(x);
2224 goto onError;
2225 }
2226 Py_DECREF(x);
2227 }
2228 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002229 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2230 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231
2232 done:
2233 return (PyObject *)v;
2234
2235 onError:
2236 Py_XDECREF(v);
2237 return NULL;
2238}
2239
2240PyObject *PyUnicode_Translate(PyObject *str,
2241 PyObject *mapping,
2242 const char *errors)
2243{
2244 PyObject *result;
2245
2246 str = PyUnicode_FromObject(str);
2247 if (str == NULL)
2248 goto onError;
2249 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2250 PyUnicode_GET_SIZE(str),
2251 mapping,
2252 errors);
2253 Py_DECREF(str);
2254 return result;
2255
2256 onError:
2257 Py_XDECREF(str);
2258 return NULL;
2259}
2260
Guido van Rossum9e896b32000-04-05 20:11:21 +00002261/* --- Decimal Encoder ---------------------------------------------------- */
2262
2263int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2264 int length,
2265 char *output,
2266 const char *errors)
2267{
2268 Py_UNICODE *p, *end;
2269
2270 if (output == NULL) {
2271 PyErr_BadArgument();
2272 return -1;
2273 }
2274
2275 p = s;
2276 end = s + length;
2277 while (p < end) {
2278 register Py_UNICODE ch = *p++;
2279 int decimal;
2280
2281 if (Py_UNICODE_ISSPACE(ch)) {
2282 *output++ = ' ';
2283 continue;
2284 }
2285 decimal = Py_UNICODE_TODECIMAL(ch);
2286 if (decimal >= 0) {
2287 *output++ = '0' + decimal;
2288 continue;
2289 }
Guido van Rossumba477042000-04-06 18:18:10 +00002290 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002291 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002292 continue;
2293 }
2294 /* All other characters are considered invalid */
2295 if (errors == NULL || strcmp(errors, "strict") == 0) {
2296 PyErr_SetString(PyExc_ValueError,
2297 "invalid decimal Unicode string");
2298 goto onError;
2299 }
2300 else if (strcmp(errors, "ignore") == 0)
2301 continue;
2302 else if (strcmp(errors, "replace") == 0) {
2303 *output++ = '?';
2304 continue;
2305 }
2306 }
2307 /* 0-terminate the output string */
2308 *output++ = '\0';
2309 return 0;
2310
2311 onError:
2312 return -1;
2313}
2314
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315/* --- Helpers ------------------------------------------------------------ */
2316
2317static
2318int count(PyUnicodeObject *self,
2319 int start,
2320 int end,
2321 PyUnicodeObject *substring)
2322{
2323 int count = 0;
2324
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002325 if (substring->length == 0)
2326 return (end - start + 1);
2327
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328 end -= substring->length;
2329
2330 while (start <= end)
2331 if (Py_UNICODE_MATCH(self, start, substring)) {
2332 count++;
2333 start += substring->length;
2334 } else
2335 start++;
2336
2337 return count;
2338}
2339
2340int PyUnicode_Count(PyObject *str,
2341 PyObject *substr,
2342 int start,
2343 int end)
2344{
2345 int result;
2346
2347 str = PyUnicode_FromObject(str);
2348 if (str == NULL)
2349 return -1;
2350 substr = PyUnicode_FromObject(substr);
2351 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002352 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 return -1;
2354 }
2355
2356 result = count((PyUnicodeObject *)str,
2357 start, end,
2358 (PyUnicodeObject *)substr);
2359
2360 Py_DECREF(str);
2361 Py_DECREF(substr);
2362 return result;
2363}
2364
2365static
2366int findstring(PyUnicodeObject *self,
2367 PyUnicodeObject *substring,
2368 int start,
2369 int end,
2370 int direction)
2371{
2372 if (start < 0)
2373 start += self->length;
2374 if (start < 0)
2375 start = 0;
2376
2377 if (substring->length == 0)
2378 return start;
2379
2380 if (end > self->length)
2381 end = self->length;
2382 if (end < 0)
2383 end += self->length;
2384 if (end < 0)
2385 end = 0;
2386
2387 end -= substring->length;
2388
2389 if (direction < 0) {
2390 for (; end >= start; end--)
2391 if (Py_UNICODE_MATCH(self, end, substring))
2392 return end;
2393 } else {
2394 for (; start <= end; start++)
2395 if (Py_UNICODE_MATCH(self, start, substring))
2396 return start;
2397 }
2398
2399 return -1;
2400}
2401
2402int PyUnicode_Find(PyObject *str,
2403 PyObject *substr,
2404 int start,
2405 int end,
2406 int direction)
2407{
2408 int result;
2409
2410 str = PyUnicode_FromObject(str);
2411 if (str == NULL)
2412 return -1;
2413 substr = PyUnicode_FromObject(substr);
2414 if (substr == NULL) {
2415 Py_DECREF(substr);
2416 return -1;
2417 }
2418
2419 result = findstring((PyUnicodeObject *)str,
2420 (PyUnicodeObject *)substr,
2421 start, end, direction);
2422 Py_DECREF(str);
2423 Py_DECREF(substr);
2424 return result;
2425}
2426
2427static
2428int tailmatch(PyUnicodeObject *self,
2429 PyUnicodeObject *substring,
2430 int start,
2431 int end,
2432 int direction)
2433{
2434 if (start < 0)
2435 start += self->length;
2436 if (start < 0)
2437 start = 0;
2438
2439 if (substring->length == 0)
2440 return 1;
2441
2442 if (end > self->length)
2443 end = self->length;
2444 if (end < 0)
2445 end += self->length;
2446 if (end < 0)
2447 end = 0;
2448
2449 end -= substring->length;
2450 if (end < start)
2451 return 0;
2452
2453 if (direction > 0) {
2454 if (Py_UNICODE_MATCH(self, end, substring))
2455 return 1;
2456 } else {
2457 if (Py_UNICODE_MATCH(self, start, substring))
2458 return 1;
2459 }
2460
2461 return 0;
2462}
2463
2464int PyUnicode_Tailmatch(PyObject *str,
2465 PyObject *substr,
2466 int start,
2467 int end,
2468 int direction)
2469{
2470 int result;
2471
2472 str = PyUnicode_FromObject(str);
2473 if (str == NULL)
2474 return -1;
2475 substr = PyUnicode_FromObject(substr);
2476 if (substr == NULL) {
2477 Py_DECREF(substr);
2478 return -1;
2479 }
2480
2481 result = tailmatch((PyUnicodeObject *)str,
2482 (PyUnicodeObject *)substr,
2483 start, end, direction);
2484 Py_DECREF(str);
2485 Py_DECREF(substr);
2486 return result;
2487}
2488
2489static
2490const Py_UNICODE *findchar(const Py_UNICODE *s,
2491 int size,
2492 Py_UNICODE ch)
2493{
2494 /* like wcschr, but doesn't stop at NULL characters */
2495
2496 while (size-- > 0) {
2497 if (*s == ch)
2498 return s;
2499 s++;
2500 }
2501
2502 return NULL;
2503}
2504
2505/* Apply fixfct filter to the Unicode object self and return a
2506 reference to the modified object */
2507
2508static
2509PyObject *fixup(PyUnicodeObject *self,
2510 int (*fixfct)(PyUnicodeObject *s))
2511{
2512
2513 PyUnicodeObject *u;
2514
2515 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2516 self->length);
2517 if (u == NULL)
2518 return NULL;
2519 if (!fixfct(u)) {
2520 /* fixfct should return TRUE if it modified the buffer. If
2521 FALSE, return a reference to the original buffer instead
2522 (to save space, not time) */
2523 Py_INCREF(self);
2524 Py_DECREF(u);
2525 return (PyObject*) self;
2526 }
2527 return (PyObject*) u;
2528}
2529
2530static
2531int fixupper(PyUnicodeObject *self)
2532{
2533 int len = self->length;
2534 Py_UNICODE *s = self->str;
2535 int status = 0;
2536
2537 while (len-- > 0) {
2538 register Py_UNICODE ch;
2539
2540 ch = Py_UNICODE_TOUPPER(*s);
2541 if (ch != *s) {
2542 status = 1;
2543 *s = ch;
2544 }
2545 s++;
2546 }
2547
2548 return status;
2549}
2550
2551static
2552int fixlower(PyUnicodeObject *self)
2553{
2554 int len = self->length;
2555 Py_UNICODE *s = self->str;
2556 int status = 0;
2557
2558 while (len-- > 0) {
2559 register Py_UNICODE ch;
2560
2561 ch = Py_UNICODE_TOLOWER(*s);
2562 if (ch != *s) {
2563 status = 1;
2564 *s = ch;
2565 }
2566 s++;
2567 }
2568
2569 return status;
2570}
2571
2572static
2573int fixswapcase(PyUnicodeObject *self)
2574{
2575 int len = self->length;
2576 Py_UNICODE *s = self->str;
2577 int status = 0;
2578
2579 while (len-- > 0) {
2580 if (Py_UNICODE_ISUPPER(*s)) {
2581 *s = Py_UNICODE_TOLOWER(*s);
2582 status = 1;
2583 } else if (Py_UNICODE_ISLOWER(*s)) {
2584 *s = Py_UNICODE_TOUPPER(*s);
2585 status = 1;
2586 }
2587 s++;
2588 }
2589
2590 return status;
2591}
2592
2593static
2594int fixcapitalize(PyUnicodeObject *self)
2595{
2596 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2597 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2598 return 1;
2599 }
2600 return 0;
2601}
2602
2603static
2604int fixtitle(PyUnicodeObject *self)
2605{
2606 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2607 register Py_UNICODE *e;
2608 int previous_is_cased;
2609
2610 /* Shortcut for single character strings */
2611 if (PyUnicode_GET_SIZE(self) == 1) {
2612 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2613 if (*p != ch) {
2614 *p = ch;
2615 return 1;
2616 }
2617 else
2618 return 0;
2619 }
2620
2621 e = p + PyUnicode_GET_SIZE(self);
2622 previous_is_cased = 0;
2623 for (; p < e; p++) {
2624 register const Py_UNICODE ch = *p;
2625
2626 if (previous_is_cased)
2627 *p = Py_UNICODE_TOLOWER(ch);
2628 else
2629 *p = Py_UNICODE_TOTITLE(ch);
2630
2631 if (Py_UNICODE_ISLOWER(ch) ||
2632 Py_UNICODE_ISUPPER(ch) ||
2633 Py_UNICODE_ISTITLE(ch))
2634 previous_is_cased = 1;
2635 else
2636 previous_is_cased = 0;
2637 }
2638 return 1;
2639}
2640
2641PyObject *PyUnicode_Join(PyObject *separator,
2642 PyObject *seq)
2643{
2644 Py_UNICODE *sep;
2645 int seplen;
2646 PyUnicodeObject *res = NULL;
2647 int reslen = 0;
2648 Py_UNICODE *p;
2649 int seqlen = 0;
2650 int sz = 100;
2651 int i;
2652
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002653 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 if (seqlen < 0 && PyErr_Occurred())
2655 return NULL;
2656
2657 if (separator == NULL) {
2658 Py_UNICODE blank = ' ';
2659 sep = &blank;
2660 seplen = 1;
2661 }
2662 else {
2663 separator = PyUnicode_FromObject(separator);
2664 if (separator == NULL)
2665 return NULL;
2666 sep = PyUnicode_AS_UNICODE(separator);
2667 seplen = PyUnicode_GET_SIZE(separator);
2668 }
2669
2670 res = _PyUnicode_New(sz);
2671 if (res == NULL)
2672 goto onError;
2673 p = PyUnicode_AS_UNICODE(res);
2674 reslen = 0;
2675
2676 for (i = 0; i < seqlen; i++) {
2677 int itemlen;
2678 PyObject *item;
2679
2680 item = PySequence_GetItem(seq, i);
2681 if (item == NULL)
2682 goto onError;
2683 if (!PyUnicode_Check(item)) {
2684 PyObject *v;
2685 v = PyUnicode_FromObject(item);
2686 Py_DECREF(item);
2687 item = v;
2688 if (item == NULL)
2689 goto onError;
2690 }
2691 itemlen = PyUnicode_GET_SIZE(item);
2692 while (reslen + itemlen + seplen >= sz) {
2693 if (_PyUnicode_Resize(res, sz*2))
2694 goto onError;
2695 sz *= 2;
2696 p = PyUnicode_AS_UNICODE(res) + reslen;
2697 }
2698 if (i > 0) {
2699 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2700 p += seplen;
2701 reslen += seplen;
2702 }
2703 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2704 p += itemlen;
2705 reslen += itemlen;
2706 Py_DECREF(item);
2707 }
2708 if (_PyUnicode_Resize(res, reslen))
2709 goto onError;
2710
2711 Py_XDECREF(separator);
2712 return (PyObject *)res;
2713
2714 onError:
2715 Py_XDECREF(separator);
2716 Py_DECREF(res);
2717 return NULL;
2718}
2719
2720static
2721PyUnicodeObject *pad(PyUnicodeObject *self,
2722 int left,
2723 int right,
2724 Py_UNICODE fill)
2725{
2726 PyUnicodeObject *u;
2727
2728 if (left < 0)
2729 left = 0;
2730 if (right < 0)
2731 right = 0;
2732
2733 if (left == 0 && right == 0) {
2734 Py_INCREF(self);
2735 return self;
2736 }
2737
2738 u = _PyUnicode_New(left + self->length + right);
2739 if (u) {
2740 if (left)
2741 Py_UNICODE_FILL(u->str, fill, left);
2742 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2743 if (right)
2744 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2745 }
2746
2747 return u;
2748}
2749
/* Append the slice data[left:right] as a new Unicode object to `list`.

   Expects the surrounding function to provide a `PyObject *str`
   scratch variable, the target `list`, and an `onError` label that is
   jumped to when creating or appending the slice fails.  Wrapped in
   do { } while (0) so that it behaves as a single statement; use it
   with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2760
2761static
2762PyObject *split_whitespace(PyUnicodeObject *self,
2763 PyObject *list,
2764 int maxcount)
2765{
2766 register int i;
2767 register int j;
2768 int len = self->length;
2769 PyObject *str;
2770
2771 for (i = j = 0; i < len; ) {
2772 /* find a token */
2773 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2774 i++;
2775 j = i;
2776 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2777 i++;
2778 if (j < i) {
2779 if (maxcount-- <= 0)
2780 break;
2781 SPLIT_APPEND(self->str, j, i);
2782 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2783 i++;
2784 j = i;
2785 }
2786 }
2787 if (j < len) {
2788 SPLIT_APPEND(self->str, j, len);
2789 }
2790 return list;
2791
2792 onError:
2793 Py_DECREF(list);
2794 return NULL;
2795}
2796
2797PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002798 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799{
2800 register int i;
2801 register int j;
2802 int len;
2803 PyObject *list;
2804 PyObject *str;
2805 Py_UNICODE *data;
2806
2807 string = PyUnicode_FromObject(string);
2808 if (string == NULL)
2809 return NULL;
2810 data = PyUnicode_AS_UNICODE(string);
2811 len = PyUnicode_GET_SIZE(string);
2812
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 list = PyList_New(0);
2814 if (!list)
2815 goto onError;
2816
2817 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002818 int eol;
2819
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 /* Find a line and append it */
2821 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2822 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823
2824 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002825 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 if (i < len) {
2827 if (data[i] == '\r' && i + 1 < len &&
2828 data[i+1] == '\n')
2829 i += 2;
2830 else
2831 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002832 if (keepends)
2833 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Guido van Rossum86662912000-04-11 15:38:46 +00002835 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 j = i;
2837 }
2838 if (j < len) {
2839 SPLIT_APPEND(data, j, len);
2840 }
2841
2842 Py_DECREF(string);
2843 return list;
2844
2845 onError:
2846 Py_DECREF(list);
2847 Py_DECREF(string);
2848 return NULL;
2849}
2850
2851static
2852PyObject *split_char(PyUnicodeObject *self,
2853 PyObject *list,
2854 Py_UNICODE ch,
2855 int maxcount)
2856{
2857 register int i;
2858 register int j;
2859 int len = self->length;
2860 PyObject *str;
2861
2862 for (i = j = 0; i < len; ) {
2863 if (self->str[i] == ch) {
2864 if (maxcount-- <= 0)
2865 break;
2866 SPLIT_APPEND(self->str, j, i);
2867 i = j = i + 1;
2868 } else
2869 i++;
2870 }
2871 if (j <= len) {
2872 SPLIT_APPEND(self->str, j, len);
2873 }
2874 return list;
2875
2876 onError:
2877 Py_DECREF(list);
2878 return NULL;
2879}
2880
2881static
2882PyObject *split_substring(PyUnicodeObject *self,
2883 PyObject *list,
2884 PyUnicodeObject *substring,
2885 int maxcount)
2886{
2887 register int i;
2888 register int j;
2889 int len = self->length;
2890 int sublen = substring->length;
2891 PyObject *str;
2892
2893 for (i = j = 0; i < len - sublen; ) {
2894 if (Py_UNICODE_MATCH(self, i, substring)) {
2895 if (maxcount-- <= 0)
2896 break;
2897 SPLIT_APPEND(self->str, j, i);
2898 i = j = i + sublen;
2899 } else
2900 i++;
2901 }
2902 if (j <= len) {
2903 SPLIT_APPEND(self->str, j, len);
2904 }
2905 return list;
2906
2907 onError:
2908 Py_DECREF(list);
2909 return NULL;
2910}
2911
2912#undef SPLIT_APPEND
2913
2914static
2915PyObject *split(PyUnicodeObject *self,
2916 PyUnicodeObject *substring,
2917 int maxcount)
2918{
2919 PyObject *list;
2920
2921 if (maxcount < 0)
2922 maxcount = INT_MAX;
2923
2924 list = PyList_New(0);
2925 if (!list)
2926 return NULL;
2927
2928 if (substring == NULL)
2929 return split_whitespace(self,list,maxcount);
2930
2931 else if (substring->length == 1)
2932 return split_char(self,list,substring->str[0],maxcount);
2933
2934 else if (substring->length == 0) {
2935 Py_DECREF(list);
2936 PyErr_SetString(PyExc_ValueError, "empty separator");
2937 return NULL;
2938 }
2939 else
2940 return split_substring(self,list,substring,maxcount);
2941}
2942
2943static
2944PyObject *strip(PyUnicodeObject *self,
2945 int left,
2946 int right)
2947{
2948 Py_UNICODE *p = self->str;
2949 int start = 0;
2950 int end = self->length;
2951
2952 if (left)
2953 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2954 start++;
2955
2956 if (right)
2957 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2958 end--;
2959
2960 if (start == 0 && end == self->length) {
2961 /* couldn't strip anything off, return original string */
2962 Py_INCREF(self);
2963 return (PyObject*) self;
2964 }
2965
2966 return (PyObject*) PyUnicode_FromUnicode(
2967 self->str + start,
2968 end - start
2969 );
2970}
2971
2972static
2973PyObject *replace(PyUnicodeObject *self,
2974 PyUnicodeObject *str1,
2975 PyUnicodeObject *str2,
2976 int maxcount)
2977{
2978 PyUnicodeObject *u;
2979
2980 if (maxcount < 0)
2981 maxcount = INT_MAX;
2982
2983 if (str1->length == 1 && str2->length == 1) {
2984 int i;
2985
2986 /* replace characters */
2987 if (!findchar(self->str, self->length, str1->str[0])) {
2988 /* nothing to replace, return original string */
2989 Py_INCREF(self);
2990 u = self;
2991 } else {
2992 Py_UNICODE u1 = str1->str[0];
2993 Py_UNICODE u2 = str2->str[0];
2994
2995 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2996 self->str,
2997 self->length
2998 );
2999 if (u)
3000 for (i = 0; i < u->length; i++)
3001 if (u->str[i] == u1) {
3002 if (--maxcount < 0)
3003 break;
3004 u->str[i] = u2;
3005 }
3006 }
3007
3008 } else {
3009 int n, i;
3010 Py_UNICODE *p;
3011
3012 /* replace strings */
3013 n = count(self, 0, self->length, str1);
3014 if (n > maxcount)
3015 n = maxcount;
3016 if (n == 0) {
3017 /* nothing to replace, return original string */
3018 Py_INCREF(self);
3019 u = self;
3020 } else {
3021 u = _PyUnicode_New(
3022 self->length + n * (str2->length - str1->length));
3023 if (u) {
3024 i = 0;
3025 p = u->str;
3026 while (i <= self->length - str1->length)
3027 if (Py_UNICODE_MATCH(self, i, str1)) {
3028 /* replace string segment */
3029 Py_UNICODE_COPY(p, str2->str, str2->length);
3030 p += str2->length;
3031 i += str1->length;
3032 if (--n <= 0) {
3033 /* copy remaining part */
3034 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3035 break;
3036 }
3037 } else
3038 *p++ = self->str[i++];
3039 }
3040 }
3041 }
3042
3043 return (PyObject *) u;
3044}
3045
3046/* --- Unicode Object Methods --------------------------------------------- */
3047
3048static char title__doc__[] =
3049"S.title() -> unicode\n\
3050\n\
3051Return a titlecased version of S, i.e. words start with title case\n\
3052characters, all remaining cased characters have lower case.";
3053
3054static PyObject*
3055unicode_title(PyUnicodeObject *self, PyObject *args)
3056{
3057 if (!PyArg_NoArgs(args))
3058 return NULL;
3059 return fixup(self, fixtitle);
3060}
3061
3062static char capitalize__doc__[] =
3063"S.capitalize() -> unicode\n\
3064\n\
3065Return a capitalized version of S, i.e. make the first character\n\
3066have upper case.";
3067
3068static PyObject*
3069unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3070{
3071 if (!PyArg_NoArgs(args))
3072 return NULL;
3073 return fixup(self, fixcapitalize);
3074}
3075
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): splits S at whitespace,
   capitalizes each word and joins the words with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined = NULL;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3116
3117static char center__doc__[] =
3118"S.center(width) -> unicode\n\
3119\n\
3120Return S centered in a Unicode string of length width. Padding is done\n\
3121using spaces.";
3122
3123static PyObject *
3124unicode_center(PyUnicodeObject *self, PyObject *args)
3125{
3126 int marg, left;
3127 int width;
3128
3129 if (!PyArg_ParseTuple(args, "i:center", &width))
3130 return NULL;
3131
3132 if (self->length >= width) {
3133 Py_INCREF(self);
3134 return (PyObject*) self;
3135 }
3136
3137 marg = width - self->length;
3138 left = marg / 2 + (marg & width & 1);
3139
3140 return (PyObject*) pad(self, left, marg - left, ' ');
3141}
3142
/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Offsets indexed by the top five bits (c >> 11) of a 16-bit code unit.
   Only the last five entries are non-zero:
     index 27     (0xD800-0xDFFF):  +0x2000
     index 28-31  (0xE000-0xFFFF):  -0x800
   i.e. the two ranges trade places in the sort order. */
static short utf16Fixup[32] =
{
    /*  0- 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8-15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16-23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24-31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
3154
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155static int
3156unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3157{
3158 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003159
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 Py_UNICODE *s1 = str1->str;
3161 Py_UNICODE *s2 = str2->str;
3162
3163 len1 = str1->length;
3164 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003165
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003167 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003168 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003169
3170 c1 = *s1++;
3171 c2 = *s2++;
3172 if (c1 > (1<<11) * 26)
3173 c1 += utf16Fixup[c1>>11];
3174 if (c2 > (1<<11) * 26)
3175 c2 += utf16Fixup[c2>>11];
3176
3177 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003178 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003179 if (diff)
3180 return (diff < 0) ? -1 : (diff != 0);
3181 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 }
3183
3184 return (len1 < len2) ? -1 : (len1 != len2);
3185}
3186
3187int PyUnicode_Compare(PyObject *left,
3188 PyObject *right)
3189{
3190 PyUnicodeObject *u = NULL, *v = NULL;
3191 int result;
3192
3193 /* Coerce the two arguments */
3194 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3195 if (u == NULL)
3196 goto onError;
3197 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3198 if (v == NULL)
3199 goto onError;
3200
3201 /* Shortcut for emtpy or interned objects */
3202 if (v == u) {
3203 Py_DECREF(u);
3204 Py_DECREF(v);
3205 return 0;
3206 }
3207
3208 result = unicode_compare(u, v);
3209
3210 Py_DECREF(u);
3211 Py_DECREF(v);
3212 return result;
3213
3214onError:
3215 Py_XDECREF(u);
3216 Py_XDECREF(v);
3217 return -1;
3218}
3219
Guido van Rossum403d68b2000-03-13 15:55:09 +00003220int PyUnicode_Contains(PyObject *container,
3221 PyObject *element)
3222{
3223 PyUnicodeObject *u = NULL, *v = NULL;
3224 int result;
3225 register const Py_UNICODE *p, *e;
3226 register Py_UNICODE ch;
3227
3228 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003229 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003230 if (v == NULL) {
3231 PyErr_SetString(PyExc_TypeError,
3232 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003233 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003234 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003235 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3236 if (u == NULL) {
3237 Py_DECREF(v);
3238 goto onError;
3239 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003240
3241 /* Check v in u */
3242 if (PyUnicode_GET_SIZE(v) != 1) {
3243 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003244 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003245 goto onError;
3246 }
3247 ch = *PyUnicode_AS_UNICODE(v);
3248 p = PyUnicode_AS_UNICODE(u);
3249 e = p + PyUnicode_GET_SIZE(u);
3250 result = 0;
3251 while (p < e) {
3252 if (*p++ == ch) {
3253 result = 1;
3254 break;
3255 }
3256 }
3257
3258 Py_DECREF(u);
3259 Py_DECREF(v);
3260 return result;
3261
3262onError:
3263 Py_XDECREF(u);
3264 Py_XDECREF(v);
3265 return -1;
3266}
3267
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268/* Concat to string or Unicode object giving a new Unicode object. */
3269
3270PyObject *PyUnicode_Concat(PyObject *left,
3271 PyObject *right)
3272{
3273 PyUnicodeObject *u = NULL, *v = NULL, *w;
3274
3275 /* Coerce the two arguments */
3276 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3277 if (u == NULL)
3278 goto onError;
3279 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3280 if (v == NULL)
3281 goto onError;
3282
3283 /* Shortcuts */
3284 if (v == unicode_empty) {
3285 Py_DECREF(v);
3286 return (PyObject *)u;
3287 }
3288 if (u == unicode_empty) {
3289 Py_DECREF(u);
3290 return (PyObject *)v;
3291 }
3292
3293 /* Concat the two Unicode strings */
3294 w = _PyUnicode_New(u->length + v->length);
3295 if (w == NULL)
3296 goto onError;
3297 Py_UNICODE_COPY(w->str, u->str, u->length);
3298 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3299
3300 Py_DECREF(u);
3301 Py_DECREF(v);
3302 return (PyObject *)w;
3303
3304onError:
3305 Py_XDECREF(u);
3306 Py_XDECREF(v);
3307 return NULL;
3308}
3309
3310static char count__doc__[] =
3311"S.count(sub[, start[, end]]) -> int\n\
3312\n\
3313Return the number of occurrences of substring sub in Unicode string\n\
3314S[start:end]. Optional arguments start and end are\n\
3315interpreted as in slice notation.";
3316
3317static PyObject *
3318unicode_count(PyUnicodeObject *self, PyObject *args)
3319{
3320 PyUnicodeObject *substring;
3321 int start = 0;
3322 int end = INT_MAX;
3323 PyObject *result;
3324
Guido van Rossumb8872e62000-05-09 14:14:27 +00003325 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3326 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 return NULL;
3328
3329 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3330 (PyObject *)substring);
3331 if (substring == NULL)
3332 return NULL;
3333
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 if (start < 0)
3335 start += self->length;
3336 if (start < 0)
3337 start = 0;
3338 if (end > self->length)
3339 end = self->length;
3340 if (end < 0)
3341 end += self->length;
3342 if (end < 0)
3343 end = 0;
3344
3345 result = PyInt_FromLong((long) count(self, start, end, substring));
3346
3347 Py_DECREF(substring);
3348 return result;
3349}
3350
3351static char encode__doc__[] =
3352"S.encode([encoding[,errors]]) -> string\n\
3353\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003354Return an encoded string version of S. Default encoding is the current\n\
3355default string encoding. errors may be given to set a different error\n\
3356handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3357a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358
3359static PyObject *
3360unicode_encode(PyUnicodeObject *self, PyObject *args)
3361{
3362 char *encoding = NULL;
3363 char *errors = NULL;
3364 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3365 return NULL;
3366 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3367}
3368
3369static char expandtabs__doc__[] =
3370"S.expandtabs([tabsize]) -> unicode\n\
3371\n\
3372Return a copy of S where all tab characters are expanded using spaces.\n\
3373If tabsize is not given, a tab size of 8 characters is assumed.";
3374
3375static PyObject*
3376unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3377{
3378 Py_UNICODE *e;
3379 Py_UNICODE *p;
3380 Py_UNICODE *q;
3381 int i, j;
3382 PyUnicodeObject *u;
3383 int tabsize = 8;
3384
3385 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3386 return NULL;
3387
3388 /* First pass: determine size of ouput string */
3389 i = j = 0;
3390 e = self->str + self->length;
3391 for (p = self->str; p < e; p++)
3392 if (*p == '\t') {
3393 if (tabsize > 0)
3394 j += tabsize - (j % tabsize);
3395 }
3396 else {
3397 j++;
3398 if (*p == '\n' || *p == '\r') {
3399 i += j;
3400 j = 0;
3401 }
3402 }
3403
3404 /* Second pass: create output string and fill it */
3405 u = _PyUnicode_New(i + j);
3406 if (!u)
3407 return NULL;
3408
3409 j = 0;
3410 q = u->str;
3411
3412 for (p = self->str; p < e; p++)
3413 if (*p == '\t') {
3414 if (tabsize > 0) {
3415 i = tabsize - (j % tabsize);
3416 j += i;
3417 while (i--)
3418 *q++ = ' ';
3419 }
3420 }
3421 else {
3422 j++;
3423 *q++ = *p;
3424 if (*p == '\n' || *p == '\r')
3425 j = 0;
3426 }
3427
3428 return (PyObject*) u;
3429}
3430
/* Docstring of the "find" method. */
static char find__doc__[] =
"S.find(sub [,start [,end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
3439
3440static PyObject *
3441unicode_find(PyUnicodeObject *self, PyObject *args)
3442{
3443 PyUnicodeObject *substring;
3444 int start = 0;
3445 int end = INT_MAX;
3446 PyObject *result;
3447
Guido van Rossumb8872e62000-05-09 14:14:27 +00003448 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3449 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 return NULL;
3451 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3452 (PyObject *)substring);
3453 if (substring == NULL)
3454 return NULL;
3455
3456 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3457
3458 Py_DECREF(substring);
3459 return result;
3460}
3461
3462static PyObject *
3463unicode_getitem(PyUnicodeObject *self, int index)
3464{
3465 if (index < 0 || index >= self->length) {
3466 PyErr_SetString(PyExc_IndexError, "string index out of range");
3467 return NULL;
3468 }
3469
3470 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3471}
3472
3473static long
3474unicode_hash(PyUnicodeObject *self)
3475{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003476 /* Since Unicode objects compare equal to their ASCII string
3477 counterparts, they should use the individual character values
3478 as basis for their hash value. This is needed to assure that
3479 strings and Unicode objects behave in the same way as
3480 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481
Fredrik Lundhdde61642000-07-10 18:27:47 +00003482 register int len;
3483 register Py_UNICODE *p;
3484 register long x;
3485
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 if (self->hash != -1)
3487 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003488 len = PyUnicode_GET_SIZE(self);
3489 p = PyUnicode_AS_UNICODE(self);
3490 x = *p << 7;
3491 while (--len >= 0)
3492 x = (1000003*x) ^ *p++;
3493 x ^= PyUnicode_GET_SIZE(self);
3494 if (x == -1)
3495 x = -2;
3496 self->hash = x;
3497 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498}
3499
3500static char index__doc__[] =
3501"S.index(sub [,start [,end]]) -> int\n\
3502\n\
3503Like S.find() but raise ValueError when the substring is not found.";
3504
3505static PyObject *
3506unicode_index(PyUnicodeObject *self, PyObject *args)
3507{
3508 int result;
3509 PyUnicodeObject *substring;
3510 int start = 0;
3511 int end = INT_MAX;
3512
Guido van Rossumb8872e62000-05-09 14:14:27 +00003513 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3514 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 return NULL;
3516
3517 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3518 (PyObject *)substring);
3519 if (substring == NULL)
3520 return NULL;
3521
3522 result = findstring(self, substring, start, end, 1);
3523
3524 Py_DECREF(substring);
3525 if (result < 0) {
3526 PyErr_SetString(PyExc_ValueError, "substring not found");
3527 return NULL;
3528 }
3529 return PyInt_FromLong(result);
3530}
3531
3532static char islower__doc__[] =
3533"S.islower() -> int\n\
3534\n\
3535Return 1 if all cased characters in S are lowercase and there is\n\
3536at least one cased character in S, 0 otherwise.";
3537
3538static PyObject*
3539unicode_islower(PyUnicodeObject *self, PyObject *args)
3540{
3541 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3542 register const Py_UNICODE *e;
3543 int cased;
3544
3545 if (!PyArg_NoArgs(args))
3546 return NULL;
3547
3548 /* Shortcut for single character strings */
3549 if (PyUnicode_GET_SIZE(self) == 1)
3550 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3551
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003552 /* Special case for empty strings */
3553 if (PyString_GET_SIZE(self) == 0)
3554 return PyInt_FromLong(0);
3555
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 e = p + PyUnicode_GET_SIZE(self);
3557 cased = 0;
3558 for (; p < e; p++) {
3559 register const Py_UNICODE ch = *p;
3560
3561 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3562 return PyInt_FromLong(0);
3563 else if (!cased && Py_UNICODE_ISLOWER(ch))
3564 cased = 1;
3565 }
3566 return PyInt_FromLong(cased);
3567}
3568
3569static char isupper__doc__[] =
3570"S.isupper() -> int\n\
3571\n\
3572Return 1 if all cased characters in S are uppercase and there is\n\
3573at least one cased character in S, 0 otherwise.";
3574
3575static PyObject*
3576unicode_isupper(PyUnicodeObject *self, PyObject *args)
3577{
3578 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3579 register const Py_UNICODE *e;
3580 int cased;
3581
3582 if (!PyArg_NoArgs(args))
3583 return NULL;
3584
3585 /* Shortcut for single character strings */
3586 if (PyUnicode_GET_SIZE(self) == 1)
3587 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3588
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003589 /* Special case for empty strings */
3590 if (PyString_GET_SIZE(self) == 0)
3591 return PyInt_FromLong(0);
3592
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 e = p + PyUnicode_GET_SIZE(self);
3594 cased = 0;
3595 for (; p < e; p++) {
3596 register const Py_UNICODE ch = *p;
3597
3598 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3599 return PyInt_FromLong(0);
3600 else if (!cased && Py_UNICODE_ISUPPER(ch))
3601 cased = 1;
3602 }
3603 return PyInt_FromLong(cased);
3604}
3605
3606static char istitle__doc__[] =
3607"S.istitle() -> int\n\
3608\n\
3609Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3610may only follow uncased characters and lowercase characters only cased\n\
3611ones. Return 0 otherwise.";
3612
3613static PyObject*
3614unicode_istitle(PyUnicodeObject *self, PyObject *args)
3615{
3616 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3617 register const Py_UNICODE *e;
3618 int cased, previous_is_cased;
3619
3620 if (!PyArg_NoArgs(args))
3621 return NULL;
3622
3623 /* Shortcut for single character strings */
3624 if (PyUnicode_GET_SIZE(self) == 1)
3625 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3626 (Py_UNICODE_ISUPPER(*p) != 0));
3627
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003628 /* Special case for empty strings */
3629 if (PyString_GET_SIZE(self) == 0)
3630 return PyInt_FromLong(0);
3631
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632 e = p + PyUnicode_GET_SIZE(self);
3633 cased = 0;
3634 previous_is_cased = 0;
3635 for (; p < e; p++) {
3636 register const Py_UNICODE ch = *p;
3637
3638 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3639 if (previous_is_cased)
3640 return PyInt_FromLong(0);
3641 previous_is_cased = 1;
3642 cased = 1;
3643 }
3644 else if (Py_UNICODE_ISLOWER(ch)) {
3645 if (!previous_is_cased)
3646 return PyInt_FromLong(0);
3647 previous_is_cased = 1;
3648 cased = 1;
3649 }
3650 else
3651 previous_is_cased = 0;
3652 }
3653 return PyInt_FromLong(cased);
3654}
3655
3656static char isspace__doc__[] =
3657"S.isspace() -> int\n\
3658\n\
3659Return 1 if there are only whitespace characters in S,\n\
36600 otherwise.";
3661
3662static PyObject*
3663unicode_isspace(PyUnicodeObject *self, PyObject *args)
3664{
3665 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3666 register const Py_UNICODE *e;
3667
3668 if (!PyArg_NoArgs(args))
3669 return NULL;
3670
3671 /* Shortcut for single character strings */
3672 if (PyUnicode_GET_SIZE(self) == 1 &&
3673 Py_UNICODE_ISSPACE(*p))
3674 return PyInt_FromLong(1);
3675
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003676 /* Special case for empty strings */
3677 if (PyString_GET_SIZE(self) == 0)
3678 return PyInt_FromLong(0);
3679
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 e = p + PyUnicode_GET_SIZE(self);
3681 for (; p < e; p++) {
3682 if (!Py_UNICODE_ISSPACE(*p))
3683 return PyInt_FromLong(0);
3684 }
3685 return PyInt_FromLong(1);
3686}
3687
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003688static char isalpha__doc__[] =
3689"S.isalpha() -> int\n\
3690\n\
3691Return 1 if all characters in S are alphabetic\n\
3692and there is at least one character in S, 0 otherwise.";
3693
3694static PyObject*
3695unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3696{
3697 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3698 register const Py_UNICODE *e;
3699
3700 if (!PyArg_NoArgs(args))
3701 return NULL;
3702
3703 /* Shortcut for single character strings */
3704 if (PyUnicode_GET_SIZE(self) == 1 &&
3705 Py_UNICODE_ISALPHA(*p))
3706 return PyInt_FromLong(1);
3707
3708 /* Special case for empty strings */
3709 if (PyString_GET_SIZE(self) == 0)
3710 return PyInt_FromLong(0);
3711
3712 e = p + PyUnicode_GET_SIZE(self);
3713 for (; p < e; p++) {
3714 if (!Py_UNICODE_ISALPHA(*p))
3715 return PyInt_FromLong(0);
3716 }
3717 return PyInt_FromLong(1);
3718}
3719
3720static char isalnum__doc__[] =
3721"S.isalnum() -> int\n\
3722\n\
3723Return 1 if all characters in S are alphanumeric\n\
3724and there is at least one character in S, 0 otherwise.";
3725
3726static PyObject*
3727unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3728{
3729 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3730 register const Py_UNICODE *e;
3731
3732 if (!PyArg_NoArgs(args))
3733 return NULL;
3734
3735 /* Shortcut for single character strings */
3736 if (PyUnicode_GET_SIZE(self) == 1 &&
3737 Py_UNICODE_ISALNUM(*p))
3738 return PyInt_FromLong(1);
3739
3740 /* Special case for empty strings */
3741 if (PyString_GET_SIZE(self) == 0)
3742 return PyInt_FromLong(0);
3743
3744 e = p + PyUnicode_GET_SIZE(self);
3745 for (; p < e; p++) {
3746 if (!Py_UNICODE_ISALNUM(*p))
3747 return PyInt_FromLong(0);
3748 }
3749 return PyInt_FromLong(1);
3750}
3751
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752static char isdecimal__doc__[] =
3753"S.isdecimal() -> int\n\
3754\n\
3755Return 1 if there are only decimal characters in S,\n\
37560 otherwise.";
3757
3758static PyObject*
3759unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3760{
3761 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3762 register const Py_UNICODE *e;
3763
3764 if (!PyArg_NoArgs(args))
3765 return NULL;
3766
3767 /* Shortcut for single character strings */
3768 if (PyUnicode_GET_SIZE(self) == 1 &&
3769 Py_UNICODE_ISDECIMAL(*p))
3770 return PyInt_FromLong(1);
3771
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003772 /* Special case for empty strings */
3773 if (PyString_GET_SIZE(self) == 0)
3774 return PyInt_FromLong(0);
3775
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 e = p + PyUnicode_GET_SIZE(self);
3777 for (; p < e; p++) {
3778 if (!Py_UNICODE_ISDECIMAL(*p))
3779 return PyInt_FromLong(0);
3780 }
3781 return PyInt_FromLong(1);
3782}
3783
3784static char isdigit__doc__[] =
3785"S.isdigit() -> int\n\
3786\n\
3787Return 1 if there are only digit characters in S,\n\
37880 otherwise.";
3789
3790static PyObject*
3791unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3792{
3793 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3794 register const Py_UNICODE *e;
3795
3796 if (!PyArg_NoArgs(args))
3797 return NULL;
3798
3799 /* Shortcut for single character strings */
3800 if (PyUnicode_GET_SIZE(self) == 1 &&
3801 Py_UNICODE_ISDIGIT(*p))
3802 return PyInt_FromLong(1);
3803
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003804 /* Special case for empty strings */
3805 if (PyString_GET_SIZE(self) == 0)
3806 return PyInt_FromLong(0);
3807
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 e = p + PyUnicode_GET_SIZE(self);
3809 for (; p < e; p++) {
3810 if (!Py_UNICODE_ISDIGIT(*p))
3811 return PyInt_FromLong(0);
3812 }
3813 return PyInt_FromLong(1);
3814}
3815
3816static char isnumeric__doc__[] =
3817"S.isnumeric() -> int\n\
3818\n\
3819Return 1 if there are only numeric characters in S,\n\
38200 otherwise.";
3821
3822static PyObject*
3823unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3824{
3825 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3826 register const Py_UNICODE *e;
3827
3828 if (!PyArg_NoArgs(args))
3829 return NULL;
3830
3831 /* Shortcut for single character strings */
3832 if (PyUnicode_GET_SIZE(self) == 1 &&
3833 Py_UNICODE_ISNUMERIC(*p))
3834 return PyInt_FromLong(1);
3835
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003836 /* Special case for empty strings */
3837 if (PyString_GET_SIZE(self) == 0)
3838 return PyInt_FromLong(0);
3839
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 e = p + PyUnicode_GET_SIZE(self);
3841 for (; p < e; p++) {
3842 if (!Py_UNICODE_ISNUMERIC(*p))
3843 return PyInt_FromLong(0);
3844 }
3845 return PyInt_FromLong(1);
3846}
3847
3848static char join__doc__[] =
3849"S.join(sequence) -> unicode\n\
3850\n\
3851Return a string which is the concatenation of the strings in the\n\
3852sequence. The separator between elements is S.";
3853
3854static PyObject*
3855unicode_join(PyUnicodeObject *self, PyObject *args)
3856{
3857 PyObject *data;
3858 if (!PyArg_ParseTuple(args, "O:join", &data))
3859 return NULL;
3860
3861 return PyUnicode_Join((PyObject *)self, data);
3862}
3863
3864static int
3865unicode_length(PyUnicodeObject *self)
3866{
3867 return self->length;
3868}
3869
3870static char ljust__doc__[] =
3871"S.ljust(width) -> unicode\n\
3872\n\
3873Return S left justified in a Unicode string of length width. Padding is\n\
3874done using spaces.";
3875
3876static PyObject *
3877unicode_ljust(PyUnicodeObject *self, PyObject *args)
3878{
3879 int width;
3880 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3881 return NULL;
3882
3883 if (self->length >= width) {
3884 Py_INCREF(self);
3885 return (PyObject*) self;
3886 }
3887
3888 return (PyObject*) pad(self, 0, width - self->length, ' ');
3889}
3890
3891static char lower__doc__[] =
3892"S.lower() -> unicode\n\
3893\n\
3894Return a copy of the string S converted to lowercase.";
3895
3896static PyObject*
3897unicode_lower(PyUnicodeObject *self, PyObject *args)
3898{
3899 if (!PyArg_NoArgs(args))
3900 return NULL;
3901 return fixup(self, fixlower);
3902}
3903
3904static char lstrip__doc__[] =
3905"S.lstrip() -> unicode\n\
3906\n\
3907Return a copy of the string S with leading whitespace removed.";
3908
3909static PyObject *
3910unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3911{
3912 if (!PyArg_NoArgs(args))
3913 return NULL;
3914 return strip(self, 1, 0);
3915}
3916
3917static PyObject*
3918unicode_repeat(PyUnicodeObject *str, int len)
3919{
3920 PyUnicodeObject *u;
3921 Py_UNICODE *p;
3922
3923 if (len < 0)
3924 len = 0;
3925
3926 if (len == 1) {
3927 /* no repeat, return original string */
3928 Py_INCREF(str);
3929 return (PyObject*) str;
3930 }
3931
3932 u = _PyUnicode_New(len * str->length);
3933 if (!u)
3934 return NULL;
3935
3936 p = u->str;
3937
3938 while (len-- > 0) {
3939 Py_UNICODE_COPY(p, str->str, str->length);
3940 p += str->length;
3941 }
3942
3943 return (PyObject*) u;
3944}
3945
3946PyObject *PyUnicode_Replace(PyObject *obj,
3947 PyObject *subobj,
3948 PyObject *replobj,
3949 int maxcount)
3950{
3951 PyObject *self;
3952 PyObject *str1;
3953 PyObject *str2;
3954 PyObject *result;
3955
3956 self = PyUnicode_FromObject(obj);
3957 if (self == NULL)
3958 return NULL;
3959 str1 = PyUnicode_FromObject(subobj);
3960 if (str1 == NULL) {
3961 Py_DECREF(self);
3962 return NULL;
3963 }
3964 str2 = PyUnicode_FromObject(replobj);
3965 if (str2 == NULL) {
3966 Py_DECREF(self);
3967 Py_DECREF(str1);
3968 return NULL;
3969 }
3970 result = replace((PyUnicodeObject *)self,
3971 (PyUnicodeObject *)str1,
3972 (PyUnicodeObject *)str2,
3973 maxcount);
3974 Py_DECREF(self);
3975 Py_DECREF(str1);
3976 Py_DECREF(str2);
3977 return result;
3978}
3979
/* Docstring of the "replace" method. */
static char replace__doc__[] =
"S.replace(old, new[, maxcount]) -> unicode\n\
\n\
Return a copy of S with all occurrences of substring\n\
old replaced by new. If the optional argument maxcount is\n\
given, only the first maxcount occurrences are replaced.";
3986
3987static PyObject*
3988unicode_replace(PyUnicodeObject *self, PyObject *args)
3989{
3990 PyUnicodeObject *str1;
3991 PyUnicodeObject *str2;
3992 int maxcount = -1;
3993 PyObject *result;
3994
3995 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3996 return NULL;
3997 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3998 if (str1 == NULL)
3999 return NULL;
4000 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4001 if (str2 == NULL)
4002 return NULL;
4003
4004 result = replace(self, str1, str2, maxcount);
4005
4006 Py_DECREF(str1);
4007 Py_DECREF(str2);
4008 return result;
4009}
4010
4011static
4012PyObject *unicode_repr(PyObject *unicode)
4013{
4014 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4015 PyUnicode_GET_SIZE(unicode),
4016 1);
4017}
4018
/* Docstring of the "rfind" method. */
static char rfind__doc__[] =
"S.rfind(sub [,start [,end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4027
4028static PyObject *
4029unicode_rfind(PyUnicodeObject *self, PyObject *args)
4030{
4031 PyUnicodeObject *substring;
4032 int start = 0;
4033 int end = INT_MAX;
4034 PyObject *result;
4035
Guido van Rossumb8872e62000-05-09 14:14:27 +00004036 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4037 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 return NULL;
4039 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4040 (PyObject *)substring);
4041 if (substring == NULL)
4042 return NULL;
4043
4044 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4045
4046 Py_DECREF(substring);
4047 return result;
4048}
4049
4050static char rindex__doc__[] =
4051"S.rindex(sub [,start [,end]]) -> int\n\
4052\n\
4053Like S.rfind() but raise ValueError when the substring is not found.";
4054
4055static PyObject *
4056unicode_rindex(PyUnicodeObject *self, PyObject *args)
4057{
4058 int result;
4059 PyUnicodeObject *substring;
4060 int start = 0;
4061 int end = INT_MAX;
4062
Guido van Rossumb8872e62000-05-09 14:14:27 +00004063 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4064 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 return NULL;
4066 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4067 (PyObject *)substring);
4068 if (substring == NULL)
4069 return NULL;
4070
4071 result = findstring(self, substring, start, end, -1);
4072
4073 Py_DECREF(substring);
4074 if (result < 0) {
4075 PyErr_SetString(PyExc_ValueError, "substring not found");
4076 return NULL;
4077 }
4078 return PyInt_FromLong(result);
4079}
4080
4081static char rjust__doc__[] =
4082"S.rjust(width) -> unicode\n\
4083\n\
4084Return S right justified in a Unicode string of length width. Padding is\n\
4085done using spaces.";
4086
4087static PyObject *
4088unicode_rjust(PyUnicodeObject *self, PyObject *args)
4089{
4090 int width;
4091 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4092 return NULL;
4093
4094 if (self->length >= width) {
4095 Py_INCREF(self);
4096 return (PyObject*) self;
4097 }
4098
4099 return (PyObject*) pad(self, width - self->length, 0, ' ');
4100}
4101
4102static char rstrip__doc__[] =
4103"S.rstrip() -> unicode\n\
4104\n\
4105Return a copy of the string S with trailing whitespace removed.";
4106
4107static PyObject *
4108unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4109{
4110 if (!PyArg_NoArgs(args))
4111 return NULL;
4112 return strip(self, 0, 1);
4113}
4114
4115static PyObject*
4116unicode_slice(PyUnicodeObject *self, int start, int end)
4117{
4118 /* standard clamping */
4119 if (start < 0)
4120 start = 0;
4121 if (end < 0)
4122 end = 0;
4123 if (end > self->length)
4124 end = self->length;
4125 if (start == 0 && end == self->length) {
4126 /* full slice, return original string */
4127 Py_INCREF(self);
4128 return (PyObject*) self;
4129 }
4130 if (start > end)
4131 start = end;
4132 /* copy slice */
4133 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4134 end - start);
4135}
4136
4137PyObject *PyUnicode_Split(PyObject *s,
4138 PyObject *sep,
4139 int maxsplit)
4140{
4141 PyObject *result;
4142
4143 s = PyUnicode_FromObject(s);
4144 if (s == NULL)
4145 return NULL;
4146 if (sep != NULL) {
4147 sep = PyUnicode_FromObject(sep);
4148 if (sep == NULL) {
4149 Py_DECREF(s);
4150 return NULL;
4151 }
4152 }
4153
4154 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4155
4156 Py_DECREF(s);
4157 Py_XDECREF(sep);
4158 return result;
4159}
4160
4161static char split__doc__[] =
4162"S.split([sep [,maxsplit]]) -> list of strings\n\
4163\n\
4164Return a list of the words in S, using sep as the\n\
4165delimiter string. If maxsplit is given, at most maxsplit\n\
4166splits are done. If sep is not specified, any whitespace string\n\
4167is a separator.";
4168
4169static PyObject*
4170unicode_split(PyUnicodeObject *self, PyObject *args)
4171{
4172 PyObject *substring = Py_None;
4173 int maxcount = -1;
4174
4175 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4176 return NULL;
4177
4178 if (substring == Py_None)
4179 return split(self, NULL, maxcount);
4180 else if (PyUnicode_Check(substring))
4181 return split(self, (PyUnicodeObject *)substring, maxcount);
4182 else
4183 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4184}
4185
/* Docstring of the "splitlines" method. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192
4193static PyObject*
4194unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4195{
Guido van Rossum86662912000-04-11 15:38:46 +00004196 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197
Guido van Rossum86662912000-04-11 15:38:46 +00004198 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 return NULL;
4200
Guido van Rossum86662912000-04-11 15:38:46 +00004201 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202}
4203
4204static
4205PyObject *unicode_str(PyUnicodeObject *self)
4206{
Fred Drakee4315f52000-05-09 19:53:39 +00004207 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208}
4209
4210static char strip__doc__[] =
4211"S.strip() -> unicode\n\
4212\n\
4213Return a copy of S with leading and trailing whitespace removed.";
4214
4215static PyObject *
4216unicode_strip(PyUnicodeObject *self, PyObject *args)
4217{
4218 if (!PyArg_NoArgs(args))
4219 return NULL;
4220 return strip(self, 1, 1);
4221}
4222
4223static char swapcase__doc__[] =
4224"S.swapcase() -> unicode\n\
4225\n\
4226Return a copy of S with uppercase characters converted to lowercase\n\
4227and vice versa.";
4228
4229static PyObject*
4230unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4231{
4232 if (!PyArg_NoArgs(args))
4233 return NULL;
4234 return fixup(self, fixswapcase);
4235}
4236
4237static char translate__doc__[] =
4238"S.translate(table) -> unicode\n\
4239\n\
4240Return a copy of the string S, where all characters have been mapped\n\
4241through the given translation table, which must be a mapping of\n\
4242Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4243are left untouched. Characters mapped to None are deleted.";
4244
4245static PyObject*
4246unicode_translate(PyUnicodeObject *self, PyObject *args)
4247{
4248 PyObject *table;
4249
4250 if (!PyArg_ParseTuple(args, "O:translate", &table))
4251 return NULL;
4252 return PyUnicode_TranslateCharmap(self->str,
4253 self->length,
4254 table,
4255 "ignore");
4256}
4257
4258static char upper__doc__[] =
4259"S.upper() -> unicode\n\
4260\n\
4261Return a copy of S converted to uppercase.";
4262
4263static PyObject*
4264unicode_upper(PyUnicodeObject *self, PyObject *args)
4265{
4266 if (!PyArg_NoArgs(args))
4267 return NULL;
4268 return fixup(self, fixupper);
4269}
4270
#if 0
/* Currently compiled out. */
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string S with zeros on the left, to fill a field\n\
of the specified width. The string S is never truncated.";

/* Implementation of S.zfill(width).

   When S is already at least width characters long, S itself is
   returned (with a new reference).  Otherwise pad() adds
   width - length '0' characters on the left, and a leading '+' or '-'
   of the original text is moved in front of the zeros.  Returns NULL
   if the argument cannot be parsed or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;
    int width;

    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* NOTE(review): pad() presumably returns NULL when it cannot
       allocate the result -- never dereference an unchecked pointer */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4306
#if 0
/* Debugging aid (currently compiled out): takes no arguments and
   returns unicode_freelist_size as a Python integer. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4316
4317static char startswith__doc__[] =
4318"S.startswith(prefix[, start[, end]]) -> int\n\
4319\n\
4320Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4321optional start, test S beginning at that position. With optional end, stop\n\
4322comparing S at that position.";
4323
4324static PyObject *
4325unicode_startswith(PyUnicodeObject *self,
4326 PyObject *args)
4327{
4328 PyUnicodeObject *substring;
4329 int start = 0;
4330 int end = INT_MAX;
4331 PyObject *result;
4332
Guido van Rossumb8872e62000-05-09 14:14:27 +00004333 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4334 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 return NULL;
4336 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4337 (PyObject *)substring);
4338 if (substring == NULL)
4339 return NULL;
4340
4341 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4342
4343 Py_DECREF(substring);
4344 return result;
4345}
4346
4347
4348static char endswith__doc__[] =
4349"S.endswith(suffix[, start[, end]]) -> int\n\
4350\n\
4351Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4352optional start, test S beginning at that position. With optional end, stop\n\
4353comparing S at that position.";
4354
4355static PyObject *
4356unicode_endswith(PyUnicodeObject *self,
4357 PyObject *args)
4358{
4359 PyUnicodeObject *substring;
4360 int start = 0;
4361 int end = INT_MAX;
4362 PyObject *result;
4363
Guido van Rossumb8872e62000-05-09 14:14:27 +00004364 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4365 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366 return NULL;
4367 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4368 (PyObject *)substring);
4369 if (substring == NULL)
4370 return NULL;
4371
4372 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4373
4374 Py_DECREF(substring);
4375 return result;
4376}
4377
4378
4379static PyMethodDef unicode_methods[] = {
4380
4381 /* Order is according to common usage: often used methods should
4382 appear first, since lookup is done sequentially. */
4383
4384 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4385 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4386 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4387 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4388 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4389 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4390 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4391 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4392 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4393 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4394 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4395 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4396 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4397 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4398/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4399 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4400 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4401 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4402 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4403 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4404 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4405 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4406 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4407 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4408 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4409 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4410 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4411 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4412 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4413 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4414 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4415 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4416 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004417 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4418 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419#if 0
4420 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4421 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4422#endif
4423
4424#if 0
4425 /* This one is just used for debugging the implementation. */
4426 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4427#endif
4428
4429 {NULL, NULL}
4430};
4431
4432static PyObject *
4433unicode_getattr(PyUnicodeObject *self, char *name)
4434{
4435 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4436}
4437
4438static PySequenceMethods unicode_as_sequence = {
4439 (inquiry) unicode_length, /* sq_length */
4440 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4441 (intargfunc) unicode_repeat, /* sq_repeat */
4442 (intargfunc) unicode_getitem, /* sq_item */
4443 (intintargfunc) unicode_slice, /* sq_slice */
4444 0, /* sq_ass_item */
4445 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004446 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447};
4448
4449static int
4450unicode_buffer_getreadbuf(PyUnicodeObject *self,
4451 int index,
4452 const void **ptr)
4453{
4454 if (index != 0) {
4455 PyErr_SetString(PyExc_SystemError,
4456 "accessing non-existent unicode segment");
4457 return -1;
4458 }
4459 *ptr = (void *) self->str;
4460 return PyUnicode_GET_DATA_SIZE(self);
4461}
4462
4463static int
4464unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4465 const void **ptr)
4466{
4467 PyErr_SetString(PyExc_TypeError,
4468 "cannot use unicode as modifyable buffer");
4469 return -1;
4470}
4471
4472static int
4473unicode_buffer_getsegcount(PyUnicodeObject *self,
4474 int *lenp)
4475{
4476 if (lenp)
4477 *lenp = PyUnicode_GET_DATA_SIZE(self);
4478 return 1;
4479}
4480
4481static int
4482unicode_buffer_getcharbuf(PyUnicodeObject *self,
4483 int index,
4484 const void **ptr)
4485{
4486 PyObject *str;
4487
4488 if (index != 0) {
4489 PyErr_SetString(PyExc_SystemError,
4490 "accessing non-existent unicode segment");
4491 return -1;
4492 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004493 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 if (str == NULL)
4495 return -1;
4496 *ptr = (void *) PyString_AS_STRING(str);
4497 return PyString_GET_SIZE(str);
4498}
4499
4500/* Helpers for PyUnicode_Format() */
4501
4502static PyObject *
4503getnextarg(args, arglen, p_argidx)
4504 PyObject *args;
4505int arglen;
4506int *p_argidx;
4507{
4508 int argidx = *p_argidx;
4509 if (argidx < arglen) {
4510 (*p_argidx)++;
4511 if (arglen < 0)
4512 return args;
4513 else
4514 return PyTuple_GetItem(args, argidx);
4515 }
4516 PyErr_SetString(PyExc_TypeError,
4517 "not enough arguments for format string");
4518 return NULL;
4519}
4520
/* Flag bits collected while parsing a format specifier; each one is set
   by the flag character noted beside it. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4526
4527static
4528#ifdef HAVE_STDARG_PROTOTYPES
4529int usprintf(register Py_UNICODE *buffer, char *format, ...)
4530#else
4531int usprintf(va_alist) va_dcl
4532#endif
4533{
4534 register int i;
4535 int len;
4536 va_list va;
4537 char *charbuffer;
4538#ifdef HAVE_STDARG_PROTOTYPES
4539 va_start(va, format);
4540#else
4541 Py_UNICODE *args;
4542 char *format;
4543
4544 va_start(va);
4545 buffer = va_arg(va, Py_UNICODE *);
4546 format = va_arg(va, char *);
4547#endif
4548
4549 /* First, format the string as char array, then expand to Py_UNICODE
4550 array. */
4551 charbuffer = (char *)buffer;
4552 len = vsprintf(charbuffer, format, va);
4553 for (i = len - 1; i >= 0; i--)
4554 buffer[i] = (Py_UNICODE) charbuffer[i];
4555
4556 va_end(va);
4557 return len;
4558}
4559
4560static int
4561formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004562 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 int flags,
4564 int prec,
4565 int type,
4566 PyObject *v)
4567{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004568 /* fmt = '%#.' + `prec` + `type`
4569 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 char fmt[20];
4571 double x;
4572
4573 x = PyFloat_AsDouble(v);
4574 if (x == -1.0 && PyErr_Occurred())
4575 return -1;
4576 if (prec < 0)
4577 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4579 type = 'g';
4580 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004581 /* worst case length calc to ensure no buffer overrun:
4582 fmt = %#.<prec>g
4583 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4584 for any double rep.)
4585 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4586 If prec=0 the effective precision is 1 (the leading digit is
4587 always given), therefore increase by one to 10+prec. */
4588 if (buflen <= (size_t)10 + (size_t)prec) {
4589 PyErr_SetString(PyExc_OverflowError,
4590 "formatted float is too long (precision too long?)");
4591 return -1;
4592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593 return usprintf(buf, fmt, x);
4594}
4595
4596static int
4597formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004598 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 int flags,
4600 int prec,
4601 int type,
4602 PyObject *v)
4603{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004604 /* fmt = '%#.' + `prec` + 'l' + `type`
4605 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 char fmt[20];
4607 long x;
4608
4609 x = PyInt_AsLong(v);
4610 if (x == -1 && PyErr_Occurred())
4611 return -1;
4612 if (prec < 0)
4613 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004614 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4615 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4616 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4617 PyErr_SetString(PyExc_OverflowError,
4618 "formatted integer is too long (precision too long?)");
4619 return -1;
4620 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4622 return usprintf(buf, fmt, x);
4623}
4624
4625static int
4626formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004627 size_t buflen,
4628 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004630 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004631 if (PyUnicode_Check(v)) {
4632 if (PyUnicode_GET_SIZE(v) != 1)
4633 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004637 else if (PyString_Check(v)) {
4638 if (PyString_GET_SIZE(v) != 1)
4639 goto onError;
4640 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642
4643 else {
4644 /* Integer input truncated to a character */
4645 long x;
4646 x = PyInt_AsLong(v);
4647 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004648 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 buf[0] = (char) x;
4650 }
4651 buf[1] = '\0';
4652 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004653
4654 onError:
4655 PyErr_SetString(PyExc_TypeError,
4656 "%c requires int or char");
4657 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658}
4659
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE elements) of the buffer in
   which the floats, ints, & chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is parenthesized so the cast cannot bind to a
   neighbouring operator at the point of use.
*/
#define FORMATBUFLEN ((size_t)120)
4669
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670PyObject *PyUnicode_Format(PyObject *format,
4671 PyObject *args)
4672{
4673 Py_UNICODE *fmt, *res;
4674 int fmtcnt, rescnt, reslen, arglen, argidx;
4675 int args_owned = 0;
4676 PyUnicodeObject *result = NULL;
4677 PyObject *dict = NULL;
4678 PyObject *uformat;
4679
4680 if (format == NULL || args == NULL) {
4681 PyErr_BadInternalCall();
4682 return NULL;
4683 }
4684 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004685 if (uformat == NULL)
4686 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 fmt = PyUnicode_AS_UNICODE(uformat);
4688 fmtcnt = PyUnicode_GET_SIZE(uformat);
4689
4690 reslen = rescnt = fmtcnt + 100;
4691 result = _PyUnicode_New(reslen);
4692 if (result == NULL)
4693 goto onError;
4694 res = PyUnicode_AS_UNICODE(result);
4695
4696 if (PyTuple_Check(args)) {
4697 arglen = PyTuple_Size(args);
4698 argidx = 0;
4699 }
4700 else {
4701 arglen = -1;
4702 argidx = -2;
4703 }
4704 if (args->ob_type->tp_as_mapping)
4705 dict = args;
4706
4707 while (--fmtcnt >= 0) {
4708 if (*fmt != '%') {
4709 if (--rescnt < 0) {
4710 rescnt = fmtcnt + 100;
4711 reslen += rescnt;
4712 if (_PyUnicode_Resize(result, reslen) < 0)
4713 return NULL;
4714 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4715 --rescnt;
4716 }
4717 *res++ = *fmt++;
4718 }
4719 else {
4720 /* Got a format specifier */
4721 int flags = 0;
4722 int width = -1;
4723 int prec = -1;
4724 int size = 0;
4725 Py_UNICODE c = '\0';
4726 Py_UNICODE fill;
4727 PyObject *v = NULL;
4728 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004729 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 Py_UNICODE sign;
4731 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004732 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733
4734 fmt++;
4735 if (*fmt == '(') {
4736 Py_UNICODE *keystart;
4737 int keylen;
4738 PyObject *key;
4739 int pcount = 1;
4740
4741 if (dict == NULL) {
4742 PyErr_SetString(PyExc_TypeError,
4743 "format requires a mapping");
4744 goto onError;
4745 }
4746 ++fmt;
4747 --fmtcnt;
4748 keystart = fmt;
4749 /* Skip over balanced parentheses */
4750 while (pcount > 0 && --fmtcnt >= 0) {
4751 if (*fmt == ')')
4752 --pcount;
4753 else if (*fmt == '(')
4754 ++pcount;
4755 fmt++;
4756 }
4757 keylen = fmt - keystart - 1;
4758 if (fmtcnt < 0 || pcount > 0) {
4759 PyErr_SetString(PyExc_ValueError,
4760 "incomplete format key");
4761 goto onError;
4762 }
Fred Drakee4315f52000-05-09 19:53:39 +00004763 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 then looked up since Python uses strings to hold
4765 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004766 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 key = PyUnicode_EncodeUTF8(keystart,
4768 keylen,
4769 NULL);
4770 if (key == NULL)
4771 goto onError;
4772 if (args_owned) {
4773 Py_DECREF(args);
4774 args_owned = 0;
4775 }
4776 args = PyObject_GetItem(dict, key);
4777 Py_DECREF(key);
4778 if (args == NULL) {
4779 goto onError;
4780 }
4781 args_owned = 1;
4782 arglen = -1;
4783 argidx = -2;
4784 }
4785 while (--fmtcnt >= 0) {
4786 switch (c = *fmt++) {
4787 case '-': flags |= F_LJUST; continue;
4788 case '+': flags |= F_SIGN; continue;
4789 case ' ': flags |= F_BLANK; continue;
4790 case '#': flags |= F_ALT; continue;
4791 case '0': flags |= F_ZERO; continue;
4792 }
4793 break;
4794 }
4795 if (c == '*') {
4796 v = getnextarg(args, arglen, &argidx);
4797 if (v == NULL)
4798 goto onError;
4799 if (!PyInt_Check(v)) {
4800 PyErr_SetString(PyExc_TypeError,
4801 "* wants int");
4802 goto onError;
4803 }
4804 width = PyInt_AsLong(v);
4805 if (width < 0) {
4806 flags |= F_LJUST;
4807 width = -width;
4808 }
4809 if (--fmtcnt >= 0)
4810 c = *fmt++;
4811 }
4812 else if (c >= '0' && c <= '9') {
4813 width = c - '0';
4814 while (--fmtcnt >= 0) {
4815 c = *fmt++;
4816 if (c < '0' || c > '9')
4817 break;
4818 if ((width*10) / 10 != width) {
4819 PyErr_SetString(PyExc_ValueError,
4820 "width too big");
4821 goto onError;
4822 }
4823 width = width*10 + (c - '0');
4824 }
4825 }
4826 if (c == '.') {
4827 prec = 0;
4828 if (--fmtcnt >= 0)
4829 c = *fmt++;
4830 if (c == '*') {
4831 v = getnextarg(args, arglen, &argidx);
4832 if (v == NULL)
4833 goto onError;
4834 if (!PyInt_Check(v)) {
4835 PyErr_SetString(PyExc_TypeError,
4836 "* wants int");
4837 goto onError;
4838 }
4839 prec = PyInt_AsLong(v);
4840 if (prec < 0)
4841 prec = 0;
4842 if (--fmtcnt >= 0)
4843 c = *fmt++;
4844 }
4845 else if (c >= '0' && c <= '9') {
4846 prec = c - '0';
4847 while (--fmtcnt >= 0) {
4848 c = Py_CHARMASK(*fmt++);
4849 if (c < '0' || c > '9')
4850 break;
4851 if ((prec*10) / 10 != prec) {
4852 PyErr_SetString(PyExc_ValueError,
4853 "prec too big");
4854 goto onError;
4855 }
4856 prec = prec*10 + (c - '0');
4857 }
4858 }
4859 } /* prec */
4860 if (fmtcnt >= 0) {
4861 if (c == 'h' || c == 'l' || c == 'L') {
4862 size = c;
4863 if (--fmtcnt >= 0)
4864 c = *fmt++;
4865 }
4866 }
4867 if (fmtcnt < 0) {
4868 PyErr_SetString(PyExc_ValueError,
4869 "incomplete format");
4870 goto onError;
4871 }
4872 if (c != '%') {
4873 v = getnextarg(args, arglen, &argidx);
4874 if (v == NULL)
4875 goto onError;
4876 }
4877 sign = 0;
4878 fill = ' ';
4879 switch (c) {
4880
4881 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004882 pbuf = formatbuf;
4883 /* presume that buffer length is at least 1 */
4884 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 len = 1;
4886 break;
4887
4888 case 's':
4889 case 'r':
4890 if (PyUnicode_Check(v) && c == 's') {
4891 temp = v;
4892 Py_INCREF(temp);
4893 }
4894 else {
4895 PyObject *unicode;
4896 if (c == 's')
4897 temp = PyObject_Str(v);
4898 else
4899 temp = PyObject_Repr(v);
4900 if (temp == NULL)
4901 goto onError;
4902 if (!PyString_Check(temp)) {
4903 /* XXX Note: this should never happen, since
4904 PyObject_Repr() and PyObject_Str() assure
4905 this */
4906 Py_DECREF(temp);
4907 PyErr_SetString(PyExc_TypeError,
4908 "%s argument has non-string str()");
4909 goto onError;
4910 }
Fred Drakee4315f52000-05-09 19:53:39 +00004911 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004913 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 "strict");
4915 Py_DECREF(temp);
4916 temp = unicode;
4917 if (temp == NULL)
4918 goto onError;
4919 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004920 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 len = PyUnicode_GET_SIZE(temp);
4922 if (prec >= 0 && len > prec)
4923 len = prec;
4924 break;
4925
4926 case 'i':
4927 case 'd':
4928 case 'u':
4929 case 'o':
4930 case 'x':
4931 case 'X':
4932 if (c == 'i')
4933 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004934 pbuf = formatbuf;
4935 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4936 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 if (len < 0)
4938 goto onError;
4939 sign = (c == 'd');
4940 if (flags & F_ZERO) {
4941 fill = '0';
4942 if ((flags&F_ALT) &&
4943 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004944 pbuf[0] == '0' && pbuf[1] == c) {
4945 *res++ = *pbuf++;
4946 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 rescnt -= 2;
4948 len -= 2;
4949 width -= 2;
4950 if (width < 0)
4951 width = 0;
4952 }
4953 }
4954 break;
4955
4956 case 'e':
4957 case 'E':
4958 case 'f':
4959 case 'g':
4960 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004961 pbuf = formatbuf;
4962 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4963 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 if (len < 0)
4965 goto onError;
4966 sign = 1;
4967 if (flags&F_ZERO)
4968 fill = '0';
4969 break;
4970
4971 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004972 pbuf = formatbuf;
4973 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 if (len < 0)
4975 goto onError;
4976 break;
4977
4978 default:
4979 PyErr_Format(PyExc_ValueError,
4980 "unsupported format character '%c' (0x%x)",
4981 c, c);
4982 goto onError;
4983 }
4984 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004985 if (*pbuf == '-' || *pbuf == '+') {
4986 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 len--;
4988 }
4989 else if (flags & F_SIGN)
4990 sign = '+';
4991 else if (flags & F_BLANK)
4992 sign = ' ';
4993 else
4994 sign = 0;
4995 }
4996 if (width < len)
4997 width = len;
4998 if (rescnt < width + (sign != 0)) {
4999 reslen -= rescnt;
5000 rescnt = width + fmtcnt + 100;
5001 reslen += rescnt;
5002 if (_PyUnicode_Resize(result, reslen) < 0)
5003 return NULL;
5004 res = PyUnicode_AS_UNICODE(result)
5005 + reslen - rescnt;
5006 }
5007 if (sign) {
5008 if (fill != ' ')
5009 *res++ = sign;
5010 rescnt--;
5011 if (width > len)
5012 width--;
5013 }
5014 if (width > len && !(flags & F_LJUST)) {
5015 do {
5016 --rescnt;
5017 *res++ = fill;
5018 } while (--width > len);
5019 }
5020 if (sign && fill == ' ')
5021 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005022 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023 res += len;
5024 rescnt -= len;
5025 while (--width >= len) {
5026 --rescnt;
5027 *res++ = ' ';
5028 }
5029 if (dict && (argidx < arglen) && c != '%') {
5030 PyErr_SetString(PyExc_TypeError,
5031 "not all arguments converted");
5032 goto onError;
5033 }
5034 Py_XDECREF(temp);
5035 } /* '%' */
5036 } /* until end */
5037 if (argidx < arglen && !dict) {
5038 PyErr_SetString(PyExc_TypeError,
5039 "not all arguments converted");
5040 goto onError;
5041 }
5042
5043 if (args_owned) {
5044 Py_DECREF(args);
5045 }
5046 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005047 if (_PyUnicode_Resize(result, reslen - rescnt))
5048 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 return (PyObject *)result;
5050
5051 onError:
5052 Py_XDECREF(result);
5053 Py_DECREF(uformat);
5054 if (args_owned) {
5055 Py_DECREF(args);
5056 }
5057 return NULL;
5058}
5059
5060static PyBufferProcs unicode_as_buffer = {
5061 (getreadbufferproc) unicode_buffer_getreadbuf,
5062 (getwritebufferproc) unicode_buffer_getwritebuf,
5063 (getsegcountproc) unicode_buffer_getsegcount,
5064 (getcharbufferproc) unicode_buffer_getcharbuf,
5065};
5066
5067PyTypeObject PyUnicode_Type = {
5068 PyObject_HEAD_INIT(&PyType_Type)
5069 0, /* ob_size */
5070 "unicode", /* tp_name */
5071 sizeof(PyUnicodeObject), /* tp_size */
5072 0, /* tp_itemsize */
5073 /* Slots */
5074 (destructor)_PyUnicode_Free, /* tp_dealloc */
5075 0, /* tp_print */
5076 (getattrfunc)unicode_getattr, /* tp_getattr */
5077 0, /* tp_setattr */
5078 (cmpfunc) unicode_compare, /* tp_compare */
5079 (reprfunc) unicode_repr, /* tp_repr */
5080 0, /* tp_as_number */
5081 &unicode_as_sequence, /* tp_as_sequence */
5082 0, /* tp_as_mapping */
5083 (hashfunc) unicode_hash, /* tp_hash*/
5084 0, /* tp_call*/
5085 (reprfunc) unicode_str, /* tp_str */
5086 (getattrofunc) NULL, /* tp_getattro */
5087 (setattrofunc) NULL, /* tp_setattro */
5088 &unicode_as_buffer, /* tp_as_buffer */
5089 Py_TPFLAGS_DEFAULT, /* tp_flags */
5090};
5091
5092/* Initialize the Unicode implementation */
5093
5094void _PyUnicode_Init()
5095{
5096 /* Doublecheck the configuration... */
5097 if (sizeof(Py_UNICODE) != 2)
5098 Py_FatalError("Unicode configuration error: "
5099 "sizeof(Py_UNICODE) != 2 bytes");
5100
Fred Drakee4315f52000-05-09 19:53:39 +00005101 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005102 unicode_freelist = NULL;
5103 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005105 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106}
5107
5108/* Finalize the Unicode implementation */
5109
5110void
5111_PyUnicode_Fini()
5112{
5113 PyUnicodeObject *u = unicode_freelist;
5114
5115 while (u != NULL) {
5116 PyUnicodeObject *v = u;
5117 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005118 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005119 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005120 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005121 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005123 unicode_freelist = NULL;
5124 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005126 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127}