blob: 7737057614dcf2c913edf21d88382f2a030ea4d9 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000069#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71#if defined(HAVE_LIMITS_H)
72#include <limits.h>
73#else
74#define INT_MAX 2147483647
75#endif
76
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000077#ifdef MS_WIN32
78#include <windows.h>
79#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000080
Guido van Rossumd57fd912000-03-10 22:53:23 +000081/* Limit for the Unicode object free list */
82
83#define MAX_UNICODE_FREELIST_SIZE 1024
84
85/* Limit for the Unicode object free list stay alive optimization.
86
87 The implementation will keep allocated Unicode memory intact for
88 all objects on the free list having a size less than this
89 limit. This reduces malloc() overhead for small Unicode objects.
90
Barry Warsaw51ac5802000-03-20 16:36:48 +000091 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000092 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000093 malloc()-overhead) bytes of unused garbage.
94
95 Setting the limit to 0 effectively turns the feature off.
96
Guido van Rossumfd4b9572000-04-10 13:51:10 +000097 Note: This is an experimental feature ! If you get core dumps when
98 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000099
100*/
101
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000102#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103
104/* Endianness switches; defaults to little endian */
105
106#ifdef WORDS_BIGENDIAN
107# define BYTEORDER_IS_BIG_ENDIAN
108#else
109# define BYTEORDER_IS_LITTLE_ENDIAN
110#endif
111
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000112/* --- Globals ------------------------------------------------------------
113
114 The globals are initialized by the _PyUnicode_Init() API and should
115 not be used before calling that API.
116
117*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
119/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000120static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000121
122/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000123static PyUnicodeObject *unicode_freelist;
124static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125
Fred Drakee4315f52000-05-09 19:53:39 +0000126/* Default encoding to use and assume when NULL is passed as encoding
127 parameter; it is initialized by _PyUnicode_Init().
128
129 Always use the PyUnicode_SetDefaultEncoding() and
130 PyUnicode_GetDefaultEncoding() APIs to access this global.
131
132*/
133
134static char unicode_default_encoding[100];
135
Guido van Rossumd57fd912000-03-10 22:53:23 +0000136/* --- Unicode Object ----------------------------------------------------- */
137
138static
139int _PyUnicode_Resize(register PyUnicodeObject *unicode,
140 int length)
141{
142 void *oldstr;
143
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000144 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000146 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147
148 /* Resizing unicode_empty is not allowed. */
149 if (unicode == unicode_empty) {
150 PyErr_SetString(PyExc_SystemError,
151 "can't resize empty unicode object");
152 return -1;
153 }
154
155 /* We allocate one more byte to make sure the string is
156 Ux0000 terminated -- XXX is this needed ? */
157 oldstr = unicode->str;
158 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
159 if (!unicode->str) {
160 unicode->str = oldstr;
161 PyErr_NoMemory();
162 return -1;
163 }
164 unicode->str[length] = 0;
165 unicode->length = length;
166
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000167 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 /* Reset the object caches */
169 if (unicode->utf8str) {
170 Py_DECREF(unicode->utf8str);
171 unicode->utf8str = NULL;
172 }
173 unicode->hash = -1;
174
175 return 0;
176}
177
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178int PyUnicode_Resize(PyObject **unicode,
179 int length)
180{
181 PyUnicodeObject *v;
182
183 if (unicode == NULL) {
184 PyErr_BadInternalCall();
185 return -1;
186 }
187 v = (PyUnicodeObject *)*unicode;
188 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
189 PyErr_BadInternalCall();
190 return -1;
191 }
192 return _PyUnicode_Resize(v, length);
193}
194
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
202
203static
204PyUnicodeObject *_PyUnicode_New(int length)
205{
206 register PyUnicodeObject *unicode;
207
208 /* Optimization for empty strings */
209 if (length == 0 && unicode_empty != NULL) {
210 Py_INCREF(unicode_empty);
211 return unicode_empty;
212 }
213
214 /* Unicode freelist & memory allocation */
215 if (unicode_freelist) {
216 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000217 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 /* Keep-Alive optimization: we only upsize the buffer,
221 never downsize it. */
222 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000224 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 }
227 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000228 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000230 }
231 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
234 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
235 if (unicode == NULL)
236 return NULL;
237 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
238 }
239
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000240 if (!unicode->str) {
241 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 unicode->str[length] = 0;
245 unicode->length = length;
246 unicode->hash = -1;
247 unicode->utf8str = NULL;
248 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000249
250 onError:
251 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254}
255
256static
257void _PyUnicode_Free(register PyUnicodeObject *unicode)
258{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 /* Keep-Alive optimization */
261 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000262 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = NULL;
264 unicode->length = 0;
265 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 if (unicode->utf8str) {
267 Py_DECREF(unicode->utf8str);
268 unicode->utf8str = NULL;
269 }
270 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 *(PyUnicodeObject **)unicode = unicode_freelist;
272 unicode_freelist = unicode;
273 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000276 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000277 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000278 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 }
280}
281
282PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
283 int size)
284{
285 PyUnicodeObject *unicode;
286
287 unicode = _PyUnicode_New(size);
288 if (!unicode)
289 return NULL;
290
291 /* Copy the Unicode data into the new object */
292 if (u != NULL)
293 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
294
295 return (PyObject *)unicode;
296}
297
298#ifdef HAVE_WCHAR_H
299
300PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
301 int size)
302{
303 PyUnicodeObject *unicode;
304
305 if (w == NULL) {
306 PyErr_BadInternalCall();
307 return NULL;
308 }
309
310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the wchar_t data into the new object */
315#ifdef HAVE_USABLE_WCHAR_T
316 memcpy(unicode->str, w, size * sizeof(wchar_t));
317#else
318 {
319 register Py_UNICODE *u;
320 register int i;
321 u = PyUnicode_AS_UNICODE(unicode);
322 for (i = size; i >= 0; i--)
323 *u++ = *w++;
324 }
325#endif
326
327 return (PyObject *)unicode;
328}
329
330int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
331 register wchar_t *w,
332 int size)
333{
334 if (unicode == NULL) {
335 PyErr_BadInternalCall();
336 return -1;
337 }
338 if (size > PyUnicode_GET_SIZE(unicode))
339 size = PyUnicode_GET_SIZE(unicode);
340#ifdef HAVE_USABLE_WCHAR_T
341 memcpy(w, unicode->str, size * sizeof(wchar_t));
342#else
343 {
344 register Py_UNICODE *u;
345 register int i;
346 u = PyUnicode_AS_UNICODE(unicode);
347 for (i = size; i >= 0; i--)
348 *w++ = *u++;
349 }
350#endif
351
352 return size;
353}
354
355#endif
356
357PyObject *PyUnicode_FromObject(register PyObject *obj)
358{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000359 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
360}
361
362PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
363 const char *encoding,
364 const char *errors)
365{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 const char *s;
367 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368 int owned = 0;
369 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370
371 if (obj == NULL) {
372 PyErr_BadInternalCall();
373 return NULL;
374 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000375
376 /* Coerce object */
377 if (PyInstance_Check(obj)) {
378 PyObject *func;
379 func = PyObject_GetAttrString(obj, "__str__");
380 if (func == NULL) {
381 PyErr_SetString(PyExc_TypeError,
382 "coercing to Unicode: instance doesn't define __str__");
383 return NULL;
384 }
385 obj = PyEval_CallObject(func, NULL);
386 Py_DECREF(func);
387 if (obj == NULL)
388 return NULL;
389 owned = 1;
390 }
391 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 v = obj;
394 if (encoding) {
395 PyErr_SetString(PyExc_TypeError,
396 "decoding Unicode is not supported");
397 return NULL;
398 }
399 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 }
401 else if (PyString_Check(obj)) {
402 s = PyString_AS_STRING(obj);
403 len = PyString_GET_SIZE(obj);
404 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000405 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
406 /* Overwrite the error message with something more useful in
407 case of a TypeError. */
408 if (PyErr_ExceptionMatches(PyExc_TypeError))
409 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410 "coercing to Unicode: need string or buffer");
411 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000413
414 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 if (len == 0) {
416 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000417 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000419 else
420 v = PyUnicode_Decode(s, len, encoding, errors);
421 done:
422 if (owned)
423 Py_DECREF(obj);
424 return v;
425
426 onError:
427 if (owned)
428 Py_DECREF(obj);
429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430}
431
432PyObject *PyUnicode_Decode(const char *s,
433 int size,
434 const char *encoding,
435 const char *errors)
436{
437 PyObject *buffer = NULL, *unicode;
438
Fred Drakee4315f52000-05-09 19:53:39 +0000439 if (encoding == NULL)
440 encoding = PyUnicode_GetDefaultEncoding();
441
442 /* Shortcuts for common default encodings */
443 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000445 else if (strcmp(encoding, "latin-1") == 0)
446 return PyUnicode_DecodeLatin1(s, size, errors);
447 else if (strcmp(encoding, "ascii") == 0)
448 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449
450 /* Decode via the codec registry */
451 buffer = PyBuffer_FromMemory((void *)s, size);
452 if (buffer == NULL)
453 goto onError;
454 unicode = PyCodec_Decode(buffer, encoding, errors);
455 if (unicode == NULL)
456 goto onError;
457 if (!PyUnicode_Check(unicode)) {
458 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000459 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 unicode->ob_type->tp_name);
461 Py_DECREF(unicode);
462 goto onError;
463 }
464 Py_DECREF(buffer);
465 return unicode;
466
467 onError:
468 Py_XDECREF(buffer);
469 return NULL;
470}
471
472PyObject *PyUnicode_Encode(const Py_UNICODE *s,
473 int size,
474 const char *encoding,
475 const char *errors)
476{
477 PyObject *v, *unicode;
478
479 unicode = PyUnicode_FromUnicode(s, size);
480 if (unicode == NULL)
481 return NULL;
482 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
483 Py_DECREF(unicode);
484 return v;
485}
486
487PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *v;
492
493 if (!PyUnicode_Check(unicode)) {
494 PyErr_BadArgument();
495 goto onError;
496 }
Fred Drakee4315f52000-05-09 19:53:39 +0000497
498 if (encoding == NULL)
499 encoding = PyUnicode_GetDefaultEncoding();
500
501 /* Shortcuts for common default encodings */
502 if (errors == NULL) {
503 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000505 else if (strcmp(encoding, "latin-1") == 0)
506 return PyUnicode_AsLatin1String(unicode);
507 else if (strcmp(encoding, "ascii") == 0)
508 return PyUnicode_AsASCIIString(unicode);
509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 /* Encode via the codec registry */
512 v = PyCodec_Encode(unicode, encoding, errors);
513 if (v == NULL)
514 goto onError;
515 /* XXX Should we really enforce this ? */
516 if (!PyString_Check(v)) {
517 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000518 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519 v->ob_type->tp_name);
520 Py_DECREF(v);
521 goto onError;
522 }
523 return v;
524
525 onError:
526 return NULL;
527}
528
529Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
530{
531 if (!PyUnicode_Check(unicode)) {
532 PyErr_BadArgument();
533 goto onError;
534 }
535 return PyUnicode_AS_UNICODE(unicode);
536
537 onError:
538 return NULL;
539}
540
541int PyUnicode_GetSize(PyObject *unicode)
542{
543 if (!PyUnicode_Check(unicode)) {
544 PyErr_BadArgument();
545 goto onError;
546 }
547 return PyUnicode_GET_SIZE(unicode);
548
549 onError:
550 return -1;
551}
552
Fred Drakee4315f52000-05-09 19:53:39 +0000553const char *PyUnicode_GetDefaultEncoding()
554{
555 return unicode_default_encoding;
556}
557
558int PyUnicode_SetDefaultEncoding(const char *encoding)
559{
560 PyObject *v;
561
562 /* Make sure the encoding is valid. As side effect, this also
563 loads the encoding into the codec registry cache. */
564 v = _PyCodec_Lookup(encoding);
565 if (v == NULL)
566 goto onError;
567 Py_DECREF(v);
568 strncpy(unicode_default_encoding,
569 encoding,
570 sizeof(unicode_default_encoding));
571 return 0;
572
573 onError:
574 return -1;
575}
576
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577/* --- UTF-8 Codec -------------------------------------------------------- */
578
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details.
   The table is read-only. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
600
601static
602int utf8_decoding_error(const char **source,
603 Py_UNICODE **dest,
604 const char *errors,
605 const char *details)
606{
607 if ((errors == NULL) ||
608 (strcmp(errors,"strict") == 0)) {
609 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000610 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000611 details);
612 return -1;
613 }
614 else if (strcmp(errors,"ignore") == 0) {
615 (*source)++;
616 return 0;
617 }
618 else if (strcmp(errors,"replace") == 0) {
619 (*source)++;
620 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
621 (*dest)++;
622 return 0;
623 }
624 else {
625 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000626 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000627 errors);
628 return -1;
629 }
630}
631
632#define UTF8_ERROR(details) do { \
633 if (utf8_decoding_error(&s, &p, errors, details)) \
634 goto onError; \
635 continue; \
636} while (0)
637
638PyObject *PyUnicode_DecodeUTF8(const char *s,
639 int size,
640 const char *errors)
641{
642 int n;
643 const char *e;
644 PyUnicodeObject *unicode;
645 Py_UNICODE *p;
646
647 /* Note: size will always be longer than the resulting Unicode
648 character count */
649 unicode = _PyUnicode_New(size);
650 if (!unicode)
651 return NULL;
652 if (size == 0)
653 return (PyObject *)unicode;
654
655 /* Unpack UTF-8 encoded data */
656 p = unicode->str;
657 e = s + size;
658
659 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000660 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661
662 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000663 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 s++;
665 continue;
666 }
667
668 n = utf8_code_length[ch];
669
670 if (s + n > e)
671 UTF8_ERROR("unexpected end of data");
672
673 switch (n) {
674
675 case 0:
676 UTF8_ERROR("unexpected code byte");
677 break;
678
679 case 1:
680 UTF8_ERROR("internal error");
681 break;
682
683 case 2:
684 if ((s[1] & 0xc0) != 0x80)
685 UTF8_ERROR("invalid data");
686 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
687 if (ch < 0x80)
688 UTF8_ERROR("illegal encoding");
689 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000690 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 break;
692
693 case 3:
694 if ((s[1] & 0xc0) != 0x80 ||
695 (s[2] & 0xc0) != 0x80)
696 UTF8_ERROR("invalid data");
697 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
698 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
699 UTF8_ERROR("illegal encoding");
700 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000701 *p++ = (Py_UNICODE)ch;
702 break;
703
704 case 4:
705 if ((s[1] & 0xc0) != 0x80 ||
706 (s[2] & 0xc0) != 0x80 ||
707 (s[3] & 0xc0) != 0x80)
708 UTF8_ERROR("invalid data");
709 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
710 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
711 /* validate and convert to UTF-16 */
712 if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
713 (ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
714 UTF8_ERROR("illegal encoding");
715 /* compute and append the two surrogates: */
716
717 /* translate from 10000..10FFFF to 0..FFFF */
718 ch -= 0x10000;
719
720 /* high surrogate = top 10 bits added to D800 */
721 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
722
723 /* low surrogate = bottom 10 bits added to DC00 */
724 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000725 break;
726
727 default:
728 /* Other sizes are only needed for UCS-4 */
729 UTF8_ERROR("unsupported Unicode code range");
730 }
731 s += n;
732 }
733
734 /* Adjust length */
735 if (_PyUnicode_Resize(unicode, p - unicode->str))
736 goto onError;
737
738 return (PyObject *)unicode;
739
740onError:
741 Py_DECREF(unicode);
742 return NULL;
743}
744
745#undef UTF8_ERROR
746
747static
748int utf8_encoding_error(const Py_UNICODE **source,
749 char **dest,
750 const char *errors,
751 const char *details)
752{
753 if ((errors == NULL) ||
754 (strcmp(errors,"strict") == 0)) {
755 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000756 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 details);
758 return -1;
759 }
760 else if (strcmp(errors,"ignore") == 0) {
761 return 0;
762 }
763 else if (strcmp(errors,"replace") == 0) {
764 **dest = '?';
765 (*dest)++;
766 return 0;
767 }
768 else {
769 PyErr_Format(PyExc_ValueError,
770 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000771 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772 errors);
773 return -1;
774 }
775}
776
777PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
778 int size,
779 const char *errors)
780{
781 PyObject *v;
782 char *p;
783 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000784 Py_UCS4 ch2;
785 unsigned int cbAllocated = 3 * size;
786 unsigned int cbWritten = 0;
787 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000788
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000789 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790 if (v == NULL)
791 return NULL;
792 if (size == 0)
793 goto done;
794
795 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000796 while (i < size) {
797 Py_UCS4 ch = s[i++];
798 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000799 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000800 cbWritten++;
801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802 else if (ch < 0x0800) {
803 *p++ = 0xc0 | (ch >> 6);
804 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000805 cbWritten += 2;
806 }
807 else {
808 /* Check for high surrogate */
809 if (0xD800 <= ch && ch <= 0xDBFF) {
810 if (i != size) {
811 ch2 = s[i];
812 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
813
814 if (cbWritten >= (cbAllocated - 4)) {
815 /* Provide enough room for some more
816 surrogates */
817 cbAllocated += 4*10;
818 if (_PyString_Resize(&v, cbAllocated))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000820 }
821
822 /* combine the two values */
823 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
824
825 *p++ = (char)((ch >> 18) | 0xf0);
826 *p++ = (char)(0x80 | (ch >> 12) & 0x3f);
827 i++;
828 cbWritten += 4;
829 }
830 }
831 }
832 else {
833 *p++ = (char)(0xe0 | (ch >> 12));
834 cbWritten += 3;
835 }
836 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
837 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000838 }
839 }
840 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000841 if (_PyString_Resize(&v, p - q))
842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843
844 done:
845 return v;
846
847 onError:
848 Py_DECREF(v);
849 return NULL;
850}
851
852/* Return a Python string holding the UTF-8 encoded value of the
853 Unicode object.
854
855 The resulting string is cached in the Unicode object for subsequent
856 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000857 the character buffer interface and will live (at least) as long as
858 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859
860 The refcount of the string is *not* incremented.
861
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000862 *** Exported for internal use by the interpreter only !!! ***
863
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864*/
865
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000866PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +0000867 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000869 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870
871 if (v)
872 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000873 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
874 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875 errors);
876 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000877 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878 return v;
879}
880
881PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
882{
883 PyObject *str;
884
885 if (!PyUnicode_Check(unicode)) {
886 PyErr_BadArgument();
887 return NULL;
888 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000889 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (str == NULL)
891 return NULL;
892 Py_INCREF(str);
893 return str;
894}
895
896/* --- UTF-16 Codec ------------------------------------------------------- */
897
898static
899int utf16_decoding_error(const Py_UNICODE **source,
900 Py_UNICODE **dest,
901 const char *errors,
902 const char *details)
903{
904 if ((errors == NULL) ||
905 (strcmp(errors,"strict") == 0)) {
906 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000907 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000908 details);
909 return -1;
910 }
911 else if (strcmp(errors,"ignore") == 0) {
912 return 0;
913 }
914 else if (strcmp(errors,"replace") == 0) {
915 if (dest) {
916 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
917 (*dest)++;
918 }
919 return 0;
920 }
921 else {
922 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000923 "UTF-16 decoding error; "
924 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000925 errors);
926 return -1;
927 }
928}
929
930#define UTF16_ERROR(details) do { \
931 if (utf16_decoding_error(&q, &p, errors, details)) \
932 goto onError; \
933 continue; \
934} while(0)
935
936PyObject *PyUnicode_DecodeUTF16(const char *s,
937 int size,
938 const char *errors,
939 int *byteorder)
940{
941 PyUnicodeObject *unicode;
942 Py_UNICODE *p;
943 const Py_UNICODE *q, *e;
944 int bo = 0;
945
946 /* size should be an even number */
947 if (size % sizeof(Py_UNICODE) != 0) {
948 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
949 return NULL;
950 /* The remaining input chars are ignored if we fall through
951 here... */
952 }
953
954 /* Note: size will always be longer than the resulting Unicode
955 character count */
956 unicode = _PyUnicode_New(size);
957 if (!unicode)
958 return NULL;
959 if (size == 0)
960 return (PyObject *)unicode;
961
962 /* Unpack UTF-16 encoded data */
963 p = unicode->str;
964 q = (Py_UNICODE *)s;
965 e = q + (size / sizeof(Py_UNICODE));
966
967 if (byteorder)
968 bo = *byteorder;
969
970 while (q < e) {
971 register Py_UNICODE ch = *q++;
972
973 /* Check for BOM marks (U+FEFF) in the input and adjust
974 current byte order setting accordingly. Swap input
975 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
976 !) */
977#ifdef BYTEORDER_IS_LITTLE_ENDIAN
978 if (ch == 0xFEFF) {
979 bo = -1;
980 continue;
981 } else if (ch == 0xFFFE) {
982 bo = 1;
983 continue;
984 }
985 if (bo == 1)
986 ch = (ch >> 8) | (ch << 8);
987#else
988 if (ch == 0xFEFF) {
989 bo = 1;
990 continue;
991 } else if (ch == 0xFFFE) {
992 bo = -1;
993 continue;
994 }
995 if (bo == -1)
996 ch = (ch >> 8) | (ch << 8);
997#endif
998 if (ch < 0xD800 || ch > 0xDFFF) {
999 *p++ = ch;
1000 continue;
1001 }
1002
1003 /* UTF-16 code pair: */
1004 if (q >= e)
1005 UTF16_ERROR("unexpected end of data");
1006 if (0xDC00 <= *q && *q <= 0xDFFF) {
1007 q++;
1008 if (0xD800 <= *q && *q <= 0xDBFF)
1009 /* This is valid data (a UTF-16 surrogate pair), but
1010 we are not able to store this information since our
1011 Py_UNICODE type only has 16 bits... this might
1012 change someday, even though it's unlikely. */
1013 UTF16_ERROR("code pairs are not supported");
1014 else
1015 continue;
1016 }
1017 UTF16_ERROR("illegal encoding");
1018 }
1019
1020 if (byteorder)
1021 *byteorder = bo;
1022
1023 /* Adjust length */
1024 if (_PyUnicode_Resize(unicode, p - unicode->str))
1025 goto onError;
1026
1027 return (PyObject *)unicode;
1028
1029onError:
1030 Py_DECREF(unicode);
1031 return NULL;
1032}
1033
1034#undef UTF16_ERROR
1035
1036PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1037 int size,
1038 const char *errors,
1039 int byteorder)
1040{
1041 PyObject *v;
1042 Py_UNICODE *p;
1043 char *q;
1044
1045 /* We don't create UTF-16 pairs... */
1046 v = PyString_FromStringAndSize(NULL,
1047 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1048 if (v == NULL)
1049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050
1051 q = PyString_AS_STRING(v);
1052 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 if (byteorder == 0)
1054 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001055 if (size == 0)
1056 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 if (byteorder == 0 ||
1058#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1059 byteorder == -1
1060#else
1061 byteorder == 1
1062#endif
1063 )
1064 memcpy(p, s, size * sizeof(Py_UNICODE));
1065 else
1066 while (size-- > 0) {
1067 Py_UNICODE ch = *s++;
1068 *p++ = (ch >> 8) | (ch << 8);
1069 }
1070 done:
1071 return v;
1072}
1073
1074PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1075{
1076 if (!PyUnicode_Check(unicode)) {
1077 PyErr_BadArgument();
1078 return NULL;
1079 }
1080 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1081 PyUnicode_GET_SIZE(unicode),
1082 NULL,
1083 0);
1084}
1085
1086/* --- Unicode Escape Codec ----------------------------------------------- */
1087
1088static
1089int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001090 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 const char *errors,
1092 const char *details)
1093{
1094 if ((errors == NULL) ||
1095 (strcmp(errors,"strict") == 0)) {
1096 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001097 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 details);
1099 return -1;
1100 }
1101 else if (strcmp(errors,"ignore") == 0) {
1102 return 0;
1103 }
1104 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001105 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106 return 0;
1107 }
1108 else {
1109 PyErr_Format(PyExc_ValueError,
1110 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001111 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 errors);
1113 return -1;
1114 }
1115}
1116
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001117static _Py_UCNHashAPI *pucnHash = NULL;
1118
/* Case-insensitive comparison of at most count characters of s1 and s2.

   Returns 0 when the compared prefixes are equal ignoring case, and a
   non-zero value whose sign reflects the first differing character
   otherwise.  Comparison stops at a NUL terminator shared by both
   strings, so neither buffer is read past its end.  Characters are
   converted through unsigned char before being handed to tolower(),
   which is undefined for negative arguments other than EOF. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1 = 0, c2 = 0;

    while (count-- > 0) {
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        if (c1 != c2 || c1 == '\0')
            break;
    }
    return c1 - c2;
}
1138
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1140 int size,
1141 const char *errors)
1142{
1143 PyUnicodeObject *v;
1144 Py_UNICODE *p = NULL, *buf = NULL;
1145 const char *end;
1146
1147 /* Escaped strings will always be longer than the resulting
1148 Unicode string, so we start with size here and then reduce the
1149 length after conversion to the true value. */
1150 v = _PyUnicode_New(size);
1151 if (v == NULL)
1152 goto onError;
1153 if (size == 0)
1154 return (PyObject *)v;
1155 p = buf = PyUnicode_AS_UNICODE(v);
1156 end = s + size;
1157 while (s < end) {
1158 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001159 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 int i;
1161
1162 /* Non-escape characters are interpreted as Unicode ordinals */
1163 if (*s != '\\') {
1164 *p++ = (unsigned char)*s++;
1165 continue;
1166 }
1167
1168 /* \ - Escapes */
1169 s++;
1170 switch (*s++) {
1171
1172 /* \x escapes */
1173 case '\n': break;
1174 case '\\': *p++ = '\\'; break;
1175 case '\'': *p++ = '\''; break;
1176 case '\"': *p++ = '\"'; break;
1177 case 'b': *p++ = '\b'; break;
1178 case 'f': *p++ = '\014'; break; /* FF */
1179 case 't': *p++ = '\t'; break;
1180 case 'n': *p++ = '\n'; break;
1181 case 'r': *p++ = '\r'; break;
1182 case 'v': *p++ = '\013'; break; /* VT */
1183 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1184
1185 /* \OOO (octal) escapes */
1186 case '0': case '1': case '2': case '3':
1187 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001188 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001190 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001192 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001194 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 break;
1196
1197 /* \xXXXX escape with 0-4 hex digits */
1198 case 'x':
1199 x = 0;
1200 c = (unsigned char)*s;
1201 if (isxdigit(c)) {
1202 do {
1203 x = (x<<4) & ~0xF;
1204 if ('0' <= c && c <= '9')
1205 x += c - '0';
1206 else if ('a' <= c && c <= 'f')
1207 x += 10 + c - 'a';
1208 else
1209 x += 10 + c - 'A';
1210 c = (unsigned char)*++s;
1211 } while (isxdigit(c));
1212 *p++ = x;
1213 } else {
1214 *p++ = '\\';
1215 *p++ = (unsigned char)s[-1];
1216 }
1217 break;
1218
1219 /* \uXXXX with 4 hex digits */
1220 case 'u':
1221 for (x = 0, i = 0; i < 4; i++) {
1222 c = (unsigned char)s[i];
1223 if (!isxdigit(c)) {
1224 if (unicodeescape_decoding_error(&s, &x, errors,
1225 "truncated \\uXXXX"))
1226 goto onError;
1227 i++;
1228 break;
1229 }
1230 x = (x<<4) & ~0xF;
1231 if (c >= '0' && c <= '9')
1232 x += c - '0';
1233 else if (c >= 'a' && c <= 'f')
1234 x += 10 + c - 'a';
1235 else
1236 x += 10 + c - 'A';
1237 }
1238 s += i;
1239 *p++ = x;
1240 break;
1241
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001242 case 'N':
1243 /* Ok, we need to deal with Unicode Character Names now,
1244 * make sure we've imported the hash table data...
1245 */
1246 if (pucnHash == NULL)
1247 {
1248 PyObject *mod = 0, *v = 0;
1249
1250 mod = PyImport_ImportModule("ucnhash");
1251 if (mod == NULL)
1252 goto onError;
1253 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1254 Py_DECREF(mod);
1255 if (v == NULL)
1256 {
1257 goto onError;
1258 }
1259 pucnHash = PyCObject_AsVoidPtr(v);
1260 Py_DECREF(v);
1261 if (pucnHash == NULL)
1262 {
1263 goto onError;
1264 }
1265 }
1266
1267 if (*s == '{')
1268 {
1269 const char *start = s + 1;
1270 const char *endBrace = start;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001271 Py_UCS4 value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001272 unsigned long j;
1273
1274 /* look for either the closing brace, or we
1275 * exceed the maximum length of the unicode character names
1276 */
1277 while (*endBrace != '}' &&
1278 (unsigned int)(endBrace - start) <=
1279 pucnHash->cchMax &&
1280 endBrace < end)
1281 {
1282 endBrace++;
1283 }
1284 if (endBrace != end && *endBrace == '}')
1285 {
1286 j = pucnHash->hash(start, endBrace - start);
1287 if (j > pucnHash->cKeys ||
1288 mystrnicmp(
1289 start,
1290 ((_Py_UnicodeCharacterName *)
1291 (pucnHash->getValue(j)))->pszUCN,
1292 (int)(endBrace - start)) != 0)
1293 {
1294 if (unicodeescape_decoding_error(
1295 &s, &x, errors,
1296 "Invalid Unicode Character Name"))
1297 {
1298 goto onError;
1299 }
1300 goto ucnFallthrough;
1301 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 value = ((_Py_UnicodeCharacterName *)
1303 (pucnHash->getValue(j)))->value;
1304 if (value < 1<<16)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001305 {
1306 /* In UCS-2 range, easy solution.. */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001307 *p++ = value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001308 }
1309 else
1310 {
1311 /* Oops, its in UCS-4 space, */
1312 /* compute and append the two surrogates: */
1313 /* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001314 value -= 0x10000;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001315
1316 /* high surrogate = top 10 bits added to D800 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001317 *p++ = 0xD800 + (value >> 10);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001318
1319 /* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001320 *p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001321 }
1322 s = endBrace + 1;
1323 }
1324 else
1325 {
1326 if (unicodeescape_decoding_error(
1327 &s, &x, errors,
1328 "Unicode name missing closing brace"))
1329 goto onError;
1330 goto ucnFallthrough;
1331 }
1332 break;
1333 }
1334 if (unicodeescape_decoding_error(
1335 &s, &x, errors,
1336 "Missing opening brace for Unicode Character Name escape"))
1337 goto onError;
1338ucnFallthrough:
1339 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001340 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 *p++ = '\\';
1342 *p++ = (unsigned char)s[-1];
1343 break;
1344 }
1345 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001346 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001347 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 return (PyObject *)v;
1349
1350 onError:
1351 Py_XDECREF(v);
1352 return NULL;
1353}
1354
1355/* Return a Unicode-Escape string version of the Unicode object.
1356
1357 If quotes is true, the string is enclosed in u"" or u'' quotes as
1358 appropriate.
1359
1360*/
1361
Barry Warsaw51ac5802000-03-20 16:36:48 +00001362static const Py_UNICODE *findchar(const Py_UNICODE *s,
1363 int size,
1364 Py_UNICODE ch);
1365
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366static
1367PyObject *unicodeescape_string(const Py_UNICODE *s,
1368 int size,
1369 int quotes)
1370{
1371 PyObject *repr;
1372 char *p;
1373 char *q;
1374
1375 static const char *hexdigit = "0123456789ABCDEF";
1376
1377 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1378 if (repr == NULL)
1379 return NULL;
1380
1381 p = q = PyString_AS_STRING(repr);
1382
1383 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 *p++ = 'u';
1385 *p++ = (findchar(s, size, '\'') &&
1386 !findchar(s, size, '"')) ? '"' : '\'';
1387 }
1388 while (size-- > 0) {
1389 Py_UNICODE ch = *s++;
1390 /* Escape quotes */
1391 if (quotes && (ch == q[1] || ch == '\\')) {
1392 *p++ = '\\';
1393 *p++ = (char) ch;
1394 }
1395 /* Map 16-bit characters to '\uxxxx' */
1396 else if (ch >= 256) {
1397 *p++ = '\\';
1398 *p++ = 'u';
1399 *p++ = hexdigit[(ch >> 12) & 0xf];
1400 *p++ = hexdigit[(ch >> 8) & 0xf];
1401 *p++ = hexdigit[(ch >> 4) & 0xf];
1402 *p++ = hexdigit[ch & 15];
1403 }
1404 /* Map non-printable US ASCII to '\ooo' */
1405 else if (ch < ' ' || ch >= 128) {
1406 *p++ = '\\';
1407 *p++ = hexdigit[(ch >> 6) & 7];
1408 *p++ = hexdigit[(ch >> 3) & 7];
1409 *p++ = hexdigit[ch & 7];
1410 }
1411 /* Copy everything else as-is */
1412 else
1413 *p++ = (char) ch;
1414 }
1415 if (quotes)
1416 *p++ = q[1];
1417
1418 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001419 if (_PyString_Resize(&repr, p - q))
1420 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421
1422 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001423
1424 onError:
1425 Py_DECREF(repr);
1426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001427}
1428
1429PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1430 int size)
1431{
1432 return unicodeescape_string(s, size, 0);
1433}
1434
1435PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1436{
1437 if (!PyUnicode_Check(unicode)) {
1438 PyErr_BadArgument();
1439 return NULL;
1440 }
1441 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1442 PyUnicode_GET_SIZE(unicode));
1443}
1444
1445/* --- Raw Unicode Escape Codec ------------------------------------------- */
1446
1447PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1448 int size,
1449 const char *errors)
1450{
1451 PyUnicodeObject *v;
1452 Py_UNICODE *p, *buf;
1453 const char *end;
1454 const char *bs;
1455
1456 /* Escaped strings will always be longer than the resulting
1457 Unicode string, so we start with size here and then reduce the
1458 length after conversion to the true value. */
1459 v = _PyUnicode_New(size);
1460 if (v == NULL)
1461 goto onError;
1462 if (size == 0)
1463 return (PyObject *)v;
1464 p = buf = PyUnicode_AS_UNICODE(v);
1465 end = s + size;
1466 while (s < end) {
1467 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001468 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 int i;
1470
1471 /* Non-escape characters are interpreted as Unicode ordinals */
1472 if (*s != '\\') {
1473 *p++ = (unsigned char)*s++;
1474 continue;
1475 }
1476
1477 /* \u-escapes are only interpreted iff the number of leading
1478 backslashes if odd */
1479 bs = s;
1480 for (;s < end;) {
1481 if (*s != '\\')
1482 break;
1483 *p++ = (unsigned char)*s++;
1484 }
1485 if (((s - bs) & 1) == 0 ||
1486 s >= end ||
1487 *s != 'u') {
1488 continue;
1489 }
1490 p--;
1491 s++;
1492
1493 /* \uXXXX with 4 hex digits */
1494 for (x = 0, i = 0; i < 4; i++) {
1495 c = (unsigned char)s[i];
1496 if (!isxdigit(c)) {
1497 if (unicodeescape_decoding_error(&s, &x, errors,
1498 "truncated \\uXXXX"))
1499 goto onError;
1500 i++;
1501 break;
1502 }
1503 x = (x<<4) & ~0xF;
1504 if (c >= '0' && c <= '9')
1505 x += c - '0';
1506 else if (c >= 'a' && c <= 'f')
1507 x += 10 + c - 'a';
1508 else
1509 x += 10 + c - 'A';
1510 }
1511 s += i;
1512 *p++ = x;
1513 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001514 if (_PyUnicode_Resize(v, (int)(p - buf)))
1515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 return (PyObject *)v;
1517
1518 onError:
1519 Py_XDECREF(v);
1520 return NULL;
1521}
1522
1523PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1524 int size)
1525{
1526 PyObject *repr;
1527 char *p;
1528 char *q;
1529
1530 static const char *hexdigit = "0123456789ABCDEF";
1531
1532 repr = PyString_FromStringAndSize(NULL, 6 * size);
1533 if (repr == NULL)
1534 return NULL;
1535
1536 p = q = PyString_AS_STRING(repr);
1537 while (size-- > 0) {
1538 Py_UNICODE ch = *s++;
1539 /* Map 16-bit characters to '\uxxxx' */
1540 if (ch >= 256) {
1541 *p++ = '\\';
1542 *p++ = 'u';
1543 *p++ = hexdigit[(ch >> 12) & 0xf];
1544 *p++ = hexdigit[(ch >> 8) & 0xf];
1545 *p++ = hexdigit[(ch >> 4) & 0xf];
1546 *p++ = hexdigit[ch & 15];
1547 }
1548 /* Copy everything else as-is */
1549 else
1550 *p++ = (char) ch;
1551 }
1552 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001553 if (_PyString_Resize(&repr, p - q))
1554 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001557
1558 onError:
1559 Py_DECREF(repr);
1560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561}
1562
1563PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1564{
1565 if (!PyUnicode_Check(unicode)) {
1566 PyErr_BadArgument();
1567 return NULL;
1568 }
1569 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1570 PyUnicode_GET_SIZE(unicode));
1571}
1572
1573/* --- Latin-1 Codec ------------------------------------------------------ */
1574
1575PyObject *PyUnicode_DecodeLatin1(const char *s,
1576 int size,
1577 const char *errors)
1578{
1579 PyUnicodeObject *v;
1580 Py_UNICODE *p;
1581
1582 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1583 v = _PyUnicode_New(size);
1584 if (v == NULL)
1585 goto onError;
1586 if (size == 0)
1587 return (PyObject *)v;
1588 p = PyUnicode_AS_UNICODE(v);
1589 while (size-- > 0)
1590 *p++ = (unsigned char)*s++;
1591 return (PyObject *)v;
1592
1593 onError:
1594 Py_XDECREF(v);
1595 return NULL;
1596}
1597
1598static
1599int latin1_encoding_error(const Py_UNICODE **source,
1600 char **dest,
1601 const char *errors,
1602 const char *details)
1603{
1604 if ((errors == NULL) ||
1605 (strcmp(errors,"strict") == 0)) {
1606 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001607 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 details);
1609 return -1;
1610 }
1611 else if (strcmp(errors,"ignore") == 0) {
1612 return 0;
1613 }
1614 else if (strcmp(errors,"replace") == 0) {
1615 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001616 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 return 0;
1618 }
1619 else {
1620 PyErr_Format(PyExc_ValueError,
1621 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001622 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 errors);
1624 return -1;
1625 }
1626}
1627
1628PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1629 int size,
1630 const char *errors)
1631{
1632 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001633 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001634 repr = PyString_FromStringAndSize(NULL, size);
1635 if (repr == NULL)
1636 return NULL;
1637
1638 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001639 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 while (size-- > 0) {
1641 Py_UNICODE ch = *p++;
1642 if (ch >= 256) {
1643 if (latin1_encoding_error(&p, &s, errors,
1644 "ordinal not in range(256)"))
1645 goto onError;
1646 }
1647 else
1648 *s++ = (char)ch;
1649 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001650 /* Resize if error handling skipped some characters */
1651 if (s - start < PyString_GET_SIZE(repr))
1652 if (_PyString_Resize(&repr, s - start))
1653 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654 return repr;
1655
1656 onError:
1657 Py_DECREF(repr);
1658 return NULL;
1659}
1660
1661PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1662{
1663 if (!PyUnicode_Check(unicode)) {
1664 PyErr_BadArgument();
1665 return NULL;
1666 }
1667 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1668 PyUnicode_GET_SIZE(unicode),
1669 NULL);
1670}
1671
1672/* --- 7-bit ASCII Codec -------------------------------------------------- */
1673
1674static
1675int ascii_decoding_error(const char **source,
1676 Py_UNICODE **dest,
1677 const char *errors,
1678 const char *details)
1679{
1680 if ((errors == NULL) ||
1681 (strcmp(errors,"strict") == 0)) {
1682 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001683 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 details);
1685 return -1;
1686 }
1687 else if (strcmp(errors,"ignore") == 0) {
1688 return 0;
1689 }
1690 else if (strcmp(errors,"replace") == 0) {
1691 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1692 (*dest)++;
1693 return 0;
1694 }
1695 else {
1696 PyErr_Format(PyExc_ValueError,
1697 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001698 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 errors);
1700 return -1;
1701 }
1702}
1703
1704PyObject *PyUnicode_DecodeASCII(const char *s,
1705 int size,
1706 const char *errors)
1707{
1708 PyUnicodeObject *v;
1709 Py_UNICODE *p;
1710
1711 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1712 v = _PyUnicode_New(size);
1713 if (v == NULL)
1714 goto onError;
1715 if (size == 0)
1716 return (PyObject *)v;
1717 p = PyUnicode_AS_UNICODE(v);
1718 while (size-- > 0) {
1719 register unsigned char c;
1720
1721 c = (unsigned char)*s++;
1722 if (c < 128)
1723 *p++ = c;
1724 else if (ascii_decoding_error(&s, &p, errors,
1725 "ordinal not in range(128)"))
1726 goto onError;
1727 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001728 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1729 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1730 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 return (PyObject *)v;
1732
1733 onError:
1734 Py_XDECREF(v);
1735 return NULL;
1736}
1737
1738static
1739int ascii_encoding_error(const Py_UNICODE **source,
1740 char **dest,
1741 const char *errors,
1742 const char *details)
1743{
1744 if ((errors == NULL) ||
1745 (strcmp(errors,"strict") == 0)) {
1746 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001747 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 details);
1749 return -1;
1750 }
1751 else if (strcmp(errors,"ignore") == 0) {
1752 return 0;
1753 }
1754 else if (strcmp(errors,"replace") == 0) {
1755 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001756 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 return 0;
1758 }
1759 else {
1760 PyErr_Format(PyExc_ValueError,
1761 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001762 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 errors);
1764 return -1;
1765 }
1766}
1767
1768PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1769 int size,
1770 const char *errors)
1771{
1772 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001773 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 repr = PyString_FromStringAndSize(NULL, size);
1775 if (repr == NULL)
1776 return NULL;
1777
1778 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001779 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 while (size-- > 0) {
1781 Py_UNICODE ch = *p++;
1782 if (ch >= 128) {
1783 if (ascii_encoding_error(&p, &s, errors,
1784 "ordinal not in range(128)"))
1785 goto onError;
1786 }
1787 else
1788 *s++ = (char)ch;
1789 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001790 /* Resize if error handling skipped some characters */
1791 if (s - start < PyString_GET_SIZE(repr))
1792 if (_PyString_Resize(&repr, s - start))
1793 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 return repr;
1795
1796 onError:
1797 Py_DECREF(repr);
1798 return NULL;
1799}
1800
1801PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1802{
1803 if (!PyUnicode_Check(unicode)) {
1804 PyErr_BadArgument();
1805 return NULL;
1806 }
1807 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1808 PyUnicode_GET_SIZE(unicode),
1809 NULL);
1810}
1811
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001812#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001813
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001814/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001815
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001816PyObject *PyUnicode_DecodeMBCS(const char *s,
1817 int size,
1818 const char *errors)
1819{
1820 PyUnicodeObject *v;
1821 Py_UNICODE *p;
1822
1823 /* First get the size of the result */
1824 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001825 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001826 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1827
1828 v = _PyUnicode_New(usize);
1829 if (v == NULL)
1830 return NULL;
1831 if (usize == 0)
1832 return (PyObject *)v;
1833 p = PyUnicode_AS_UNICODE(v);
1834 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1835 Py_DECREF(v);
1836 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1837 }
1838
1839 return (PyObject *)v;
1840}
1841
1842PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1843 int size,
1844 const char *errors)
1845{
1846 PyObject *repr;
1847 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001848 DWORD mbcssize;
1849
1850 /* If there are no characters, bail now! */
1851 if (size==0)
1852 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001853
1854 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001855 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001856 if (mbcssize==0)
1857 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1858
1859 repr = PyString_FromStringAndSize(NULL, mbcssize);
1860 if (repr == NULL)
1861 return NULL;
1862 if (mbcssize==0)
1863 return repr;
1864
1865 /* Do the conversion */
1866 s = PyString_AS_STRING(repr);
1867 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1868 Py_DECREF(repr);
1869 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1870 }
1871 return repr;
1872}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001873
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001874#endif /* MS_WIN32 */
1875
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876/* --- Character Mapping Codec -------------------------------------------- */
1877
1878static
1879int charmap_decoding_error(const char **source,
1880 Py_UNICODE **dest,
1881 const char *errors,
1882 const char *details)
1883{
1884 if ((errors == NULL) ||
1885 (strcmp(errors,"strict") == 0)) {
1886 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001887 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 details);
1889 return -1;
1890 }
1891 else if (strcmp(errors,"ignore") == 0) {
1892 return 0;
1893 }
1894 else if (strcmp(errors,"replace") == 0) {
1895 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1896 (*dest)++;
1897 return 0;
1898 }
1899 else {
1900 PyErr_Format(PyExc_ValueError,
1901 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001902 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 errors);
1904 return -1;
1905 }
1906}
1907
1908PyObject *PyUnicode_DecodeCharmap(const char *s,
1909 int size,
1910 PyObject *mapping,
1911 const char *errors)
1912{
1913 PyUnicodeObject *v;
1914 Py_UNICODE *p;
1915
1916 /* Default to Latin-1 */
1917 if (mapping == NULL)
1918 return PyUnicode_DecodeLatin1(s, size, errors);
1919
1920 v = _PyUnicode_New(size);
1921 if (v == NULL)
1922 goto onError;
1923 if (size == 0)
1924 return (PyObject *)v;
1925 p = PyUnicode_AS_UNICODE(v);
1926 while (size-- > 0) {
1927 unsigned char ch = *s++;
1928 PyObject *w, *x;
1929
1930 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1931 w = PyInt_FromLong((long)ch);
1932 if (w == NULL)
1933 goto onError;
1934 x = PyObject_GetItem(mapping, w);
1935 Py_DECREF(w);
1936 if (x == NULL) {
1937 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1938 /* No mapping found: default to Latin-1 mapping */
1939 PyErr_Clear();
1940 *p++ = (Py_UNICODE)ch;
1941 continue;
1942 }
1943 goto onError;
1944 }
1945
1946 /* Apply mapping */
1947 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001948 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 if (value < 0 || value > 65535) {
1950 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001951 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 Py_DECREF(x);
1953 goto onError;
1954 }
1955 *p++ = (Py_UNICODE)value;
1956 }
1957 else if (x == Py_None) {
1958 /* undefined mapping */
1959 if (charmap_decoding_error(&s, &p, errors,
1960 "character maps to <undefined>")) {
1961 Py_DECREF(x);
1962 goto onError;
1963 }
1964 }
1965 else if (PyUnicode_Check(x)) {
1966 if (PyUnicode_GET_SIZE(x) != 1) {
1967 /* 1-n mapping */
1968 PyErr_SetString(PyExc_NotImplementedError,
1969 "1-n mappings are currently not implemented");
1970 Py_DECREF(x);
1971 goto onError;
1972 }
1973 *p++ = *PyUnicode_AS_UNICODE(x);
1974 }
1975 else {
1976 /* wrong return value */
1977 PyErr_SetString(PyExc_TypeError,
1978 "character mapping must return integer, None or unicode");
1979 Py_DECREF(x);
1980 goto onError;
1981 }
1982 Py_DECREF(x);
1983 }
1984 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1985 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1986 goto onError;
1987 return (PyObject *)v;
1988
1989 onError:
1990 Py_XDECREF(v);
1991 return NULL;
1992}
1993
1994static
1995int charmap_encoding_error(const Py_UNICODE **source,
1996 char **dest,
1997 const char *errors,
1998 const char *details)
1999{
2000 if ((errors == NULL) ||
2001 (strcmp(errors,"strict") == 0)) {
2002 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002003 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 details);
2005 return -1;
2006 }
2007 else if (strcmp(errors,"ignore") == 0) {
2008 return 0;
2009 }
2010 else if (strcmp(errors,"replace") == 0) {
2011 **dest = '?';
2012 (*dest)++;
2013 return 0;
2014 }
2015 else {
2016 PyErr_Format(PyExc_ValueError,
2017 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002018 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 errors);
2020 return -1;
2021 }
2022}
2023
2024PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2025 int size,
2026 PyObject *mapping,
2027 const char *errors)
2028{
2029 PyObject *v;
2030 char *s;
2031
2032 /* Default to Latin-1 */
2033 if (mapping == NULL)
2034 return PyUnicode_EncodeLatin1(p, size, errors);
2035
2036 v = PyString_FromStringAndSize(NULL, size);
2037 if (v == NULL)
2038 return NULL;
2039 s = PyString_AS_STRING(v);
2040 while (size-- > 0) {
2041 Py_UNICODE ch = *p++;
2042 PyObject *w, *x;
2043
2044 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2045 w = PyInt_FromLong((long)ch);
2046 if (w == NULL)
2047 goto onError;
2048 x = PyObject_GetItem(mapping, w);
2049 Py_DECREF(w);
2050 if (x == NULL) {
2051 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2052 /* No mapping found: default to Latin-1 mapping if possible */
2053 PyErr_Clear();
2054 if (ch < 256) {
2055 *s++ = (char)ch;
2056 continue;
2057 }
2058 else if (!charmap_encoding_error(&p, &s, errors,
2059 "missing character mapping"))
2060 continue;
2061 }
2062 goto onError;
2063 }
2064
2065 /* Apply mapping */
2066 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002067 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 if (value < 0 || value > 255) {
2069 PyErr_SetString(PyExc_TypeError,
2070 "character mapping must be in range(256)");
2071 Py_DECREF(x);
2072 goto onError;
2073 }
2074 *s++ = (char)value;
2075 }
2076 else if (x == Py_None) {
2077 /* undefined mapping */
2078 if (charmap_encoding_error(&p, &s, errors,
2079 "character maps to <undefined>")) {
2080 Py_DECREF(x);
2081 goto onError;
2082 }
2083 }
2084 else if (PyString_Check(x)) {
2085 if (PyString_GET_SIZE(x) != 1) {
2086 /* 1-n mapping */
2087 PyErr_SetString(PyExc_NotImplementedError,
2088 "1-n mappings are currently not implemented");
2089 Py_DECREF(x);
2090 goto onError;
2091 }
2092 *s++ = *PyString_AS_STRING(x);
2093 }
2094 else {
2095 /* wrong return value */
2096 PyErr_SetString(PyExc_TypeError,
2097 "character mapping must return integer, None or unicode");
2098 Py_DECREF(x);
2099 goto onError;
2100 }
2101 Py_DECREF(x);
2102 }
2103 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2104 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2105 goto onError;
2106 return v;
2107
2108 onError:
2109 Py_DECREF(v);
2110 return NULL;
2111}
2112
2113PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2114 PyObject *mapping)
2115{
2116 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2117 PyErr_BadArgument();
2118 return NULL;
2119 }
2120 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2121 PyUnicode_GET_SIZE(unicode),
2122 mapping,
2123 NULL);
2124}
2125
2126static
2127int translate_error(const Py_UNICODE **source,
2128 Py_UNICODE **dest,
2129 const char *errors,
2130 const char *details)
2131{
2132 if ((errors == NULL) ||
2133 (strcmp(errors,"strict") == 0)) {
2134 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002135 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 details);
2137 return -1;
2138 }
2139 else if (strcmp(errors,"ignore") == 0) {
2140 return 0;
2141 }
2142 else if (strcmp(errors,"replace") == 0) {
2143 **dest = '?';
2144 (*dest)++;
2145 return 0;
2146 }
2147 else {
2148 PyErr_Format(PyExc_ValueError,
2149 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002150 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151 errors);
2152 return -1;
2153 }
2154}
2155
2156PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2157 int size,
2158 PyObject *mapping,
2159 const char *errors)
2160{
2161 PyUnicodeObject *v;
2162 Py_UNICODE *p;
2163
2164 if (mapping == NULL) {
2165 PyErr_BadArgument();
2166 return NULL;
2167 }
2168
2169 /* Output will never be longer than input */
2170 v = _PyUnicode_New(size);
2171 if (v == NULL)
2172 goto onError;
2173 if (size == 0)
2174 goto done;
2175 p = PyUnicode_AS_UNICODE(v);
2176 while (size-- > 0) {
2177 Py_UNICODE ch = *s++;
2178 PyObject *w, *x;
2179
2180 /* Get mapping */
2181 w = PyInt_FromLong(ch);
2182 if (w == NULL)
2183 goto onError;
2184 x = PyObject_GetItem(mapping, w);
2185 Py_DECREF(w);
2186 if (x == NULL) {
2187 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2188 /* No mapping found: default to 1-1 mapping */
2189 PyErr_Clear();
2190 *p++ = ch;
2191 continue;
2192 }
2193 goto onError;
2194 }
2195
2196 /* Apply mapping */
2197 if (PyInt_Check(x))
2198 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2199 else if (x == Py_None) {
2200 /* undefined mapping */
2201 if (translate_error(&s, &p, errors,
2202 "character maps to <undefined>")) {
2203 Py_DECREF(x);
2204 goto onError;
2205 }
2206 }
2207 else if (PyUnicode_Check(x)) {
2208 if (PyUnicode_GET_SIZE(x) != 1) {
2209 /* 1-n mapping */
2210 PyErr_SetString(PyExc_NotImplementedError,
2211 "1-n mappings are currently not implemented");
2212 Py_DECREF(x);
2213 goto onError;
2214 }
2215 *p++ = *PyUnicode_AS_UNICODE(x);
2216 }
2217 else {
2218 /* wrong return value */
2219 PyErr_SetString(PyExc_TypeError,
2220 "translate mapping must return integer, None or unicode");
2221 Py_DECREF(x);
2222 goto onError;
2223 }
2224 Py_DECREF(x);
2225 }
2226 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002227 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229
2230 done:
2231 return (PyObject *)v;
2232
2233 onError:
2234 Py_XDECREF(v);
2235 return NULL;
2236}
2237
2238PyObject *PyUnicode_Translate(PyObject *str,
2239 PyObject *mapping,
2240 const char *errors)
2241{
2242 PyObject *result;
2243
2244 str = PyUnicode_FromObject(str);
2245 if (str == NULL)
2246 goto onError;
2247 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2248 PyUnicode_GET_SIZE(str),
2249 mapping,
2250 errors);
2251 Py_DECREF(str);
2252 return result;
2253
2254 onError:
2255 Py_XDECREF(str);
2256 return NULL;
2257}
2258
Guido van Rossum9e896b32000-04-05 20:11:21 +00002259/* --- Decimal Encoder ---------------------------------------------------- */
2260
2261int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2262 int length,
2263 char *output,
2264 const char *errors)
2265{
2266 Py_UNICODE *p, *end;
2267
2268 if (output == NULL) {
2269 PyErr_BadArgument();
2270 return -1;
2271 }
2272
2273 p = s;
2274 end = s + length;
2275 while (p < end) {
2276 register Py_UNICODE ch = *p++;
2277 int decimal;
2278
2279 if (Py_UNICODE_ISSPACE(ch)) {
2280 *output++ = ' ';
2281 continue;
2282 }
2283 decimal = Py_UNICODE_TODECIMAL(ch);
2284 if (decimal >= 0) {
2285 *output++ = '0' + decimal;
2286 continue;
2287 }
Guido van Rossumba477042000-04-06 18:18:10 +00002288 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002289 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002290 continue;
2291 }
2292 /* All other characters are considered invalid */
2293 if (errors == NULL || strcmp(errors, "strict") == 0) {
2294 PyErr_SetString(PyExc_ValueError,
2295 "invalid decimal Unicode string");
2296 goto onError;
2297 }
2298 else if (strcmp(errors, "ignore") == 0)
2299 continue;
2300 else if (strcmp(errors, "replace") == 0) {
2301 *output++ = '?';
2302 continue;
2303 }
2304 }
2305 /* 0-terminate the output string */
2306 *output++ = '\0';
2307 return 0;
2308
2309 onError:
2310 return -1;
2311}
2312
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313/* --- Helpers ------------------------------------------------------------ */
2314
2315static
2316int count(PyUnicodeObject *self,
2317 int start,
2318 int end,
2319 PyUnicodeObject *substring)
2320{
2321 int count = 0;
2322
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002323 if (substring->length == 0)
2324 return (end - start + 1);
2325
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 end -= substring->length;
2327
2328 while (start <= end)
2329 if (Py_UNICODE_MATCH(self, start, substring)) {
2330 count++;
2331 start += substring->length;
2332 } else
2333 start++;
2334
2335 return count;
2336}
2337
2338int PyUnicode_Count(PyObject *str,
2339 PyObject *substr,
2340 int start,
2341 int end)
2342{
2343 int result;
2344
2345 str = PyUnicode_FromObject(str);
2346 if (str == NULL)
2347 return -1;
2348 substr = PyUnicode_FromObject(substr);
2349 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002350 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 return -1;
2352 }
2353
2354 result = count((PyUnicodeObject *)str,
2355 start, end,
2356 (PyUnicodeObject *)substr);
2357
2358 Py_DECREF(str);
2359 Py_DECREF(substr);
2360 return result;
2361}
2362
2363static
2364int findstring(PyUnicodeObject *self,
2365 PyUnicodeObject *substring,
2366 int start,
2367 int end,
2368 int direction)
2369{
2370 if (start < 0)
2371 start += self->length;
2372 if (start < 0)
2373 start = 0;
2374
2375 if (substring->length == 0)
2376 return start;
2377
2378 if (end > self->length)
2379 end = self->length;
2380 if (end < 0)
2381 end += self->length;
2382 if (end < 0)
2383 end = 0;
2384
2385 end -= substring->length;
2386
2387 if (direction < 0) {
2388 for (; end >= start; end--)
2389 if (Py_UNICODE_MATCH(self, end, substring))
2390 return end;
2391 } else {
2392 for (; start <= end; start++)
2393 if (Py_UNICODE_MATCH(self, start, substring))
2394 return start;
2395 }
2396
2397 return -1;
2398}
2399
2400int PyUnicode_Find(PyObject *str,
2401 PyObject *substr,
2402 int start,
2403 int end,
2404 int direction)
2405{
2406 int result;
2407
2408 str = PyUnicode_FromObject(str);
2409 if (str == NULL)
2410 return -1;
2411 substr = PyUnicode_FromObject(substr);
2412 if (substr == NULL) {
2413 Py_DECREF(substr);
2414 return -1;
2415 }
2416
2417 result = findstring((PyUnicodeObject *)str,
2418 (PyUnicodeObject *)substr,
2419 start, end, direction);
2420 Py_DECREF(str);
2421 Py_DECREF(substr);
2422 return result;
2423}
2424
2425static
2426int tailmatch(PyUnicodeObject *self,
2427 PyUnicodeObject *substring,
2428 int start,
2429 int end,
2430 int direction)
2431{
2432 if (start < 0)
2433 start += self->length;
2434 if (start < 0)
2435 start = 0;
2436
2437 if (substring->length == 0)
2438 return 1;
2439
2440 if (end > self->length)
2441 end = self->length;
2442 if (end < 0)
2443 end += self->length;
2444 if (end < 0)
2445 end = 0;
2446
2447 end -= substring->length;
2448 if (end < start)
2449 return 0;
2450
2451 if (direction > 0) {
2452 if (Py_UNICODE_MATCH(self, end, substring))
2453 return 1;
2454 } else {
2455 if (Py_UNICODE_MATCH(self, start, substring))
2456 return 1;
2457 }
2458
2459 return 0;
2460}
2461
2462int PyUnicode_Tailmatch(PyObject *str,
2463 PyObject *substr,
2464 int start,
2465 int end,
2466 int direction)
2467{
2468 int result;
2469
2470 str = PyUnicode_FromObject(str);
2471 if (str == NULL)
2472 return -1;
2473 substr = PyUnicode_FromObject(substr);
2474 if (substr == NULL) {
2475 Py_DECREF(substr);
2476 return -1;
2477 }
2478
2479 result = tailmatch((PyUnicodeObject *)str,
2480 (PyUnicodeObject *)substr,
2481 start, end, direction);
2482 Py_DECREF(str);
2483 Py_DECREF(substr);
2484 return result;
2485}
2486
2487static
2488const Py_UNICODE *findchar(const Py_UNICODE *s,
2489 int size,
2490 Py_UNICODE ch)
2491{
2492 /* like wcschr, but doesn't stop at NULL characters */
2493
2494 while (size-- > 0) {
2495 if (*s == ch)
2496 return s;
2497 s++;
2498 }
2499
2500 return NULL;
2501}
2502
2503/* Apply fixfct filter to the Unicode object self and return a
2504 reference to the modified object */
2505
2506static
2507PyObject *fixup(PyUnicodeObject *self,
2508 int (*fixfct)(PyUnicodeObject *s))
2509{
2510
2511 PyUnicodeObject *u;
2512
2513 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2514 self->length);
2515 if (u == NULL)
2516 return NULL;
2517 if (!fixfct(u)) {
2518 /* fixfct should return TRUE if it modified the buffer. If
2519 FALSE, return a reference to the original buffer instead
2520 (to save space, not time) */
2521 Py_INCREF(self);
2522 Py_DECREF(u);
2523 return (PyObject*) self;
2524 }
2525 return (PyObject*) u;
2526}
2527
2528static
2529int fixupper(PyUnicodeObject *self)
2530{
2531 int len = self->length;
2532 Py_UNICODE *s = self->str;
2533 int status = 0;
2534
2535 while (len-- > 0) {
2536 register Py_UNICODE ch;
2537
2538 ch = Py_UNICODE_TOUPPER(*s);
2539 if (ch != *s) {
2540 status = 1;
2541 *s = ch;
2542 }
2543 s++;
2544 }
2545
2546 return status;
2547}
2548
2549static
2550int fixlower(PyUnicodeObject *self)
2551{
2552 int len = self->length;
2553 Py_UNICODE *s = self->str;
2554 int status = 0;
2555
2556 while (len-- > 0) {
2557 register Py_UNICODE ch;
2558
2559 ch = Py_UNICODE_TOLOWER(*s);
2560 if (ch != *s) {
2561 status = 1;
2562 *s = ch;
2563 }
2564 s++;
2565 }
2566
2567 return status;
2568}
2569
2570static
2571int fixswapcase(PyUnicodeObject *self)
2572{
2573 int len = self->length;
2574 Py_UNICODE *s = self->str;
2575 int status = 0;
2576
2577 while (len-- > 0) {
2578 if (Py_UNICODE_ISUPPER(*s)) {
2579 *s = Py_UNICODE_TOLOWER(*s);
2580 status = 1;
2581 } else if (Py_UNICODE_ISLOWER(*s)) {
2582 *s = Py_UNICODE_TOUPPER(*s);
2583 status = 1;
2584 }
2585 s++;
2586 }
2587
2588 return status;
2589}
2590
2591static
2592int fixcapitalize(PyUnicodeObject *self)
2593{
2594 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2595 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2596 return 1;
2597 }
2598 return 0;
2599}
2600
2601static
2602int fixtitle(PyUnicodeObject *self)
2603{
2604 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2605 register Py_UNICODE *e;
2606 int previous_is_cased;
2607
2608 /* Shortcut for single character strings */
2609 if (PyUnicode_GET_SIZE(self) == 1) {
2610 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2611 if (*p != ch) {
2612 *p = ch;
2613 return 1;
2614 }
2615 else
2616 return 0;
2617 }
2618
2619 e = p + PyUnicode_GET_SIZE(self);
2620 previous_is_cased = 0;
2621 for (; p < e; p++) {
2622 register const Py_UNICODE ch = *p;
2623
2624 if (previous_is_cased)
2625 *p = Py_UNICODE_TOLOWER(ch);
2626 else
2627 *p = Py_UNICODE_TOTITLE(ch);
2628
2629 if (Py_UNICODE_ISLOWER(ch) ||
2630 Py_UNICODE_ISUPPER(ch) ||
2631 Py_UNICODE_ISTITLE(ch))
2632 previous_is_cased = 1;
2633 else
2634 previous_is_cased = 0;
2635 }
2636 return 1;
2637}
2638
2639PyObject *PyUnicode_Join(PyObject *separator,
2640 PyObject *seq)
2641{
2642 Py_UNICODE *sep;
2643 int seplen;
2644 PyUnicodeObject *res = NULL;
2645 int reslen = 0;
2646 Py_UNICODE *p;
2647 int seqlen = 0;
2648 int sz = 100;
2649 int i;
2650
2651 seqlen = PySequence_Length(seq);
2652 if (seqlen < 0 && PyErr_Occurred())
2653 return NULL;
2654
2655 if (separator == NULL) {
2656 Py_UNICODE blank = ' ';
2657 sep = &blank;
2658 seplen = 1;
2659 }
2660 else {
2661 separator = PyUnicode_FromObject(separator);
2662 if (separator == NULL)
2663 return NULL;
2664 sep = PyUnicode_AS_UNICODE(separator);
2665 seplen = PyUnicode_GET_SIZE(separator);
2666 }
2667
2668 res = _PyUnicode_New(sz);
2669 if (res == NULL)
2670 goto onError;
2671 p = PyUnicode_AS_UNICODE(res);
2672 reslen = 0;
2673
2674 for (i = 0; i < seqlen; i++) {
2675 int itemlen;
2676 PyObject *item;
2677
2678 item = PySequence_GetItem(seq, i);
2679 if (item == NULL)
2680 goto onError;
2681 if (!PyUnicode_Check(item)) {
2682 PyObject *v;
2683 v = PyUnicode_FromObject(item);
2684 Py_DECREF(item);
2685 item = v;
2686 if (item == NULL)
2687 goto onError;
2688 }
2689 itemlen = PyUnicode_GET_SIZE(item);
2690 while (reslen + itemlen + seplen >= sz) {
2691 if (_PyUnicode_Resize(res, sz*2))
2692 goto onError;
2693 sz *= 2;
2694 p = PyUnicode_AS_UNICODE(res) + reslen;
2695 }
2696 if (i > 0) {
2697 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2698 p += seplen;
2699 reslen += seplen;
2700 }
2701 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2702 p += itemlen;
2703 reslen += itemlen;
2704 Py_DECREF(item);
2705 }
2706 if (_PyUnicode_Resize(res, reslen))
2707 goto onError;
2708
2709 Py_XDECREF(separator);
2710 return (PyObject *)res;
2711
2712 onError:
2713 Py_XDECREF(separator);
2714 Py_DECREF(res);
2715 return NULL;
2716}
2717
2718static
2719PyUnicodeObject *pad(PyUnicodeObject *self,
2720 int left,
2721 int right,
2722 Py_UNICODE fill)
2723{
2724 PyUnicodeObject *u;
2725
2726 if (left < 0)
2727 left = 0;
2728 if (right < 0)
2729 right = 0;
2730
2731 if (left == 0 && right == 0) {
2732 Py_INCREF(self);
2733 return self;
2734 }
2735
2736 u = _PyUnicode_New(left + self->length + right);
2737 if (u) {
2738 if (left)
2739 Py_UNICODE_FILL(u->str, fill, left);
2740 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2741 if (right)
2742 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2743 }
2744
2745 return u;
2746}
2747
/* Append the slice data[left:right] to list as a new Unicode object.

   The expansion relies on names from the enclosing function: a
   PyObject *list to append to, a scratch PyObject *str, and an onError
   label that is jumped to when creating or appending the slice fails.

   Wrapped in do { } while (0) so that it behaves like one statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
2758
2759static
2760PyObject *split_whitespace(PyUnicodeObject *self,
2761 PyObject *list,
2762 int maxcount)
2763{
2764 register int i;
2765 register int j;
2766 int len = self->length;
2767 PyObject *str;
2768
2769 for (i = j = 0; i < len; ) {
2770 /* find a token */
2771 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2772 i++;
2773 j = i;
2774 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2775 i++;
2776 if (j < i) {
2777 if (maxcount-- <= 0)
2778 break;
2779 SPLIT_APPEND(self->str, j, i);
2780 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2781 i++;
2782 j = i;
2783 }
2784 }
2785 if (j < len) {
2786 SPLIT_APPEND(self->str, j, len);
2787 }
2788 return list;
2789
2790 onError:
2791 Py_DECREF(list);
2792 return NULL;
2793}
2794
2795PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002796 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797{
2798 register int i;
2799 register int j;
2800 int len;
2801 PyObject *list;
2802 PyObject *str;
2803 Py_UNICODE *data;
2804
2805 string = PyUnicode_FromObject(string);
2806 if (string == NULL)
2807 return NULL;
2808 data = PyUnicode_AS_UNICODE(string);
2809 len = PyUnicode_GET_SIZE(string);
2810
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811 list = PyList_New(0);
2812 if (!list)
2813 goto onError;
2814
2815 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002816 int eol;
2817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 /* Find a line and append it */
2819 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2820 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821
2822 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002823 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 if (i < len) {
2825 if (data[i] == '\r' && i + 1 < len &&
2826 data[i+1] == '\n')
2827 i += 2;
2828 else
2829 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002830 if (keepends)
2831 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 }
Guido van Rossum86662912000-04-11 15:38:46 +00002833 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 j = i;
2835 }
2836 if (j < len) {
2837 SPLIT_APPEND(data, j, len);
2838 }
2839
2840 Py_DECREF(string);
2841 return list;
2842
2843 onError:
2844 Py_DECREF(list);
2845 Py_DECREF(string);
2846 return NULL;
2847}
2848
2849static
2850PyObject *split_char(PyUnicodeObject *self,
2851 PyObject *list,
2852 Py_UNICODE ch,
2853 int maxcount)
2854{
2855 register int i;
2856 register int j;
2857 int len = self->length;
2858 PyObject *str;
2859
2860 for (i = j = 0; i < len; ) {
2861 if (self->str[i] == ch) {
2862 if (maxcount-- <= 0)
2863 break;
2864 SPLIT_APPEND(self->str, j, i);
2865 i = j = i + 1;
2866 } else
2867 i++;
2868 }
2869 if (j <= len) {
2870 SPLIT_APPEND(self->str, j, len);
2871 }
2872 return list;
2873
2874 onError:
2875 Py_DECREF(list);
2876 return NULL;
2877}
2878
2879static
2880PyObject *split_substring(PyUnicodeObject *self,
2881 PyObject *list,
2882 PyUnicodeObject *substring,
2883 int maxcount)
2884{
2885 register int i;
2886 register int j;
2887 int len = self->length;
2888 int sublen = substring->length;
2889 PyObject *str;
2890
2891 for (i = j = 0; i < len - sublen; ) {
2892 if (Py_UNICODE_MATCH(self, i, substring)) {
2893 if (maxcount-- <= 0)
2894 break;
2895 SPLIT_APPEND(self->str, j, i);
2896 i = j = i + sublen;
2897 } else
2898 i++;
2899 }
2900 if (j <= len) {
2901 SPLIT_APPEND(self->str, j, len);
2902 }
2903 return list;
2904
2905 onError:
2906 Py_DECREF(list);
2907 return NULL;
2908}
2909
2910#undef SPLIT_APPEND
2911
2912static
2913PyObject *split(PyUnicodeObject *self,
2914 PyUnicodeObject *substring,
2915 int maxcount)
2916{
2917 PyObject *list;
2918
2919 if (maxcount < 0)
2920 maxcount = INT_MAX;
2921
2922 list = PyList_New(0);
2923 if (!list)
2924 return NULL;
2925
2926 if (substring == NULL)
2927 return split_whitespace(self,list,maxcount);
2928
2929 else if (substring->length == 1)
2930 return split_char(self,list,substring->str[0],maxcount);
2931
2932 else if (substring->length == 0) {
2933 Py_DECREF(list);
2934 PyErr_SetString(PyExc_ValueError, "empty separator");
2935 return NULL;
2936 }
2937 else
2938 return split_substring(self,list,substring,maxcount);
2939}
2940
2941static
2942PyObject *strip(PyUnicodeObject *self,
2943 int left,
2944 int right)
2945{
2946 Py_UNICODE *p = self->str;
2947 int start = 0;
2948 int end = self->length;
2949
2950 if (left)
2951 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2952 start++;
2953
2954 if (right)
2955 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2956 end--;
2957
2958 if (start == 0 && end == self->length) {
2959 /* couldn't strip anything off, return original string */
2960 Py_INCREF(self);
2961 return (PyObject*) self;
2962 }
2963
2964 return (PyObject*) PyUnicode_FromUnicode(
2965 self->str + start,
2966 end - start
2967 );
2968}
2969
2970static
2971PyObject *replace(PyUnicodeObject *self,
2972 PyUnicodeObject *str1,
2973 PyUnicodeObject *str2,
2974 int maxcount)
2975{
2976 PyUnicodeObject *u;
2977
2978 if (maxcount < 0)
2979 maxcount = INT_MAX;
2980
2981 if (str1->length == 1 && str2->length == 1) {
2982 int i;
2983
2984 /* replace characters */
2985 if (!findchar(self->str, self->length, str1->str[0])) {
2986 /* nothing to replace, return original string */
2987 Py_INCREF(self);
2988 u = self;
2989 } else {
2990 Py_UNICODE u1 = str1->str[0];
2991 Py_UNICODE u2 = str2->str[0];
2992
2993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2994 self->str,
2995 self->length
2996 );
2997 if (u)
2998 for (i = 0; i < u->length; i++)
2999 if (u->str[i] == u1) {
3000 if (--maxcount < 0)
3001 break;
3002 u->str[i] = u2;
3003 }
3004 }
3005
3006 } else {
3007 int n, i;
3008 Py_UNICODE *p;
3009
3010 /* replace strings */
3011 n = count(self, 0, self->length, str1);
3012 if (n > maxcount)
3013 n = maxcount;
3014 if (n == 0) {
3015 /* nothing to replace, return original string */
3016 Py_INCREF(self);
3017 u = self;
3018 } else {
3019 u = _PyUnicode_New(
3020 self->length + n * (str2->length - str1->length));
3021 if (u) {
3022 i = 0;
3023 p = u->str;
3024 while (i <= self->length - str1->length)
3025 if (Py_UNICODE_MATCH(self, i, str1)) {
3026 /* replace string segment */
3027 Py_UNICODE_COPY(p, str2->str, str2->length);
3028 p += str2->length;
3029 i += str1->length;
3030 if (--n <= 0) {
3031 /* copy remaining part */
3032 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3033 break;
3034 }
3035 } else
3036 *p++ = self->str[i++];
3037 }
3038 }
3039 }
3040
3041 return (PyObject *) u;
3042}
3043
3044/* --- Unicode Object Methods --------------------------------------------- */
3045
3046static char title__doc__[] =
3047"S.title() -> unicode\n\
3048\n\
3049Return a titlecased version of S, i.e. words start with title case\n\
3050characters, all remaining cased characters have lower case.";
3051
3052static PyObject*
3053unicode_title(PyUnicodeObject *self, PyObject *args)
3054{
3055 if (!PyArg_NoArgs(args))
3056 return NULL;
3057 return fixup(self, fixtitle);
3058}
3059
3060static char capitalize__doc__[] =
3061"S.capitalize() -> unicode\n\
3062\n\
3063Return a capitalized version of S, i.e. make the first character\n\
3064have upper case.";
3065
3066static PyObject*
3067unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3068{
3069 if (!PyArg_NoArgs(args))
3070 return NULL;
3071 return fixup(self, fixcapitalize);
3072}
3073
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split at whitespace,
   capitalize every word and join the words with single spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined = NULL;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized form */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *word;

        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                     fixcapitalize);
        if (word == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, word);
    }

    /* A NULL separator makes PyUnicode_Join() use a single space */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
3114
3115static char center__doc__[] =
3116"S.center(width) -> unicode\n\
3117\n\
3118Return S centered in a Unicode string of length width. Padding is done\n\
3119using spaces.";
3120
3121static PyObject *
3122unicode_center(PyUnicodeObject *self, PyObject *args)
3123{
3124 int marg, left;
3125 int width;
3126
3127 if (!PyArg_ParseTuple(args, "i:center", &width))
3128 return NULL;
3129
3130 if (self->length >= width) {
3131 Py_INCREF(self);
3132 return (PyObject*) self;
3133 }
3134
3135 marg = width - self->length;
3136 left = marg / 2 + (marg & width & 1);
3137
3138 return (PyObject*) pad(self, left, marg - left, ' ');
3139}
3140
/* Speedy UTF-16 code point order comparison, gleaned from:
   http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode

   Adjustment table used by unicode_compare(), indexed by the top five
   bits (value >> 11) of a 16-bit code unit.  Only entries 27..31 are
   non-zero. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 26 */ 0, 0, 0,
    /* 27       */ 0x2000,
    /* 28 .. 31 */ -0x800, -0x800, -0x800, -0x800
};
3152
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153static int
3154unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3155{
3156 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003157
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 Py_UNICODE *s1 = str1->str;
3159 Py_UNICODE *s2 = str2->str;
3160
3161 len1 = str1->length;
3162 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003163
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003165 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003166 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003167
3168 c1 = *s1++;
3169 c2 = *s2++;
3170 if (c1 > (1<<11) * 26)
3171 c1 += utf16Fixup[c1>>11];
3172 if (c2 > (1<<11) * 26)
3173 c2 += utf16Fixup[c2>>11];
3174
3175 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003176 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003177 if (diff)
3178 return (diff < 0) ? -1 : (diff != 0);
3179 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 }
3181
3182 return (len1 < len2) ? -1 : (len1 != len2);
3183}
3184
3185int PyUnicode_Compare(PyObject *left,
3186 PyObject *right)
3187{
3188 PyUnicodeObject *u = NULL, *v = NULL;
3189 int result;
3190
3191 /* Coerce the two arguments */
3192 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3193 if (u == NULL)
3194 goto onError;
3195 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3196 if (v == NULL)
3197 goto onError;
3198
3199 /* Shortcut for emtpy or interned objects */
3200 if (v == u) {
3201 Py_DECREF(u);
3202 Py_DECREF(v);
3203 return 0;
3204 }
3205
3206 result = unicode_compare(u, v);
3207
3208 Py_DECREF(u);
3209 Py_DECREF(v);
3210 return result;
3211
3212onError:
3213 Py_XDECREF(u);
3214 Py_XDECREF(v);
3215 return -1;
3216}
3217
Guido van Rossum403d68b2000-03-13 15:55:09 +00003218int PyUnicode_Contains(PyObject *container,
3219 PyObject *element)
3220{
3221 PyUnicodeObject *u = NULL, *v = NULL;
3222 int result;
3223 register const Py_UNICODE *p, *e;
3224 register Py_UNICODE ch;
3225
3226 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003227 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003228 if (v == NULL) {
3229 PyErr_SetString(PyExc_TypeError,
3230 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003231 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003232 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003233 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3234 if (u == NULL) {
3235 Py_DECREF(v);
3236 goto onError;
3237 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003238
3239 /* Check v in u */
3240 if (PyUnicode_GET_SIZE(v) != 1) {
3241 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003242 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003243 goto onError;
3244 }
3245 ch = *PyUnicode_AS_UNICODE(v);
3246 p = PyUnicode_AS_UNICODE(u);
3247 e = p + PyUnicode_GET_SIZE(u);
3248 result = 0;
3249 while (p < e) {
3250 if (*p++ == ch) {
3251 result = 1;
3252 break;
3253 }
3254 }
3255
3256 Py_DECREF(u);
3257 Py_DECREF(v);
3258 return result;
3259
3260onError:
3261 Py_XDECREF(u);
3262 Py_XDECREF(v);
3263 return -1;
3264}
3265
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266/* Concat to string or Unicode object giving a new Unicode object. */
3267
3268PyObject *PyUnicode_Concat(PyObject *left,
3269 PyObject *right)
3270{
3271 PyUnicodeObject *u = NULL, *v = NULL, *w;
3272
3273 /* Coerce the two arguments */
3274 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3275 if (u == NULL)
3276 goto onError;
3277 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3278 if (v == NULL)
3279 goto onError;
3280
3281 /* Shortcuts */
3282 if (v == unicode_empty) {
3283 Py_DECREF(v);
3284 return (PyObject *)u;
3285 }
3286 if (u == unicode_empty) {
3287 Py_DECREF(u);
3288 return (PyObject *)v;
3289 }
3290
3291 /* Concat the two Unicode strings */
3292 w = _PyUnicode_New(u->length + v->length);
3293 if (w == NULL)
3294 goto onError;
3295 Py_UNICODE_COPY(w->str, u->str, u->length);
3296 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3297
3298 Py_DECREF(u);
3299 Py_DECREF(v);
3300 return (PyObject *)w;
3301
3302onError:
3303 Py_XDECREF(u);
3304 Py_XDECREF(v);
3305 return NULL;
3306}
3307
3308static char count__doc__[] =
3309"S.count(sub[, start[, end]]) -> int\n\
3310\n\
3311Return the number of occurrences of substring sub in Unicode string\n\
3312S[start:end]. Optional arguments start and end are\n\
3313interpreted as in slice notation.";
3314
3315static PyObject *
3316unicode_count(PyUnicodeObject *self, PyObject *args)
3317{
3318 PyUnicodeObject *substring;
3319 int start = 0;
3320 int end = INT_MAX;
3321 PyObject *result;
3322
Guido van Rossumb8872e62000-05-09 14:14:27 +00003323 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3324 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 return NULL;
3326
3327 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3328 (PyObject *)substring);
3329 if (substring == NULL)
3330 return NULL;
3331
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 if (start < 0)
3333 start += self->length;
3334 if (start < 0)
3335 start = 0;
3336 if (end > self->length)
3337 end = self->length;
3338 if (end < 0)
3339 end += self->length;
3340 if (end < 0)
3341 end = 0;
3342
3343 result = PyInt_FromLong((long) count(self, start, end, substring));
3344
3345 Py_DECREF(substring);
3346 return result;
3347}
3348
3349static char encode__doc__[] =
3350"S.encode([encoding[,errors]]) -> string\n\
3351\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003352Return an encoded string version of S. Default encoding is the current\n\
3353default string encoding. errors may be given to set a different error\n\
3354handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3355a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356
3357static PyObject *
3358unicode_encode(PyUnicodeObject *self, PyObject *args)
3359{
3360 char *encoding = NULL;
3361 char *errors = NULL;
3362 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3363 return NULL;
3364 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3365}
3366
3367static char expandtabs__doc__[] =
3368"S.expandtabs([tabsize]) -> unicode\n\
3369\n\
3370Return a copy of S where all tab characters are expanded using spaces.\n\
3371If tabsize is not given, a tab size of 8 characters is assumed.";
3372
3373static PyObject*
3374unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3375{
3376 Py_UNICODE *e;
3377 Py_UNICODE *p;
3378 Py_UNICODE *q;
3379 int i, j;
3380 PyUnicodeObject *u;
3381 int tabsize = 8;
3382
3383 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3384 return NULL;
3385
3386 /* First pass: determine size of ouput string */
3387 i = j = 0;
3388 e = self->str + self->length;
3389 for (p = self->str; p < e; p++)
3390 if (*p == '\t') {
3391 if (tabsize > 0)
3392 j += tabsize - (j % tabsize);
3393 }
3394 else {
3395 j++;
3396 if (*p == '\n' || *p == '\r') {
3397 i += j;
3398 j = 0;
3399 }
3400 }
3401
3402 /* Second pass: create output string and fill it */
3403 u = _PyUnicode_New(i + j);
3404 if (!u)
3405 return NULL;
3406
3407 j = 0;
3408 q = u->str;
3409
3410 for (p = self->str; p < e; p++)
3411 if (*p == '\t') {
3412 if (tabsize > 0) {
3413 i = tabsize - (j % tabsize);
3414 j += i;
3415 while (i--)
3416 *q++ = ' ';
3417 }
3418 }
3419 else {
3420 j++;
3421 *q++ = *p;
3422 if (*p == '\n' || *p == '\r')
3423 j = 0;
3424 }
3425
3426 return (PyObject*) u;
3427}
3428
3429static char find__doc__[] =
3430"S.find(sub [,start [,end]]) -> int\n\
3431\n\
3432Return the lowest index in S where substring sub is found,\n\
3433such that sub is contained within s[start,end]. Optional\n\
3434arguments start and end are interpreted as in slice notation.\n\
3435\n\
3436Return -1 on failure.";
3437
3438static PyObject *
3439unicode_find(PyUnicodeObject *self, PyObject *args)
3440{
3441 PyUnicodeObject *substring;
3442 int start = 0;
3443 int end = INT_MAX;
3444 PyObject *result;
3445
Guido van Rossumb8872e62000-05-09 14:14:27 +00003446 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3447 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 return NULL;
3449 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3450 (PyObject *)substring);
3451 if (substring == NULL)
3452 return NULL;
3453
3454 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3455
3456 Py_DECREF(substring);
3457 return result;
3458}
3459
3460static PyObject *
3461unicode_getitem(PyUnicodeObject *self, int index)
3462{
3463 if (index < 0 || index >= self->length) {
3464 PyErr_SetString(PyExc_IndexError, "string index out of range");
3465 return NULL;
3466 }
3467
3468 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3469}
3470
3471static long
3472unicode_hash(PyUnicodeObject *self)
3473{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003474 /* Since Unicode objects compare equal to their ASCII string
3475 counterparts, they should use the individual character values
3476 as basis for their hash value. This is needed to assure that
3477 strings and Unicode objects behave in the same way as
3478 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479
Fredrik Lundhdde61642000-07-10 18:27:47 +00003480 register int len;
3481 register Py_UNICODE *p;
3482 register long x;
3483
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 if (self->hash != -1)
3485 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003486 len = PyUnicode_GET_SIZE(self);
3487 p = PyUnicode_AS_UNICODE(self);
3488 x = *p << 7;
3489 while (--len >= 0)
3490 x = (1000003*x) ^ *p++;
3491 x ^= PyUnicode_GET_SIZE(self);
3492 if (x == -1)
3493 x = -2;
3494 self->hash = x;
3495 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496}
3497
3498static char index__doc__[] =
3499"S.index(sub [,start [,end]]) -> int\n\
3500\n\
3501Like S.find() but raise ValueError when the substring is not found.";
3502
3503static PyObject *
3504unicode_index(PyUnicodeObject *self, PyObject *args)
3505{
3506 int result;
3507 PyUnicodeObject *substring;
3508 int start = 0;
3509 int end = INT_MAX;
3510
Guido van Rossumb8872e62000-05-09 14:14:27 +00003511 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3512 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 return NULL;
3514
3515 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3516 (PyObject *)substring);
3517 if (substring == NULL)
3518 return NULL;
3519
3520 result = findstring(self, substring, start, end, 1);
3521
3522 Py_DECREF(substring);
3523 if (result < 0) {
3524 PyErr_SetString(PyExc_ValueError, "substring not found");
3525 return NULL;
3526 }
3527 return PyInt_FromLong(result);
3528}
3529
3530static char islower__doc__[] =
3531"S.islower() -> int\n\
3532\n\
3533Return 1 if all cased characters in S are lowercase and there is\n\
3534at least one cased character in S, 0 otherwise.";
3535
3536static PyObject*
3537unicode_islower(PyUnicodeObject *self, PyObject *args)
3538{
3539 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3540 register const Py_UNICODE *e;
3541 int cased;
3542
3543 if (!PyArg_NoArgs(args))
3544 return NULL;
3545
3546 /* Shortcut for single character strings */
3547 if (PyUnicode_GET_SIZE(self) == 1)
3548 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3549
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003550 /* Special case for empty strings */
3551 if (PyString_GET_SIZE(self) == 0)
3552 return PyInt_FromLong(0);
3553
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 e = p + PyUnicode_GET_SIZE(self);
3555 cased = 0;
3556 for (; p < e; p++) {
3557 register const Py_UNICODE ch = *p;
3558
3559 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3560 return PyInt_FromLong(0);
3561 else if (!cased && Py_UNICODE_ISLOWER(ch))
3562 cased = 1;
3563 }
3564 return PyInt_FromLong(cased);
3565}
3566
3567static char isupper__doc__[] =
3568"S.isupper() -> int\n\
3569\n\
3570Return 1 if all cased characters in S are uppercase and there is\n\
3571at least one cased character in S, 0 otherwise.";
3572
3573static PyObject*
3574unicode_isupper(PyUnicodeObject *self, PyObject *args)
3575{
3576 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3577 register const Py_UNICODE *e;
3578 int cased;
3579
3580 if (!PyArg_NoArgs(args))
3581 return NULL;
3582
3583 /* Shortcut for single character strings */
3584 if (PyUnicode_GET_SIZE(self) == 1)
3585 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3586
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003587 /* Special case for empty strings */
3588 if (PyString_GET_SIZE(self) == 0)
3589 return PyInt_FromLong(0);
3590
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 e = p + PyUnicode_GET_SIZE(self);
3592 cased = 0;
3593 for (; p < e; p++) {
3594 register const Py_UNICODE ch = *p;
3595
3596 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3597 return PyInt_FromLong(0);
3598 else if (!cased && Py_UNICODE_ISUPPER(ch))
3599 cased = 1;
3600 }
3601 return PyInt_FromLong(cased);
3602}
3603
3604static char istitle__doc__[] =
3605"S.istitle() -> int\n\
3606\n\
3607Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3608may only follow uncased characters and lowercase characters only cased\n\
3609ones. Return 0 otherwise.";
3610
3611static PyObject*
3612unicode_istitle(PyUnicodeObject *self, PyObject *args)
3613{
3614 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3615 register const Py_UNICODE *e;
3616 int cased, previous_is_cased;
3617
3618 if (!PyArg_NoArgs(args))
3619 return NULL;
3620
3621 /* Shortcut for single character strings */
3622 if (PyUnicode_GET_SIZE(self) == 1)
3623 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3624 (Py_UNICODE_ISUPPER(*p) != 0));
3625
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003626 /* Special case for empty strings */
3627 if (PyString_GET_SIZE(self) == 0)
3628 return PyInt_FromLong(0);
3629
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 e = p + PyUnicode_GET_SIZE(self);
3631 cased = 0;
3632 previous_is_cased = 0;
3633 for (; p < e; p++) {
3634 register const Py_UNICODE ch = *p;
3635
3636 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3637 if (previous_is_cased)
3638 return PyInt_FromLong(0);
3639 previous_is_cased = 1;
3640 cased = 1;
3641 }
3642 else if (Py_UNICODE_ISLOWER(ch)) {
3643 if (!previous_is_cased)
3644 return PyInt_FromLong(0);
3645 previous_is_cased = 1;
3646 cased = 1;
3647 }
3648 else
3649 previous_is_cased = 0;
3650 }
3651 return PyInt_FromLong(cased);
3652}
3653
3654static char isspace__doc__[] =
3655"S.isspace() -> int\n\
3656\n\
3657Return 1 if there are only whitespace characters in S,\n\
36580 otherwise.";
3659
3660static PyObject*
3661unicode_isspace(PyUnicodeObject *self, PyObject *args)
3662{
3663 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3664 register const Py_UNICODE *e;
3665
3666 if (!PyArg_NoArgs(args))
3667 return NULL;
3668
3669 /* Shortcut for single character strings */
3670 if (PyUnicode_GET_SIZE(self) == 1 &&
3671 Py_UNICODE_ISSPACE(*p))
3672 return PyInt_FromLong(1);
3673
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003674 /* Special case for empty strings */
3675 if (PyString_GET_SIZE(self) == 0)
3676 return PyInt_FromLong(0);
3677
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 e = p + PyUnicode_GET_SIZE(self);
3679 for (; p < e; p++) {
3680 if (!Py_UNICODE_ISSPACE(*p))
3681 return PyInt_FromLong(0);
3682 }
3683 return PyInt_FromLong(1);
3684}
3685
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003686static char isalpha__doc__[] =
3687"S.isalpha() -> int\n\
3688\n\
3689Return 1 if all characters in S are alphabetic\n\
3690and there is at least one character in S, 0 otherwise.";
3691
3692static PyObject*
3693unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3694{
3695 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3696 register const Py_UNICODE *e;
3697
3698 if (!PyArg_NoArgs(args))
3699 return NULL;
3700
3701 /* Shortcut for single character strings */
3702 if (PyUnicode_GET_SIZE(self) == 1 &&
3703 Py_UNICODE_ISALPHA(*p))
3704 return PyInt_FromLong(1);
3705
3706 /* Special case for empty strings */
3707 if (PyString_GET_SIZE(self) == 0)
3708 return PyInt_FromLong(0);
3709
3710 e = p + PyUnicode_GET_SIZE(self);
3711 for (; p < e; p++) {
3712 if (!Py_UNICODE_ISALPHA(*p))
3713 return PyInt_FromLong(0);
3714 }
3715 return PyInt_FromLong(1);
3716}
3717
3718static char isalnum__doc__[] =
3719"S.isalnum() -> int\n\
3720\n\
3721Return 1 if all characters in S are alphanumeric\n\
3722and there is at least one character in S, 0 otherwise.";
3723
3724static PyObject*
3725unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3726{
3727 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3728 register const Py_UNICODE *e;
3729
3730 if (!PyArg_NoArgs(args))
3731 return NULL;
3732
3733 /* Shortcut for single character strings */
3734 if (PyUnicode_GET_SIZE(self) == 1 &&
3735 Py_UNICODE_ISALNUM(*p))
3736 return PyInt_FromLong(1);
3737
3738 /* Special case for empty strings */
3739 if (PyString_GET_SIZE(self) == 0)
3740 return PyInt_FromLong(0);
3741
3742 e = p + PyUnicode_GET_SIZE(self);
3743 for (; p < e; p++) {
3744 if (!Py_UNICODE_ISALNUM(*p))
3745 return PyInt_FromLong(0);
3746 }
3747 return PyInt_FromLong(1);
3748}
3749
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750static char isdecimal__doc__[] =
3751"S.isdecimal() -> int\n\
3752\n\
3753Return 1 if there are only decimal characters in S,\n\
37540 otherwise.";
3755
3756static PyObject*
3757unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3758{
3759 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3760 register const Py_UNICODE *e;
3761
3762 if (!PyArg_NoArgs(args))
3763 return NULL;
3764
3765 /* Shortcut for single character strings */
3766 if (PyUnicode_GET_SIZE(self) == 1 &&
3767 Py_UNICODE_ISDECIMAL(*p))
3768 return PyInt_FromLong(1);
3769
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003770 /* Special case for empty strings */
3771 if (PyString_GET_SIZE(self) == 0)
3772 return PyInt_FromLong(0);
3773
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 e = p + PyUnicode_GET_SIZE(self);
3775 for (; p < e; p++) {
3776 if (!Py_UNICODE_ISDECIMAL(*p))
3777 return PyInt_FromLong(0);
3778 }
3779 return PyInt_FromLong(1);
3780}
3781
3782static char isdigit__doc__[] =
3783"S.isdigit() -> int\n\
3784\n\
3785Return 1 if there are only digit characters in S,\n\
37860 otherwise.";
3787
3788static PyObject*
3789unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3790{
3791 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3792 register const Py_UNICODE *e;
3793
3794 if (!PyArg_NoArgs(args))
3795 return NULL;
3796
3797 /* Shortcut for single character strings */
3798 if (PyUnicode_GET_SIZE(self) == 1 &&
3799 Py_UNICODE_ISDIGIT(*p))
3800 return PyInt_FromLong(1);
3801
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003802 /* Special case for empty strings */
3803 if (PyString_GET_SIZE(self) == 0)
3804 return PyInt_FromLong(0);
3805
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 e = p + PyUnicode_GET_SIZE(self);
3807 for (; p < e; p++) {
3808 if (!Py_UNICODE_ISDIGIT(*p))
3809 return PyInt_FromLong(0);
3810 }
3811 return PyInt_FromLong(1);
3812}
3813
3814static char isnumeric__doc__[] =
3815"S.isnumeric() -> int\n\
3816\n\
3817Return 1 if there are only numeric characters in S,\n\
38180 otherwise.";
3819
3820static PyObject*
3821unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3822{
3823 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3824 register const Py_UNICODE *e;
3825
3826 if (!PyArg_NoArgs(args))
3827 return NULL;
3828
3829 /* Shortcut for single character strings */
3830 if (PyUnicode_GET_SIZE(self) == 1 &&
3831 Py_UNICODE_ISNUMERIC(*p))
3832 return PyInt_FromLong(1);
3833
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003834 /* Special case for empty strings */
3835 if (PyString_GET_SIZE(self) == 0)
3836 return PyInt_FromLong(0);
3837
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 e = p + PyUnicode_GET_SIZE(self);
3839 for (; p < e; p++) {
3840 if (!Py_UNICODE_ISNUMERIC(*p))
3841 return PyInt_FromLong(0);
3842 }
3843 return PyInt_FromLong(1);
3844}
3845
3846static char join__doc__[] =
3847"S.join(sequence) -> unicode\n\
3848\n\
3849Return a string which is the concatenation of the strings in the\n\
3850sequence. The separator between elements is S.";
3851
3852static PyObject*
3853unicode_join(PyUnicodeObject *self, PyObject *args)
3854{
3855 PyObject *data;
3856 if (!PyArg_ParseTuple(args, "O:join", &data))
3857 return NULL;
3858
3859 return PyUnicode_Join((PyObject *)self, data);
3860}
3861
3862static int
3863unicode_length(PyUnicodeObject *self)
3864{
3865 return self->length;
3866}
3867
3868static char ljust__doc__[] =
3869"S.ljust(width) -> unicode\n\
3870\n\
3871Return S left justified in a Unicode string of length width. Padding is\n\
3872done using spaces.";
3873
3874static PyObject *
3875unicode_ljust(PyUnicodeObject *self, PyObject *args)
3876{
3877 int width;
3878 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3879 return NULL;
3880
3881 if (self->length >= width) {
3882 Py_INCREF(self);
3883 return (PyObject*) self;
3884 }
3885
3886 return (PyObject*) pad(self, 0, width - self->length, ' ');
3887}
3888
3889static char lower__doc__[] =
3890"S.lower() -> unicode\n\
3891\n\
3892Return a copy of the string S converted to lowercase.";
3893
3894static PyObject*
3895unicode_lower(PyUnicodeObject *self, PyObject *args)
3896{
3897 if (!PyArg_NoArgs(args))
3898 return NULL;
3899 return fixup(self, fixlower);
3900}
3901
3902static char lstrip__doc__[] =
3903"S.lstrip() -> unicode\n\
3904\n\
3905Return a copy of the string S with leading whitespace removed.";
3906
3907static PyObject *
3908unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3909{
3910 if (!PyArg_NoArgs(args))
3911 return NULL;
3912 return strip(self, 1, 0);
3913}
3914
3915static PyObject*
3916unicode_repeat(PyUnicodeObject *str, int len)
3917{
3918 PyUnicodeObject *u;
3919 Py_UNICODE *p;
3920
3921 if (len < 0)
3922 len = 0;
3923
3924 if (len == 1) {
3925 /* no repeat, return original string */
3926 Py_INCREF(str);
3927 return (PyObject*) str;
3928 }
3929
3930 u = _PyUnicode_New(len * str->length);
3931 if (!u)
3932 return NULL;
3933
3934 p = u->str;
3935
3936 while (len-- > 0) {
3937 Py_UNICODE_COPY(p, str->str, str->length);
3938 p += str->length;
3939 }
3940
3941 return (PyObject*) u;
3942}
3943
3944PyObject *PyUnicode_Replace(PyObject *obj,
3945 PyObject *subobj,
3946 PyObject *replobj,
3947 int maxcount)
3948{
3949 PyObject *self;
3950 PyObject *str1;
3951 PyObject *str2;
3952 PyObject *result;
3953
3954 self = PyUnicode_FromObject(obj);
3955 if (self == NULL)
3956 return NULL;
3957 str1 = PyUnicode_FromObject(subobj);
3958 if (str1 == NULL) {
3959 Py_DECREF(self);
3960 return NULL;
3961 }
3962 str2 = PyUnicode_FromObject(replobj);
3963 if (str2 == NULL) {
3964 Py_DECREF(self);
3965 Py_DECREF(str1);
3966 return NULL;
3967 }
3968 result = replace((PyUnicodeObject *)self,
3969 (PyUnicodeObject *)str1,
3970 (PyUnicodeObject *)str2,
3971 maxcount);
3972 Py_DECREF(self);
3973 Py_DECREF(str1);
3974 Py_DECREF(str2);
3975 return result;
3976}
3977
3978static char replace__doc__[] =
3979"S.replace (old, new[, maxsplit]) -> unicode\n\
3980\n\
3981Return a copy of S with all occurrences of substring\n\
3982old replaced by new. If the optional argument maxsplit is\n\
3983given, only the first maxsplit occurrences are replaced.";
3984
3985static PyObject*
3986unicode_replace(PyUnicodeObject *self, PyObject *args)
3987{
3988 PyUnicodeObject *str1;
3989 PyUnicodeObject *str2;
3990 int maxcount = -1;
3991 PyObject *result;
3992
3993 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3994 return NULL;
3995 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3996 if (str1 == NULL)
3997 return NULL;
3998 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3999 if (str2 == NULL)
4000 return NULL;
4001
4002 result = replace(self, str1, str2, maxcount);
4003
4004 Py_DECREF(str1);
4005 Py_DECREF(str2);
4006 return result;
4007}
4008
4009static
4010PyObject *unicode_repr(PyObject *unicode)
4011{
4012 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4013 PyUnicode_GET_SIZE(unicode),
4014 1);
4015}
4016
4017static char rfind__doc__[] =
4018"S.rfind(sub [,start [,end]]) -> int\n\
4019\n\
4020Return the highest index in S where substring sub is found,\n\
4021such that sub is contained within s[start,end]. Optional\n\
4022arguments start and end are interpreted as in slice notation.\n\
4023\n\
4024Return -1 on failure.";
4025
4026static PyObject *
4027unicode_rfind(PyUnicodeObject *self, PyObject *args)
4028{
4029 PyUnicodeObject *substring;
4030 int start = 0;
4031 int end = INT_MAX;
4032 PyObject *result;
4033
Guido van Rossumb8872e62000-05-09 14:14:27 +00004034 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4035 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 return NULL;
4037 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4038 (PyObject *)substring);
4039 if (substring == NULL)
4040 return NULL;
4041
4042 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4043
4044 Py_DECREF(substring);
4045 return result;
4046}
4047
4048static char rindex__doc__[] =
4049"S.rindex(sub [,start [,end]]) -> int\n\
4050\n\
4051Like S.rfind() but raise ValueError when the substring is not found.";
4052
4053static PyObject *
4054unicode_rindex(PyUnicodeObject *self, PyObject *args)
4055{
4056 int result;
4057 PyUnicodeObject *substring;
4058 int start = 0;
4059 int end = INT_MAX;
4060
Guido van Rossumb8872e62000-05-09 14:14:27 +00004061 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4062 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 return NULL;
4064 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4065 (PyObject *)substring);
4066 if (substring == NULL)
4067 return NULL;
4068
4069 result = findstring(self, substring, start, end, -1);
4070
4071 Py_DECREF(substring);
4072 if (result < 0) {
4073 PyErr_SetString(PyExc_ValueError, "substring not found");
4074 return NULL;
4075 }
4076 return PyInt_FromLong(result);
4077}
4078
4079static char rjust__doc__[] =
4080"S.rjust(width) -> unicode\n\
4081\n\
4082Return S right justified in a Unicode string of length width. Padding is\n\
4083done using spaces.";
4084
4085static PyObject *
4086unicode_rjust(PyUnicodeObject *self, PyObject *args)
4087{
4088 int width;
4089 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4090 return NULL;
4091
4092 if (self->length >= width) {
4093 Py_INCREF(self);
4094 return (PyObject*) self;
4095 }
4096
4097 return (PyObject*) pad(self, width - self->length, 0, ' ');
4098}
4099
4100static char rstrip__doc__[] =
4101"S.rstrip() -> unicode\n\
4102\n\
4103Return a copy of the string S with trailing whitespace removed.";
4104
4105static PyObject *
4106unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4107{
4108 if (!PyArg_NoArgs(args))
4109 return NULL;
4110 return strip(self, 0, 1);
4111}
4112
4113static PyObject*
4114unicode_slice(PyUnicodeObject *self, int start, int end)
4115{
4116 /* standard clamping */
4117 if (start < 0)
4118 start = 0;
4119 if (end < 0)
4120 end = 0;
4121 if (end > self->length)
4122 end = self->length;
4123 if (start == 0 && end == self->length) {
4124 /* full slice, return original string */
4125 Py_INCREF(self);
4126 return (PyObject*) self;
4127 }
4128 if (start > end)
4129 start = end;
4130 /* copy slice */
4131 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4132 end - start);
4133}
4134
4135PyObject *PyUnicode_Split(PyObject *s,
4136 PyObject *sep,
4137 int maxsplit)
4138{
4139 PyObject *result;
4140
4141 s = PyUnicode_FromObject(s);
4142 if (s == NULL)
4143 return NULL;
4144 if (sep != NULL) {
4145 sep = PyUnicode_FromObject(sep);
4146 if (sep == NULL) {
4147 Py_DECREF(s);
4148 return NULL;
4149 }
4150 }
4151
4152 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4153
4154 Py_DECREF(s);
4155 Py_XDECREF(sep);
4156 return result;
4157}
4158
4159static char split__doc__[] =
4160"S.split([sep [,maxsplit]]) -> list of strings\n\
4161\n\
4162Return a list of the words in S, using sep as the\n\
4163delimiter string. If maxsplit is given, at most maxsplit\n\
4164splits are done. If sep is not specified, any whitespace string\n\
4165is a separator.";
4166
4167static PyObject*
4168unicode_split(PyUnicodeObject *self, PyObject *args)
4169{
4170 PyObject *substring = Py_None;
4171 int maxcount = -1;
4172
4173 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4174 return NULL;
4175
4176 if (substring == Py_None)
4177 return split(self, NULL, maxcount);
4178 else if (PyUnicode_Check(substring))
4179 return split(self, (PyUnicodeObject *)substring, maxcount);
4180 else
4181 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4182}
4183
4184static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004185"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186\n\
4187Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004188Line breaks are not included in the resulting list unless keepends\n\
4189is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190
4191static PyObject*
4192unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4193{
Guido van Rossum86662912000-04-11 15:38:46 +00004194 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195
Guido van Rossum86662912000-04-11 15:38:46 +00004196 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 return NULL;
4198
Guido van Rossum86662912000-04-11 15:38:46 +00004199 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200}
4201
4202static
4203PyObject *unicode_str(PyUnicodeObject *self)
4204{
Fred Drakee4315f52000-05-09 19:53:39 +00004205 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206}
4207
4208static char strip__doc__[] =
4209"S.strip() -> unicode\n\
4210\n\
4211Return a copy of S with leading and trailing whitespace removed.";
4212
4213static PyObject *
4214unicode_strip(PyUnicodeObject *self, PyObject *args)
4215{
4216 if (!PyArg_NoArgs(args))
4217 return NULL;
4218 return strip(self, 1, 1);
4219}
4220
4221static char swapcase__doc__[] =
4222"S.swapcase() -> unicode\n\
4223\n\
4224Return a copy of S with uppercase characters converted to lowercase\n\
4225and vice versa.";
4226
4227static PyObject*
4228unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4229{
4230 if (!PyArg_NoArgs(args))
4231 return NULL;
4232 return fixup(self, fixswapcase);
4233}
4234
4235static char translate__doc__[] =
4236"S.translate(table) -> unicode\n\
4237\n\
4238Return a copy of the string S, where all characters have been mapped\n\
4239through the given translation table, which must be a mapping of\n\
4240Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4241are left untouched. Characters mapped to None are deleted.";
4242
4243static PyObject*
4244unicode_translate(PyUnicodeObject *self, PyObject *args)
4245{
4246 PyObject *table;
4247
4248 if (!PyArg_ParseTuple(args, "O:translate", &table))
4249 return NULL;
4250 return PyUnicode_TranslateCharmap(self->str,
4251 self->length,
4252 table,
4253 "ignore");
4254}
4255
4256static char upper__doc__[] =
4257"S.upper() -> unicode\n\
4258\n\
4259Return a copy of S converted to uppercase.";
4260
4261static PyObject*
4262unicode_upper(PyUnicodeObject *self, PyObject *args)
4263{
4264 if (!PyArg_NoArgs(args))
4265 return NULL;
4266 return fixup(self, fixupper);
4267}
4268
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill(width) -- currently compiled out.

   If S is already at least width characters long, S itself is returned
   (with a new reference).  Otherwise S is left-padded with '0' via
   pad(); if the first original character was a sign ('+' or '-') it
   is moved in front of the padding.  Returns NULL if argument parsing
   fails or pad() returns NULL. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() result must be checked before it is dereferenced below */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4304
#if 0
/* Debugging aid (compiled out): reports the value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4314
4315static char startswith__doc__[] =
4316"S.startswith(prefix[, start[, end]]) -> int\n\
4317\n\
4318Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4319optional start, test S beginning at that position. With optional end, stop\n\
4320comparing S at that position.";
4321
4322static PyObject *
4323unicode_startswith(PyUnicodeObject *self,
4324 PyObject *args)
4325{
4326 PyUnicodeObject *substring;
4327 int start = 0;
4328 int end = INT_MAX;
4329 PyObject *result;
4330
Guido van Rossumb8872e62000-05-09 14:14:27 +00004331 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4332 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 return NULL;
4334 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4335 (PyObject *)substring);
4336 if (substring == NULL)
4337 return NULL;
4338
4339 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4340
4341 Py_DECREF(substring);
4342 return result;
4343}
4344
4345
4346static char endswith__doc__[] =
4347"S.endswith(suffix[, start[, end]]) -> int\n\
4348\n\
4349Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4350optional start, test S beginning at that position. With optional end, stop\n\
4351comparing S at that position.";
4352
4353static PyObject *
4354unicode_endswith(PyUnicodeObject *self,
4355 PyObject *args)
4356{
4357 PyUnicodeObject *substring;
4358 int start = 0;
4359 int end = INT_MAX;
4360 PyObject *result;
4361
Guido van Rossumb8872e62000-05-09 14:14:27 +00004362 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4363 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 return NULL;
4365 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4366 (PyObject *)substring);
4367 if (substring == NULL)
4368 return NULL;
4369
4370 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4371
4372 Py_DECREF(substring);
4373 return result;
4374}
4375
4376
/* Method table for Unicode objects, consulted by unicode_getattr()
   through Py_FindMethod().

   Each entry is {method name, C implementation, flag, doc string}.
   For the implementations visible in this part of the file, flag 1
   marks functions that parse an argument tuple with PyArg_ParseTuple()
   and flag 0 marks functions that take no arguments and check this
   with PyArg_NoArgs() (NOTE(review): capitalize, title and center are
   defined elsewhere -- confirm they follow the same convention). */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
    /* Disabled methods: zfill's implementation is compiled out as
       well; capwords has no implementation in the visible source. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    /* all-NULL entry closing the table (presumably the end marker
       Py_FindMethod() looks for) */
    {NULL, NULL}
};
4429
4430static PyObject *
4431unicode_getattr(PyUnicodeObject *self, char *name)
4432{
4433 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4434}
4435
/* Sequence protocol slots for Unicode objects.  Length, repeat, item
   and slice are served by the static helpers in this file, concat and
   contains by the public PyUnicode_* functions.  The two assignment
   slots are left 0: no item or slice assignment is provided. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4446
4447static int
4448unicode_buffer_getreadbuf(PyUnicodeObject *self,
4449 int index,
4450 const void **ptr)
4451{
4452 if (index != 0) {
4453 PyErr_SetString(PyExc_SystemError,
4454 "accessing non-existent unicode segment");
4455 return -1;
4456 }
4457 *ptr = (void *) self->str;
4458 return PyUnicode_GET_DATA_SIZE(self);
4459}
4460
4461static int
4462unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4463 const void **ptr)
4464{
4465 PyErr_SetString(PyExc_TypeError,
4466 "cannot use unicode as modifyable buffer");
4467 return -1;
4468}
4469
4470static int
4471unicode_buffer_getsegcount(PyUnicodeObject *self,
4472 int *lenp)
4473{
4474 if (lenp)
4475 *lenp = PyUnicode_GET_DATA_SIZE(self);
4476 return 1;
4477}
4478
4479static int
4480unicode_buffer_getcharbuf(PyUnicodeObject *self,
4481 int index,
4482 const void **ptr)
4483{
4484 PyObject *str;
4485
4486 if (index != 0) {
4487 PyErr_SetString(PyExc_SystemError,
4488 "accessing non-existent unicode segment");
4489 return -1;
4490 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004491 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 if (str == NULL)
4493 return -1;
4494 *ptr = (void *) PyString_AS_STRING(str);
4495 return PyString_GET_SIZE(str);
4496}
4497
4498/* Helpers for PyUnicode_Format() */
4499
4500static PyObject *
4501getnextarg(args, arglen, p_argidx)
4502 PyObject *args;
4503int arglen;
4504int *p_argidx;
4505{
4506 int argidx = *p_argidx;
4507 if (argidx < arglen) {
4508 (*p_argidx)++;
4509 if (arglen < 0)
4510 return args;
4511 else
4512 return PyTuple_GetItem(args, argidx);
4513 }
4514 PyErr_SetString(PyExc_TypeError,
4515 "not enough arguments for format string");
4516 return NULL;
4517}
4518
/* Flag bits collected by PyUnicode_Format() while parsing a format
   specifier. */
#define F_LJUST (1<<0)  /* '-' flag or negative '*' width: pad on the right */
#define F_SIGN  (1<<1)  /* '+' flag: emit '+' for non-negative signed values */
#define F_BLANK (1<<2)  /* ' ' flag: emit ' ' for non-negative signed values */
#define F_ALT   (1<<3)  /* '#' flag: forwarded to the C format as '#' */
#define F_ZERO  (1<<4)  /* '0' flag: numeric conversions pad with '0' */
4524
4525static
4526#ifdef HAVE_STDARG_PROTOTYPES
4527int usprintf(register Py_UNICODE *buffer, char *format, ...)
4528#else
4529int usprintf(va_alist) va_dcl
4530#endif
4531{
4532 register int i;
4533 int len;
4534 va_list va;
4535 char *charbuffer;
4536#ifdef HAVE_STDARG_PROTOTYPES
4537 va_start(va, format);
4538#else
4539 Py_UNICODE *args;
4540 char *format;
4541
4542 va_start(va);
4543 buffer = va_arg(va, Py_UNICODE *);
4544 format = va_arg(va, char *);
4545#endif
4546
4547 /* First, format the string as char array, then expand to Py_UNICODE
4548 array. */
4549 charbuffer = (char *)buffer;
4550 len = vsprintf(charbuffer, format, va);
4551 for (i = len - 1; i >= 0; i--)
4552 buffer[i] = (Py_UNICODE) charbuffer[i];
4553
4554 va_end(va);
4555 return len;
4556}
4557
4558static int
4559formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004560 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 int flags,
4562 int prec,
4563 int type,
4564 PyObject *v)
4565{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004566 /* fmt = '%#.' + `prec` + `type`
4567 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568 char fmt[20];
4569 double x;
4570
4571 x = PyFloat_AsDouble(v);
4572 if (x == -1.0 && PyErr_Occurred())
4573 return -1;
4574 if (prec < 0)
4575 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4577 type = 'g';
4578 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004579 /* worst case length calc to ensure no buffer overrun:
4580 fmt = %#.<prec>g
4581 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4582 for any double rep.)
4583 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4584 If prec=0 the effective precision is 1 (the leading digit is
4585 always given), therefore increase by one to 10+prec. */
4586 if (buflen <= (size_t)10 + (size_t)prec) {
4587 PyErr_SetString(PyExc_OverflowError,
4588 "formatted float is too long (precision too long?)");
4589 return -1;
4590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 return usprintf(buf, fmt, x);
4592}
4593
4594static int
4595formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004596 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 int flags,
4598 int prec,
4599 int type,
4600 PyObject *v)
4601{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004602 /* fmt = '%#.' + `prec` + 'l' + `type`
4603 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 char fmt[20];
4605 long x;
4606
4607 x = PyInt_AsLong(v);
4608 if (x == -1 && PyErr_Occurred())
4609 return -1;
4610 if (prec < 0)
4611 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004612 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4613 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4614 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4615 PyErr_SetString(PyExc_OverflowError,
4616 "formatted integer is too long (precision too long?)");
4617 return -1;
4618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4620 return usprintf(buf, fmt, x);
4621}
4622
4623static int
4624formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004625 size_t buflen,
4626 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004628 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004629 if (PyUnicode_Check(v)) {
4630 if (PyUnicode_GET_SIZE(v) != 1)
4631 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004635 else if (PyString_Check(v)) {
4636 if (PyString_GET_SIZE(v) != 1)
4637 goto onError;
4638 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640
4641 else {
4642 /* Integer input truncated to a character */
4643 long x;
4644 x = PyInt_AsLong(v);
4645 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004646 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 buf[0] = (char) x;
4648 }
4649 buf[1] = '\0';
4650 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004651
4652 onError:
4653 PyErr_SetString(PyExc_TypeError,
4654 "%c requires int or char");
4655 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656}
4657
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer in which floats, ints and chars are formatted.
   XXX This is a magic number.  Each formatting routine does its own
   bounds checking against the length it is given, but a better solution
   may be to malloc a buffer of appropriate size for each format.  For
   now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
4667
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668PyObject *PyUnicode_Format(PyObject *format,
4669 PyObject *args)
4670{
4671 Py_UNICODE *fmt, *res;
4672 int fmtcnt, rescnt, reslen, arglen, argidx;
4673 int args_owned = 0;
4674 PyUnicodeObject *result = NULL;
4675 PyObject *dict = NULL;
4676 PyObject *uformat;
4677
4678 if (format == NULL || args == NULL) {
4679 PyErr_BadInternalCall();
4680 return NULL;
4681 }
4682 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004683 if (uformat == NULL)
4684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 fmt = PyUnicode_AS_UNICODE(uformat);
4686 fmtcnt = PyUnicode_GET_SIZE(uformat);
4687
4688 reslen = rescnt = fmtcnt + 100;
4689 result = _PyUnicode_New(reslen);
4690 if (result == NULL)
4691 goto onError;
4692 res = PyUnicode_AS_UNICODE(result);
4693
4694 if (PyTuple_Check(args)) {
4695 arglen = PyTuple_Size(args);
4696 argidx = 0;
4697 }
4698 else {
4699 arglen = -1;
4700 argidx = -2;
4701 }
4702 if (args->ob_type->tp_as_mapping)
4703 dict = args;
4704
4705 while (--fmtcnt >= 0) {
4706 if (*fmt != '%') {
4707 if (--rescnt < 0) {
4708 rescnt = fmtcnt + 100;
4709 reslen += rescnt;
4710 if (_PyUnicode_Resize(result, reslen) < 0)
4711 return NULL;
4712 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4713 --rescnt;
4714 }
4715 *res++ = *fmt++;
4716 }
4717 else {
4718 /* Got a format specifier */
4719 int flags = 0;
4720 int width = -1;
4721 int prec = -1;
4722 int size = 0;
4723 Py_UNICODE c = '\0';
4724 Py_UNICODE fill;
4725 PyObject *v = NULL;
4726 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004727 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 Py_UNICODE sign;
4729 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004730 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731
4732 fmt++;
4733 if (*fmt == '(') {
4734 Py_UNICODE *keystart;
4735 int keylen;
4736 PyObject *key;
4737 int pcount = 1;
4738
4739 if (dict == NULL) {
4740 PyErr_SetString(PyExc_TypeError,
4741 "format requires a mapping");
4742 goto onError;
4743 }
4744 ++fmt;
4745 --fmtcnt;
4746 keystart = fmt;
4747 /* Skip over balanced parentheses */
4748 while (pcount > 0 && --fmtcnt >= 0) {
4749 if (*fmt == ')')
4750 --pcount;
4751 else if (*fmt == '(')
4752 ++pcount;
4753 fmt++;
4754 }
4755 keylen = fmt - keystart - 1;
4756 if (fmtcnt < 0 || pcount > 0) {
4757 PyErr_SetString(PyExc_ValueError,
4758 "incomplete format key");
4759 goto onError;
4760 }
Fred Drakee4315f52000-05-09 19:53:39 +00004761 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 then looked up since Python uses strings to hold
4763 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004764 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 key = PyUnicode_EncodeUTF8(keystart,
4766 keylen,
4767 NULL);
4768 if (key == NULL)
4769 goto onError;
4770 if (args_owned) {
4771 Py_DECREF(args);
4772 args_owned = 0;
4773 }
4774 args = PyObject_GetItem(dict, key);
4775 Py_DECREF(key);
4776 if (args == NULL) {
4777 goto onError;
4778 }
4779 args_owned = 1;
4780 arglen = -1;
4781 argidx = -2;
4782 }
4783 while (--fmtcnt >= 0) {
4784 switch (c = *fmt++) {
4785 case '-': flags |= F_LJUST; continue;
4786 case '+': flags |= F_SIGN; continue;
4787 case ' ': flags |= F_BLANK; continue;
4788 case '#': flags |= F_ALT; continue;
4789 case '0': flags |= F_ZERO; continue;
4790 }
4791 break;
4792 }
4793 if (c == '*') {
4794 v = getnextarg(args, arglen, &argidx);
4795 if (v == NULL)
4796 goto onError;
4797 if (!PyInt_Check(v)) {
4798 PyErr_SetString(PyExc_TypeError,
4799 "* wants int");
4800 goto onError;
4801 }
4802 width = PyInt_AsLong(v);
4803 if (width < 0) {
4804 flags |= F_LJUST;
4805 width = -width;
4806 }
4807 if (--fmtcnt >= 0)
4808 c = *fmt++;
4809 }
4810 else if (c >= '0' && c <= '9') {
4811 width = c - '0';
4812 while (--fmtcnt >= 0) {
4813 c = *fmt++;
4814 if (c < '0' || c > '9')
4815 break;
4816 if ((width*10) / 10 != width) {
4817 PyErr_SetString(PyExc_ValueError,
4818 "width too big");
4819 goto onError;
4820 }
4821 width = width*10 + (c - '0');
4822 }
4823 }
4824 if (c == '.') {
4825 prec = 0;
4826 if (--fmtcnt >= 0)
4827 c = *fmt++;
4828 if (c == '*') {
4829 v = getnextarg(args, arglen, &argidx);
4830 if (v == NULL)
4831 goto onError;
4832 if (!PyInt_Check(v)) {
4833 PyErr_SetString(PyExc_TypeError,
4834 "* wants int");
4835 goto onError;
4836 }
4837 prec = PyInt_AsLong(v);
4838 if (prec < 0)
4839 prec = 0;
4840 if (--fmtcnt >= 0)
4841 c = *fmt++;
4842 }
4843 else if (c >= '0' && c <= '9') {
4844 prec = c - '0';
4845 while (--fmtcnt >= 0) {
4846 c = Py_CHARMASK(*fmt++);
4847 if (c < '0' || c > '9')
4848 break;
4849 if ((prec*10) / 10 != prec) {
4850 PyErr_SetString(PyExc_ValueError,
4851 "prec too big");
4852 goto onError;
4853 }
4854 prec = prec*10 + (c - '0');
4855 }
4856 }
4857 } /* prec */
4858 if (fmtcnt >= 0) {
4859 if (c == 'h' || c == 'l' || c == 'L') {
4860 size = c;
4861 if (--fmtcnt >= 0)
4862 c = *fmt++;
4863 }
4864 }
4865 if (fmtcnt < 0) {
4866 PyErr_SetString(PyExc_ValueError,
4867 "incomplete format");
4868 goto onError;
4869 }
4870 if (c != '%') {
4871 v = getnextarg(args, arglen, &argidx);
4872 if (v == NULL)
4873 goto onError;
4874 }
4875 sign = 0;
4876 fill = ' ';
4877 switch (c) {
4878
4879 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004880 pbuf = formatbuf;
4881 /* presume that buffer length is at least 1 */
4882 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 len = 1;
4884 break;
4885
4886 case 's':
4887 case 'r':
4888 if (PyUnicode_Check(v) && c == 's') {
4889 temp = v;
4890 Py_INCREF(temp);
4891 }
4892 else {
4893 PyObject *unicode;
4894 if (c == 's')
4895 temp = PyObject_Str(v);
4896 else
4897 temp = PyObject_Repr(v);
4898 if (temp == NULL)
4899 goto onError;
4900 if (!PyString_Check(temp)) {
4901 /* XXX Note: this should never happen, since
4902 PyObject_Repr() and PyObject_Str() assure
4903 this */
4904 Py_DECREF(temp);
4905 PyErr_SetString(PyExc_TypeError,
4906 "%s argument has non-string str()");
4907 goto onError;
4908 }
Fred Drakee4315f52000-05-09 19:53:39 +00004909 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004911 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 "strict");
4913 Py_DECREF(temp);
4914 temp = unicode;
4915 if (temp == NULL)
4916 goto onError;
4917 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004918 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 len = PyUnicode_GET_SIZE(temp);
4920 if (prec >= 0 && len > prec)
4921 len = prec;
4922 break;
4923
4924 case 'i':
4925 case 'd':
4926 case 'u':
4927 case 'o':
4928 case 'x':
4929 case 'X':
4930 if (c == 'i')
4931 c = 'd';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004932 pbuf = formatbuf;
4933 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4934 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 if (len < 0)
4936 goto onError;
4937 sign = (c == 'd');
4938 if (flags & F_ZERO) {
4939 fill = '0';
4940 if ((flags&F_ALT) &&
4941 (c == 'x' || c == 'X') &&
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004942 pbuf[0] == '0' && pbuf[1] == c) {
4943 *res++ = *pbuf++;
4944 *res++ = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 rescnt -= 2;
4946 len -= 2;
4947 width -= 2;
4948 if (width < 0)
4949 width = 0;
4950 }
4951 }
4952 break;
4953
4954 case 'e':
4955 case 'E':
4956 case 'f':
4957 case 'g':
4958 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004959 pbuf = formatbuf;
4960 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4961 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 if (len < 0)
4963 goto onError;
4964 sign = 1;
4965 if (flags&F_ZERO)
4966 fill = '0';
4967 break;
4968
4969 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004970 pbuf = formatbuf;
4971 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 if (len < 0)
4973 goto onError;
4974 break;
4975
4976 default:
4977 PyErr_Format(PyExc_ValueError,
4978 "unsupported format character '%c' (0x%x)",
4979 c, c);
4980 goto onError;
4981 }
4982 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004983 if (*pbuf == '-' || *pbuf == '+') {
4984 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 len--;
4986 }
4987 else if (flags & F_SIGN)
4988 sign = '+';
4989 else if (flags & F_BLANK)
4990 sign = ' ';
4991 else
4992 sign = 0;
4993 }
4994 if (width < len)
4995 width = len;
4996 if (rescnt < width + (sign != 0)) {
4997 reslen -= rescnt;
4998 rescnt = width + fmtcnt + 100;
4999 reslen += rescnt;
5000 if (_PyUnicode_Resize(result, reslen) < 0)
5001 return NULL;
5002 res = PyUnicode_AS_UNICODE(result)
5003 + reslen - rescnt;
5004 }
5005 if (sign) {
5006 if (fill != ' ')
5007 *res++ = sign;
5008 rescnt--;
5009 if (width > len)
5010 width--;
5011 }
5012 if (width > len && !(flags & F_LJUST)) {
5013 do {
5014 --rescnt;
5015 *res++ = fill;
5016 } while (--width > len);
5017 }
5018 if (sign && fill == ' ')
5019 *res++ = sign;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005020 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 res += len;
5022 rescnt -= len;
5023 while (--width >= len) {
5024 --rescnt;
5025 *res++ = ' ';
5026 }
5027 if (dict && (argidx < arglen) && c != '%') {
5028 PyErr_SetString(PyExc_TypeError,
5029 "not all arguments converted");
5030 goto onError;
5031 }
5032 Py_XDECREF(temp);
5033 } /* '%' */
5034 } /* until end */
5035 if (argidx < arglen && !dict) {
5036 PyErr_SetString(PyExc_TypeError,
5037 "not all arguments converted");
5038 goto onError;
5039 }
5040
5041 if (args_owned) {
5042 Py_DECREF(args);
5043 }
5044 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005045 if (_PyUnicode_Resize(result, reslen - rescnt))
5046 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 return (PyObject *)result;
5048
5049 onError:
5050 Py_XDECREF(result);
5051 Py_DECREF(uformat);
5052 if (args_owned) {
5053 Py_DECREF(args);
5054 }
5055 return NULL;
5056}
5057
/* Buffer interface slots for the unicode type, filled with the four
   unicode_buffer_* functions defined earlier in this file. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5064
/* Type object for unicode strings.  The type implements the sequence and
   buffer protocols; the number and mapping protocols are not provided. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
5089
5090/* Initialize the Unicode implementation */
5091
5092void _PyUnicode_Init()
5093{
5094 /* Doublecheck the configuration... */
5095 if (sizeof(Py_UNICODE) != 2)
5096 Py_FatalError("Unicode configuration error: "
5097 "sizeof(Py_UNICODE) != 2 bytes");
5098
Fred Drakee4315f52000-05-09 19:53:39 +00005099 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005100 unicode_freelist = NULL;
5101 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005103 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104}
5105
5106/* Finalize the Unicode implementation */
5107
5108void
5109_PyUnicode_Fini()
5110{
5111 PyUnicodeObject *u = unicode_freelist;
5112
5113 while (u != NULL) {
5114 PyUnicodeObject *v = u;
5115 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005116 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005117 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005118 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005119 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005121 unicode_freelist = NULL;
5122 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005124 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125}