blob: a50b9256c91b411eac12b5f88f44420d89a697b3 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Maximum number of Unicode objects that are parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   An object placed on the free list keeps its character buffer as long
   as its length is below this limit, which saves a malloc() when the
   object is recycled for another small string.

   Worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage lying around.

   A limit of 0 effectively switches the optimization off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
/* Convenience wrapper for use inside unicodeobject.c only: lets callers
   pass the address of a PyUnicodeObject* variable to PyUnicode_Resize()
   without writing the cast themselves. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 const char *s;
402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
419 }
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
425 }
426 if (PyUnicode_Check(obj)) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000427 if (encoding) {
Tim Peters78e0fc72001-09-11 03:07:38 +0000428 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000429 "decoding Unicode is not supported");
Tim Peters78e0fc72001-09-11 03:07:38 +0000430 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000431 }
Tim Peters78e0fc72001-09-11 03:07:38 +0000432 if (PyUnicode_CheckExact(obj)) {
433 Py_INCREF(obj);
434 v = obj;
435 }
436 else {
437 /* For a subclass of unicode, return a true unicode object
438 with the same string value. */
439 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
440 PyUnicode_GET_SIZE(obj));
441 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000442 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443 }
444 else if (PyString_Check(obj)) {
445 s = PyString_AS_STRING(obj);
446 len = PyString_GET_SIZE(obj);
447 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000448 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
449 /* Overwrite the error message with something more useful in
450 case of a TypeError. */
451 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000452 PyErr_Format(PyExc_TypeError,
453 "coercing to Unicode: need string or buffer, "
454 "%.80s found",
455 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000457 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000458
459 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 if (len == 0) {
461 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000464 else
465 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000466
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000468 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 return v;
472
473 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478}
479
480PyObject *PyUnicode_Decode(const char *s,
481 int size,
482 const char *encoding,
483 const char *errors)
484{
485 PyObject *buffer = NULL, *unicode;
486
Fred Drakee4315f52000-05-09 19:53:39 +0000487 if (encoding == NULL)
488 encoding = PyUnicode_GetDefaultEncoding();
489
490 /* Shortcuts for common default encodings */
491 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000492 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000493 else if (strcmp(encoding, "latin-1") == 0)
494 return PyUnicode_DecodeLatin1(s, size, errors);
495 else if (strcmp(encoding, "ascii") == 0)
496 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497
498 /* Decode via the codec registry */
499 buffer = PyBuffer_FromMemory((void *)s, size);
500 if (buffer == NULL)
501 goto onError;
502 unicode = PyCodec_Decode(buffer, encoding, errors);
503 if (unicode == NULL)
504 goto onError;
505 if (!PyUnicode_Check(unicode)) {
506 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000507 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode->ob_type->tp_name);
509 Py_DECREF(unicode);
510 goto onError;
511 }
512 Py_DECREF(buffer);
513 return unicode;
514
515 onError:
516 Py_XDECREF(buffer);
517 return NULL;
518}
519
520PyObject *PyUnicode_Encode(const Py_UNICODE *s,
521 int size,
522 const char *encoding,
523 const char *errors)
524{
525 PyObject *v, *unicode;
526
527 unicode = PyUnicode_FromUnicode(s, size);
528 if (unicode == NULL)
529 return NULL;
530 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
531 Py_DECREF(unicode);
532 return v;
533}
534
535PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
536 const char *encoding,
537 const char *errors)
538{
539 PyObject *v;
540
541 if (!PyUnicode_Check(unicode)) {
542 PyErr_BadArgument();
543 goto onError;
544 }
Fred Drakee4315f52000-05-09 19:53:39 +0000545
546 if (encoding == NULL)
547 encoding = PyUnicode_GetDefaultEncoding();
548
549 /* Shortcuts for common default encodings */
550 if (errors == NULL) {
551 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000552 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000553 else if (strcmp(encoding, "latin-1") == 0)
554 return PyUnicode_AsLatin1String(unicode);
555 else if (strcmp(encoding, "ascii") == 0)
556 return PyUnicode_AsASCIIString(unicode);
557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558
559 /* Encode via the codec registry */
560 v = PyCodec_Encode(unicode, encoding, errors);
561 if (v == NULL)
562 goto onError;
563 /* XXX Should we really enforce this ? */
564 if (!PyString_Check(v)) {
565 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000566 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 v->ob_type->tp_name);
568 Py_DECREF(v);
569 goto onError;
570 }
571 return v;
572
573 onError:
574 return NULL;
575}
576
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000577PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
578 const char *errors)
579{
580 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
581
582 if (v)
583 return v;
584 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
585 if (v && errors == NULL)
586 ((PyUnicodeObject *)unicode)->defenc = v;
587 return v;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
591{
592 if (!PyUnicode_Check(unicode)) {
593 PyErr_BadArgument();
594 goto onError;
595 }
596 return PyUnicode_AS_UNICODE(unicode);
597
598 onError:
599 return NULL;
600}
601
602int PyUnicode_GetSize(PyObject *unicode)
603{
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_BadArgument();
606 goto onError;
607 }
608 return PyUnicode_GET_SIZE(unicode);
609
610 onError:
611 return -1;
612}
613
Thomas Wouters78890102000-07-22 19:25:51 +0000614const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000615{
616 return unicode_default_encoding;
617}
618
619int PyUnicode_SetDefaultEncoding(const char *encoding)
620{
621 PyObject *v;
622
623 /* Make sure the encoding is valid. As side effect, this also
624 loads the encoding into the codec registry cache. */
625 v = _PyCodec_Lookup(encoding);
626 if (v == NULL)
627 goto onError;
628 Py_DECREF(v);
629 strncpy(unicode_default_encoding,
630 encoding,
631 sizeof(unicode_default_encoding));
632 return 0;
633
634 onError:
635 return -1;
636}
637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638/* --- UTF-8 Codec -------------------------------------------------------- */
639
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero marks a byte that cannot start a sequence.  See RFC 2279 for
   details.  The table is read-only. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
661
662static
663int utf8_decoding_error(const char **source,
664 Py_UNICODE **dest,
665 const char *errors,
666 const char *details)
667{
668 if ((errors == NULL) ||
669 (strcmp(errors,"strict") == 0)) {
670 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000671 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 details);
673 return -1;
674 }
675 else if (strcmp(errors,"ignore") == 0) {
676 (*source)++;
677 return 0;
678 }
679 else if (strcmp(errors,"replace") == 0) {
680 (*source)++;
681 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
682 (*dest)++;
683 return 0;
684 }
685 else {
686 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000687 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 errors);
689 return -1;
690 }
691}
692
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693PyObject *PyUnicode_DecodeUTF8(const char *s,
694 int size,
695 const char *errors)
696{
697 int n;
698 const char *e;
699 PyUnicodeObject *unicode;
700 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000701 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000702
703 /* Note: size will always be longer than the resulting Unicode
704 character count */
705 unicode = _PyUnicode_New(size);
706 if (!unicode)
707 return NULL;
708 if (size == 0)
709 return (PyObject *)unicode;
710
711 /* Unpack UTF-8 encoded data */
712 p = unicode->str;
713 e = s + size;
714
715 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000716 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717
718 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000719 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720 s++;
721 continue;
722 }
723
724 n = utf8_code_length[ch];
725
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000726 if (s + n > e) {
727 errmsg = "unexpected end of data";
728 goto utf8Error;
729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
731 switch (n) {
732
733 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000734 errmsg = "unexpected code byte";
735 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000736
737 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000738 errmsg = "internal error";
739 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740
741 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000742 if ((s[1] & 0xc0) != 0x80) {
743 errmsg = "invalid data";
744 goto utf8Error;
745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000746 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000747 if (ch < 0x80) {
748 errmsg = "illegal encoding";
749 goto utf8Error;
750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000751 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000752 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753 break;
754
755 case 3:
756 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000757 (s[2] & 0xc0) != 0x80) {
758 errmsg = "invalid data";
759 goto utf8Error;
760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000762 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
763 errmsg = "illegal encoding";
764 goto utf8Error;
765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000766 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000767 *p++ = (Py_UNICODE)ch;
768 break;
769
770 case 4:
771 if ((s[1] & 0xc0) != 0x80 ||
772 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000773 (s[3] & 0xc0) != 0x80) {
774 errmsg = "invalid data";
775 goto utf8Error;
776 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000777 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
778 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
779 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000780 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000781 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000782 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000783 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000784 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000785 errmsg = "illegal encoding";
786 goto utf8Error;
787 }
Fredrik Lundh8f455852001-06-27 18:59:43 +0000788#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000789 *p++ = (Py_UNICODE)ch;
790#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000791 /* compute and append the two surrogates: */
792
793 /* translate from 10000..10FFFF to 0..FFFF */
794 ch -= 0x10000;
795
796 /* high surrogate = top 10 bits added to D800 */
797 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
798
799 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000800 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000801#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802 break;
803
804 default:
805 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000806 errmsg = "unsupported Unicode code range";
807 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 }
809 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000810 continue;
811
812 utf8Error:
813 if (utf8_decoding_error(&s, &p, errors, errmsg))
814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
817 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000818 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819 goto onError;
820
821 return (PyObject *)unicode;
822
823onError:
824 Py_DECREF(unicode);
825 return NULL;
826}
827
/* Disabled: no longer needed now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling scheme `errors` to a UTF-8 encoding
   problem described by `details`: NULL/"strict" raises UnicodeError,
   "ignore" does nothing, "replace" emits '?' to the output, anything
   else raises ValueError.  Returns 0 to continue, -1 on error. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000861
862PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
863 int size,
864 const char *errors)
865{
866 PyObject *v;
867 char *p;
868 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000869 Py_UCS4 ch2;
870 unsigned int cbAllocated = 3 * size;
871 unsigned int cbWritten = 0;
872 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000874 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875 if (v == NULL)
876 return NULL;
877 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000878 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879
880 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000881 while (i < size) {
882 Py_UCS4 ch = s[i++];
883 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000885 cbWritten++;
886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000887 else if (ch < 0x0800) {
888 *p++ = 0xc0 | (ch >> 6);
889 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000890 cbWritten += 2;
891 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000892 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000893 /* Check for high surrogate */
894 if (0xD800 <= ch && ch <= 0xDBFF) {
895 if (i != size) {
896 ch2 = s[i];
897 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
898
899 if (cbWritten >= (cbAllocated - 4)) {
900 /* Provide enough room for some more
901 surrogates */
902 cbAllocated += 4*10;
903 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000904 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000905 }
906
907 /* combine the two values */
908 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
909
910 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000911 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000912 i++;
913 cbWritten += 4;
914 }
915 }
916 }
917 else {
918 *p++ = (char)(0xe0 | (ch >> 12));
919 cbWritten += 3;
920 }
921 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
922 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000923 } else {
924 *p++ = 0xf0 | (ch>>18);
925 *p++ = 0x80 | ((ch>>12) & 0x3f);
926 *p++ = 0x80 | ((ch>>6) & 0x3f);
927 *p++ = 0x80 | (ch & 0x3f);
928 cbWritten += 4;
929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930 }
931 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000932 if (_PyString_Resize(&v, p - q))
933 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 return v;
935
936 onError:
937 Py_DECREF(v);
938 return NULL;
939}
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
942{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000943 if (!PyUnicode_Check(unicode)) {
944 PyErr_BadArgument();
945 return NULL;
946 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000947 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
948 PyUnicode_GET_SIZE(unicode),
949 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950}
951
952/* --- UTF-16 Codec ------------------------------------------------------- */
953
954static
Tim Peters772747b2001-08-09 22:21:55 +0000955int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000956 const char *errors,
957 const char *details)
958{
959 if ((errors == NULL) ||
960 (strcmp(errors,"strict") == 0)) {
961 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000962 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 details);
964 return -1;
965 }
966 else if (strcmp(errors,"ignore") == 0) {
967 return 0;
968 }
969 else if (strcmp(errors,"replace") == 0) {
970 if (dest) {
971 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
972 (*dest)++;
973 }
974 return 0;
975 }
976 else {
977 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000978 "UTF-16 decoding error; "
979 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980 errors);
981 return -1;
982 }
983}
984
Tim Peters772747b2001-08-09 22:21:55 +0000985PyObject *
986PyUnicode_DecodeUTF16(const char *s,
987 int size,
988 const char *errors,
989 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990{
991 PyUnicodeObject *unicode;
992 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +0000993 const unsigned char *q, *e;
994 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000995 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +0000996 /* Offsets from q for retrieving byte pairs in the right order. */
997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
998 int ihi = 1, ilo = 0;
999#else
1000 int ihi = 0, ilo = 1;
1001#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002
1003 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001004 if (size & 1) {
1005 if (utf16_decoding_error(NULL, errors, "truncated data"))
1006 return NULL;
1007 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009
1010 /* Note: size will always be longer than the resulting Unicode
1011 character count */
1012 unicode = _PyUnicode_New(size);
1013 if (!unicode)
1014 return NULL;
1015 if (size == 0)
1016 return (PyObject *)unicode;
1017
1018 /* Unpack UTF-16 encoded data */
1019 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001020 q = (unsigned char *)s;
1021 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022
1023 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001024 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001026 /* Check for BOM marks (U+FEFF) in the input and adjust current
1027 byte order setting accordingly. In native mode, the leading BOM
1028 mark is skipped, in all other modes, it is copied to the output
1029 stream as-is (giving a ZWNBSP character). */
1030 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001031 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001032#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001033 if (bom == 0xFEFF) {
1034 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001035 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001036 }
1037 else if (bom == 0xFFFE) {
1038 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001039 bo = 1;
1040 }
1041#else
Tim Peters772747b2001-08-09 22:21:55 +00001042 if (bom == 0xFEFF) {
1043 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001044 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001045 }
1046 else if (bom == 0xFFFE) {
1047 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001048 bo = -1;
1049 }
1050#endif
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052
Tim Peters772747b2001-08-09 22:21:55 +00001053 if (bo == -1) {
1054 /* force LE */
1055 ihi = 1;
1056 ilo = 0;
1057 }
1058 else if (bo == 1) {
1059 /* force BE */
1060 ihi = 0;
1061 ilo = 1;
1062 }
1063
1064 while (q < e) {
1065 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1066 q += 2;
1067
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 if (ch < 0xD800 || ch > 0xDFFF) {
1069 *p++ = ch;
1070 continue;
1071 }
1072
1073 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001074 if (q >= e) {
1075 errmsg = "unexpected end of data";
1076 goto utf16Error;
1077 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001078 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001079 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1080 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001081 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001082#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001083 *p++ = ch;
1084 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001085#else
1086 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001087#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001088 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001089 }
1090 else {
1091 errmsg = "illegal UTF-16 surrogate";
1092 goto utf16Error;
1093 }
1094
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001096 errmsg = "illegal encoding";
1097 /* Fall through to report the error */
1098
1099 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001100 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001101 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 }
1103
1104 if (byteorder)
1105 *byteorder = bo;
1106
1107 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001108 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 goto onError;
1110
1111 return (PyObject *)unicode;
1112
1113onError:
1114 Py_DECREF(unicode);
1115 return NULL;
1116}
1117
Tim Peters772747b2001-08-09 22:21:55 +00001118PyObject *
1119PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1120 int size,
1121 const char *errors,
1122 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123{
1124 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001125 unsigned char *p;
1126 int i, pairs;
1127 /* Offsets from p for storing byte pairs in the right order. */
1128#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1129 int ihi = 1, ilo = 0;
1130#else
1131 int ihi = 0, ilo = 1;
1132#endif
1133
1134#define STORECHAR(CH) \
1135 do { \
1136 p[ihi] = ((CH) >> 8) & 0xff; \
1137 p[ilo] = (CH) & 0xff; \
1138 p += 2; \
1139 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001141 for (i = pairs = 0; i < size; i++)
1142 if (s[i] >= 0x10000)
1143 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001145 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146 if (v == NULL)
1147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
Tim Peters772747b2001-08-09 22:21:55 +00001149 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001151 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001152 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001153 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001154
1155 if (byteorder == -1) {
1156 /* force LE */
1157 ihi = 1;
1158 ilo = 0;
1159 }
1160 else if (byteorder == 1) {
1161 /* force BE */
1162 ihi = 0;
1163 ilo = 1;
1164 }
1165
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001166 while (size-- > 0) {
1167 Py_UNICODE ch = *s++;
1168 Py_UNICODE ch2 = 0;
1169 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001170 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1171 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 }
Tim Peters772747b2001-08-09 22:21:55 +00001173 STORECHAR(ch);
1174 if (ch2)
1175 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001178#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179}
1180
1181PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1182{
1183 if (!PyUnicode_Check(unicode)) {
1184 PyErr_BadArgument();
1185 return NULL;
1186 }
1187 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1188 PyUnicode_GET_SIZE(unicode),
1189 NULL,
1190 0);
1191}
1192
1193/* --- Unicode Escape Codec ----------------------------------------------- */
1194
1195static
1196int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001197 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 const char *errors,
1199 const char *details)
1200{
1201 if ((errors == NULL) ||
1202 (strcmp(errors,"strict") == 0)) {
1203 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001204 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 details);
1206 return -1;
1207 }
1208 else if (strcmp(errors,"ignore") == 0) {
1209 return 0;
1210 }
1211 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001212 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 return 0;
1214 }
1215 else {
1216 PyErr_Format(PyExc_ValueError,
1217 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001218 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 errors);
1220 return -1;
1221 }
1222}
1223
Fredrik Lundh06d12682001-01-24 07:59:11 +00001224static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1227 int size,
1228 const char *errors)
1229{
1230 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001231 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001233 char* message;
1234 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1235
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 /* Escaped strings will always be longer than the resulting
1237 Unicode string, so we start with size here and then reduce the
1238 length after conversion to the true value. */
1239 v = _PyUnicode_New(size);
1240 if (v == NULL)
1241 goto onError;
1242 if (size == 0)
1243 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001244
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 p = buf = PyUnicode_AS_UNICODE(v);
1246 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 while (s < end) {
1249 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001250 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001251 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252
1253 /* Non-escape characters are interpreted as Unicode ordinals */
1254 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001255 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 continue;
1257 }
1258
1259 /* \ - Escapes */
1260 s++;
1261 switch (*s++) {
1262
1263 /* \x escapes */
1264 case '\n': break;
1265 case '\\': *p++ = '\\'; break;
1266 case '\'': *p++ = '\''; break;
1267 case '\"': *p++ = '\"'; break;
1268 case 'b': *p++ = '\b'; break;
1269 case 'f': *p++ = '\014'; break; /* FF */
1270 case 't': *p++ = '\t'; break;
1271 case 'n': *p++ = '\n'; break;
1272 case 'r': *p++ = '\r'; break;
1273 case 'v': *p++ = '\013'; break; /* VT */
1274 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1275
1276 /* \OOO (octal) escapes */
1277 case '0': case '1': case '2': case '3':
1278 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001279 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001281 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001283 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001285 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 break;
1287
Fredrik Lundhccc74732001-02-18 22:13:49 +00001288 /* hex escapes */
1289 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001291 digits = 2;
1292 message = "truncated \\xXX escape";
1293 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294
Fredrik Lundhccc74732001-02-18 22:13:49 +00001295 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001297 digits = 4;
1298 message = "truncated \\uXXXX escape";
1299 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
Fredrik Lundhccc74732001-02-18 22:13:49 +00001301 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001302 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001303 digits = 8;
1304 message = "truncated \\UXXXXXXXX escape";
1305 hexescape:
1306 chr = 0;
1307 for (i = 0; i < digits; i++) {
1308 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001309 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001310 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001311 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001312 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001313 i++;
1314 break;
1315 }
1316 chr = (chr<<4) & ~0xF;
1317 if (c >= '0' && c <= '9')
1318 chr += c - '0';
1319 else if (c >= 'a' && c <= 'f')
1320 chr += 10 + c - 'a';
1321 else
1322 chr += 10 + c - 'A';
1323 }
1324 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001325 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001326 /* when we get here, chr is a 32-bit unicode character */
1327 if (chr <= 0xffff)
1328 /* UCS-2 character */
1329 *p++ = (Py_UNICODE) chr;
1330 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001331 /* UCS-4 character. Either store directly, or as
1332 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001333#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001334 *p++ = chr;
1335#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001336 chr -= 0x10000L;
1337 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001338 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001339#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001340 } else {
1341 if (unicodeescape_decoding_error(
1342 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001343 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001344 )
1345 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001346 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001347 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001348 break;
1349
1350 /* \N{name} */
1351 case 'N':
1352 message = "malformed \\N character escape";
1353 if (ucnhash_CAPI == NULL) {
1354 /* load the unicode data module */
1355 PyObject *m, *v;
1356 m = PyImport_ImportModule("unicodedata");
1357 if (m == NULL)
1358 goto ucnhashError;
1359 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1360 Py_DECREF(m);
1361 if (v == NULL)
1362 goto ucnhashError;
1363 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1364 Py_DECREF(v);
1365 if (ucnhash_CAPI == NULL)
1366 goto ucnhashError;
1367 }
1368 if (*s == '{') {
1369 const char *start = s+1;
1370 /* look for the closing brace */
1371 while (*s != '}' && s < end)
1372 s++;
1373 if (s > start && s < end && *s == '}') {
1374 /* found a name. look it up in the unicode database */
1375 message = "unknown Unicode character name";
1376 s++;
1377 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1378 goto store;
1379 }
1380 }
1381 if (unicodeescape_decoding_error(&s, &x, errors, message))
1382 goto onError;
1383 *p++ = x;
1384 break;
1385
1386 default:
1387 *p++ = '\\';
1388 *p++ = (unsigned char)s[-1];
1389 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
1391 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001392 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001393 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 return (PyObject *)v;
1395
Fredrik Lundhccc74732001-02-18 22:13:49 +00001396ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001397 PyErr_SetString(
1398 PyExc_UnicodeError,
1399 "\\N escapes not supported (can't load unicodedata module)"
1400 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001401 return NULL;
1402
Fredrik Lundhccc74732001-02-18 22:13:49 +00001403onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404 Py_XDECREF(v);
1405 return NULL;
1406}
1407
1408/* Return a Unicode-Escape string version of the Unicode object.
1409
1410 If quotes is true, the string is enclosed in u"" or u'' quotes as
1411 appropriate.
1412
1413*/
1414
Barry Warsaw51ac5802000-03-20 16:36:48 +00001415static const Py_UNICODE *findchar(const Py_UNICODE *s,
1416 int size,
1417 Py_UNICODE ch);
1418
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419static
1420PyObject *unicodeescape_string(const Py_UNICODE *s,
1421 int size,
1422 int quotes)
1423{
1424 PyObject *repr;
1425 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001427 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001428
1429 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1430 if (repr == NULL)
1431 return NULL;
1432
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001433 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434
1435 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436 *p++ = 'u';
1437 *p++ = (findchar(s, size, '\'') &&
1438 !findchar(s, size, '"')) ? '"' : '\'';
1439 }
1440 while (size-- > 0) {
1441 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001442
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001444 if (quotes &&
1445 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 *p++ = '\\';
1447 *p++ = (char) ch;
1448 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001449
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001450#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001451 /* Map 21-bit characters to '\U00xxxxxx' */
1452 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001453 int offset = p - PyString_AS_STRING(repr);
1454
1455 /* Resize the string if necessary */
1456 if (offset + 12 > PyString_GET_SIZE(repr)) {
1457 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1458 goto onError;
1459 p = PyString_AS_STRING(repr) + offset;
1460 }
1461
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001462 *p++ = '\\';
1463 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001464 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1465 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1466 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1467 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1468 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1469 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1470 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001471 *p++ = hexdigit[ch & 0x0000000F];
1472 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001473 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001474#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001475 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1476 else if (ch >= 0xD800 && ch < 0xDC00) {
1477 Py_UNICODE ch2;
1478 Py_UCS4 ucs;
1479
1480 ch2 = *s++;
1481 size--;
1482 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1483 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1484 *p++ = '\\';
1485 *p++ = 'U';
1486 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1487 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1488 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1489 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1490 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1491 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1492 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1493 *p++ = hexdigit[ucs & 0x0000000F];
1494 continue;
1495 }
1496 /* Fall through: isolated surrogates are copied as-is */
1497 s--;
1498 size++;
1499 }
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001502 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 *p++ = '\\';
1504 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001505 *p++ = hexdigit[(ch >> 12) & 0x000F];
1506 *p++ = hexdigit[(ch >> 8) & 0x000F];
1507 *p++ = hexdigit[(ch >> 4) & 0x000F];
1508 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001510
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001511 /* Map special whitespace to '\t', \n', '\r' */
1512 else if (ch == '\t') {
1513 *p++ = '\\';
1514 *p++ = 't';
1515 }
1516 else if (ch == '\n') {
1517 *p++ = '\\';
1518 *p++ = 'n';
1519 }
1520 else if (ch == '\r') {
1521 *p++ = '\\';
1522 *p++ = 'r';
1523 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001524
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001525 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 else if (ch < ' ' || ch >= 128) {
1527 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001528 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001529 *p++ = hexdigit[(ch >> 4) & 0x000F];
1530 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001532
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 /* Copy everything else as-is */
1534 else
1535 *p++ = (char) ch;
1536 }
1537 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001538 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539
1540 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001541 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001542 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543
1544 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001545
1546 onError:
1547 Py_DECREF(repr);
1548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549}
1550
1551PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1552 int size)
1553{
1554 return unicodeescape_string(s, size, 0);
1555}
1556
1557PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1558{
1559 if (!PyUnicode_Check(unicode)) {
1560 PyErr_BadArgument();
1561 return NULL;
1562 }
1563 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1564 PyUnicode_GET_SIZE(unicode));
1565}
1566
1567/* --- Raw Unicode Escape Codec ------------------------------------------- */
1568
1569PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1570 int size,
1571 const char *errors)
1572{
1573 PyUnicodeObject *v;
1574 Py_UNICODE *p, *buf;
1575 const char *end;
1576 const char *bs;
1577
1578 /* Escaped strings will always be longer than the resulting
1579 Unicode string, so we start with size here and then reduce the
1580 length after conversion to the true value. */
1581 v = _PyUnicode_New(size);
1582 if (v == NULL)
1583 goto onError;
1584 if (size == 0)
1585 return (PyObject *)v;
1586 p = buf = PyUnicode_AS_UNICODE(v);
1587 end = s + size;
1588 while (s < end) {
1589 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001590 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 int i;
1592
1593 /* Non-escape characters are interpreted as Unicode ordinals */
1594 if (*s != '\\') {
1595 *p++ = (unsigned char)*s++;
1596 continue;
1597 }
1598
1599 /* \u-escapes are only interpreted iff the number of leading
1600 backslashes if odd */
1601 bs = s;
1602 for (;s < end;) {
1603 if (*s != '\\')
1604 break;
1605 *p++ = (unsigned char)*s++;
1606 }
1607 if (((s - bs) & 1) == 0 ||
1608 s >= end ||
1609 *s != 'u') {
1610 continue;
1611 }
1612 p--;
1613 s++;
1614
1615 /* \uXXXX with 4 hex digits */
1616 for (x = 0, i = 0; i < 4; i++) {
1617 c = (unsigned char)s[i];
1618 if (!isxdigit(c)) {
1619 if (unicodeescape_decoding_error(&s, &x, errors,
1620 "truncated \\uXXXX"))
1621 goto onError;
1622 i++;
1623 break;
1624 }
1625 x = (x<<4) & ~0xF;
1626 if (c >= '0' && c <= '9')
1627 x += c - '0';
1628 else if (c >= 'a' && c <= 'f')
1629 x += 10 + c - 'a';
1630 else
1631 x += 10 + c - 'A';
1632 }
1633 s += i;
1634 *p++ = x;
1635 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001637 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 return (PyObject *)v;
1639
1640 onError:
1641 Py_XDECREF(v);
1642 return NULL;
1643}
1644
1645PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1646 int size)
1647{
1648 PyObject *repr;
1649 char *p;
1650 char *q;
1651
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001652 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653
1654 repr = PyString_FromStringAndSize(NULL, 6 * size);
1655 if (repr == NULL)
1656 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001657 if (size == 0)
1658 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659
1660 p = q = PyString_AS_STRING(repr);
1661 while (size-- > 0) {
1662 Py_UNICODE ch = *s++;
1663 /* Map 16-bit characters to '\uxxxx' */
1664 if (ch >= 256) {
1665 *p++ = '\\';
1666 *p++ = 'u';
1667 *p++ = hexdigit[(ch >> 12) & 0xf];
1668 *p++ = hexdigit[(ch >> 8) & 0xf];
1669 *p++ = hexdigit[(ch >> 4) & 0xf];
1670 *p++ = hexdigit[ch & 15];
1671 }
1672 /* Copy everything else as-is */
1673 else
1674 *p++ = (char) ch;
1675 }
1676 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001677 if (_PyString_Resize(&repr, p - q))
1678 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679
1680 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001681
1682 onError:
1683 Py_DECREF(repr);
1684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685}
1686
1687PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1688{
1689 if (!PyUnicode_Check(unicode)) {
1690 PyErr_BadArgument();
1691 return NULL;
1692 }
1693 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1694 PyUnicode_GET_SIZE(unicode));
1695}
1696
1697/* --- Latin-1 Codec ------------------------------------------------------ */
1698
1699PyObject *PyUnicode_DecodeLatin1(const char *s,
1700 int size,
1701 const char *errors)
1702{
1703 PyUnicodeObject *v;
1704 Py_UNICODE *p;
1705
1706 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001707 if (size == 1 && *(unsigned char*)s < 256) {
1708 Py_UNICODE r = *(unsigned char*)s;
1709 return PyUnicode_FromUnicode(&r, 1);
1710 }
1711
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 v = _PyUnicode_New(size);
1713 if (v == NULL)
1714 goto onError;
1715 if (size == 0)
1716 return (PyObject *)v;
1717 p = PyUnicode_AS_UNICODE(v);
1718 while (size-- > 0)
1719 *p++ = (unsigned char)*s++;
1720 return (PyObject *)v;
1721
1722 onError:
1723 Py_XDECREF(v);
1724 return NULL;
1725}
1726
1727static
1728int latin1_encoding_error(const Py_UNICODE **source,
1729 char **dest,
1730 const char *errors,
1731 const char *details)
1732{
1733 if ((errors == NULL) ||
1734 (strcmp(errors,"strict") == 0)) {
1735 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001736 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 details);
1738 return -1;
1739 }
1740 else if (strcmp(errors,"ignore") == 0) {
1741 return 0;
1742 }
1743 else if (strcmp(errors,"replace") == 0) {
1744 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001745 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746 return 0;
1747 }
1748 else {
1749 PyErr_Format(PyExc_ValueError,
1750 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001751 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752 errors);
1753 return -1;
1754 }
1755}
1756
1757PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1758 int size,
1759 const char *errors)
1760{
1761 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001762 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001763
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 repr = PyString_FromStringAndSize(NULL, size);
1765 if (repr == NULL)
1766 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001767 if (size == 0)
1768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769
1770 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001771 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 while (size-- > 0) {
1773 Py_UNICODE ch = *p++;
1774 if (ch >= 256) {
1775 if (latin1_encoding_error(&p, &s, errors,
1776 "ordinal not in range(256)"))
1777 goto onError;
1778 }
1779 else
1780 *s++ = (char)ch;
1781 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001782 /* Resize if error handling skipped some characters */
1783 if (s - start < PyString_GET_SIZE(repr))
1784 if (_PyString_Resize(&repr, s - start))
1785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 return repr;
1787
1788 onError:
1789 Py_DECREF(repr);
1790 return NULL;
1791}
1792
1793PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1794{
1795 if (!PyUnicode_Check(unicode)) {
1796 PyErr_BadArgument();
1797 return NULL;
1798 }
1799 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1800 PyUnicode_GET_SIZE(unicode),
1801 NULL);
1802}
1803
1804/* --- 7-bit ASCII Codec -------------------------------------------------- */
1805
1806static
1807int ascii_decoding_error(const char **source,
1808 Py_UNICODE **dest,
1809 const char *errors,
1810 const char *details)
1811{
1812 if ((errors == NULL) ||
1813 (strcmp(errors,"strict") == 0)) {
1814 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001815 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 details);
1817 return -1;
1818 }
1819 else if (strcmp(errors,"ignore") == 0) {
1820 return 0;
1821 }
1822 else if (strcmp(errors,"replace") == 0) {
1823 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1824 (*dest)++;
1825 return 0;
1826 }
1827 else {
1828 PyErr_Format(PyExc_ValueError,
1829 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001830 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 errors);
1832 return -1;
1833 }
1834}
1835
1836PyObject *PyUnicode_DecodeASCII(const char *s,
1837 int size,
1838 const char *errors)
1839{
1840 PyUnicodeObject *v;
1841 Py_UNICODE *p;
1842
1843 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001844 if (size == 1 && *(unsigned char*)s < 128) {
1845 Py_UNICODE r = *(unsigned char*)s;
1846 return PyUnicode_FromUnicode(&r, 1);
1847 }
1848
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 v = _PyUnicode_New(size);
1850 if (v == NULL)
1851 goto onError;
1852 if (size == 0)
1853 return (PyObject *)v;
1854 p = PyUnicode_AS_UNICODE(v);
1855 while (size-- > 0) {
1856 register unsigned char c;
1857
1858 c = (unsigned char)*s++;
1859 if (c < 128)
1860 *p++ = c;
1861 else if (ascii_decoding_error(&s, &p, errors,
1862 "ordinal not in range(128)"))
1863 goto onError;
1864 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001865 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001866 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001867 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 return (PyObject *)v;
1869
1870 onError:
1871 Py_XDECREF(v);
1872 return NULL;
1873}
1874
1875static
1876int ascii_encoding_error(const Py_UNICODE **source,
1877 char **dest,
1878 const char *errors,
1879 const char *details)
1880{
1881 if ((errors == NULL) ||
1882 (strcmp(errors,"strict") == 0)) {
1883 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001884 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 details);
1886 return -1;
1887 }
1888 else if (strcmp(errors,"ignore") == 0) {
1889 return 0;
1890 }
1891 else if (strcmp(errors,"replace") == 0) {
1892 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001893 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894 return 0;
1895 }
1896 else {
1897 PyErr_Format(PyExc_ValueError,
1898 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001899 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 errors);
1901 return -1;
1902 }
1903}
1904
1905PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1906 int size,
1907 const char *errors)
1908{
1909 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001910 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001911
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 repr = PyString_FromStringAndSize(NULL, size);
1913 if (repr == NULL)
1914 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001915 if (size == 0)
1916 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917
1918 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001919 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 while (size-- > 0) {
1921 Py_UNICODE ch = *p++;
1922 if (ch >= 128) {
1923 if (ascii_encoding_error(&p, &s, errors,
1924 "ordinal not in range(128)"))
1925 goto onError;
1926 }
1927 else
1928 *s++ = (char)ch;
1929 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001930 /* Resize if error handling skipped some characters */
1931 if (s - start < PyString_GET_SIZE(repr))
1932 if (_PyString_Resize(&repr, s - start))
1933 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 return repr;
1935
1936 onError:
1937 Py_DECREF(repr);
1938 return NULL;
1939}
1940
1941PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1942{
1943 if (!PyUnicode_Check(unicode)) {
1944 PyErr_BadArgument();
1945 return NULL;
1946 }
1947 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1948 PyUnicode_GET_SIZE(unicode),
1949 NULL);
1950}
1951
Fredrik Lundh30831632001-06-26 15:11:00 +00001952#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001953
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001954/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001955
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001956PyObject *PyUnicode_DecodeMBCS(const char *s,
1957 int size,
1958 const char *errors)
1959{
1960 PyUnicodeObject *v;
1961 Py_UNICODE *p;
1962
1963 /* First get the size of the result */
1964 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001965 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001966 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1967
1968 v = _PyUnicode_New(usize);
1969 if (v == NULL)
1970 return NULL;
1971 if (usize == 0)
1972 return (PyObject *)v;
1973 p = PyUnicode_AS_UNICODE(v);
1974 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1975 Py_DECREF(v);
1976 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1977 }
1978
1979 return (PyObject *)v;
1980}
1981
1982PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1983 int size,
1984 const char *errors)
1985{
1986 PyObject *repr;
1987 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001988 DWORD mbcssize;
1989
1990 /* If there are no characters, bail now! */
1991 if (size==0)
1992 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001993
1994 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001995 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001996 if (mbcssize==0)
1997 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1998
1999 repr = PyString_FromStringAndSize(NULL, mbcssize);
2000 if (repr == NULL)
2001 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002002 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002003 return repr;
2004
2005 /* Do the conversion */
2006 s = PyString_AS_STRING(repr);
2007 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2008 Py_DECREF(repr);
2009 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2010 }
2011 return repr;
2012}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002013
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002014#endif /* MS_WIN32 */
2015
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016/* --- Character Mapping Codec -------------------------------------------- */
2017
2018static
2019int charmap_decoding_error(const char **source,
2020 Py_UNICODE **dest,
2021 const char *errors,
2022 const char *details)
2023{
2024 if ((errors == NULL) ||
2025 (strcmp(errors,"strict") == 0)) {
2026 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002027 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 details);
2029 return -1;
2030 }
2031 else if (strcmp(errors,"ignore") == 0) {
2032 return 0;
2033 }
2034 else if (strcmp(errors,"replace") == 0) {
2035 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2036 (*dest)++;
2037 return 0;
2038 }
2039 else {
2040 PyErr_Format(PyExc_ValueError,
2041 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002042 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 errors);
2044 return -1;
2045 }
2046}
2047
2048PyObject *PyUnicode_DecodeCharmap(const char *s,
2049 int size,
2050 PyObject *mapping,
2051 const char *errors)
2052{
2053 PyUnicodeObject *v;
2054 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002055 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 /* Default to Latin-1 */
2058 if (mapping == NULL)
2059 return PyUnicode_DecodeLatin1(s, size, errors);
2060
2061 v = _PyUnicode_New(size);
2062 if (v == NULL)
2063 goto onError;
2064 if (size == 0)
2065 return (PyObject *)v;
2066 p = PyUnicode_AS_UNICODE(v);
2067 while (size-- > 0) {
2068 unsigned char ch = *s++;
2069 PyObject *w, *x;
2070
2071 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2072 w = PyInt_FromLong((long)ch);
2073 if (w == NULL)
2074 goto onError;
2075 x = PyObject_GetItem(mapping, w);
2076 Py_DECREF(w);
2077 if (x == NULL) {
2078 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002079 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002081 x = Py_None;
2082 Py_INCREF(x);
2083 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002084 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 }
2086
2087 /* Apply mapping */
2088 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002089 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 if (value < 0 || value > 65535) {
2091 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002092 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 Py_DECREF(x);
2094 goto onError;
2095 }
2096 *p++ = (Py_UNICODE)value;
2097 }
2098 else if (x == Py_None) {
2099 /* undefined mapping */
2100 if (charmap_decoding_error(&s, &p, errors,
2101 "character maps to <undefined>")) {
2102 Py_DECREF(x);
2103 goto onError;
2104 }
2105 }
2106 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002107 int targetsize = PyUnicode_GET_SIZE(x);
2108
2109 if (targetsize == 1)
2110 /* 1-1 mapping */
2111 *p++ = *PyUnicode_AS_UNICODE(x);
2112
2113 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002115 if (targetsize > extrachars) {
2116 /* resize first */
2117 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2118 int needed = (targetsize - extrachars) + \
2119 (targetsize << 2);
2120 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002121 if (_PyUnicode_Resize(&v,
2122 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002123 Py_DECREF(x);
2124 goto onError;
2125 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002126 p = PyUnicode_AS_UNICODE(v) + oldpos;
2127 }
2128 Py_UNICODE_COPY(p,
2129 PyUnicode_AS_UNICODE(x),
2130 targetsize);
2131 p += targetsize;
2132 extrachars -= targetsize;
2133 }
2134 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 }
2136 else {
2137 /* wrong return value */
2138 PyErr_SetString(PyExc_TypeError,
2139 "character mapping must return integer, None or unicode");
2140 Py_DECREF(x);
2141 goto onError;
2142 }
2143 Py_DECREF(x);
2144 }
2145 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002146 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 goto onError;
2148 return (PyObject *)v;
2149
2150 onError:
2151 Py_XDECREF(v);
2152 return NULL;
2153}
2154
2155static
2156int charmap_encoding_error(const Py_UNICODE **source,
2157 char **dest,
2158 const char *errors,
2159 const char *details)
2160{
2161 if ((errors == NULL) ||
2162 (strcmp(errors,"strict") == 0)) {
2163 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 details);
2166 return -1;
2167 }
2168 else if (strcmp(errors,"ignore") == 0) {
2169 return 0;
2170 }
2171 else if (strcmp(errors,"replace") == 0) {
2172 **dest = '?';
2173 (*dest)++;
2174 return 0;
2175 }
2176 else {
2177 PyErr_Format(PyExc_ValueError,
2178 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002179 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 errors);
2181 return -1;
2182 }
2183}
2184
2185PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2186 int size,
2187 PyObject *mapping,
2188 const char *errors)
2189{
2190 PyObject *v;
2191 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002192 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193
2194 /* Default to Latin-1 */
2195 if (mapping == NULL)
2196 return PyUnicode_EncodeLatin1(p, size, errors);
2197
2198 v = PyString_FromStringAndSize(NULL, size);
2199 if (v == NULL)
2200 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002201 if (size == 0)
2202 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 s = PyString_AS_STRING(v);
2204 while (size-- > 0) {
2205 Py_UNICODE ch = *p++;
2206 PyObject *w, *x;
2207
2208 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2209 w = PyInt_FromLong((long)ch);
2210 if (w == NULL)
2211 goto onError;
2212 x = PyObject_GetItem(mapping, w);
2213 Py_DECREF(w);
2214 if (x == NULL) {
2215 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002216 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002218 x = Py_None;
2219 Py_INCREF(x);
2220 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002221 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 }
2223
2224 /* Apply mapping */
2225 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002226 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 if (value < 0 || value > 255) {
2228 PyErr_SetString(PyExc_TypeError,
2229 "character mapping must be in range(256)");
2230 Py_DECREF(x);
2231 goto onError;
2232 }
2233 *s++ = (char)value;
2234 }
2235 else if (x == Py_None) {
2236 /* undefined mapping */
2237 if (charmap_encoding_error(&p, &s, errors,
2238 "character maps to <undefined>")) {
2239 Py_DECREF(x);
2240 goto onError;
2241 }
2242 }
2243 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002244 int targetsize = PyString_GET_SIZE(x);
2245
2246 if (targetsize == 1)
2247 /* 1-1 mapping */
2248 *s++ = *PyString_AS_STRING(x);
2249
2250 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002252 if (targetsize > extrachars) {
2253 /* resize first */
2254 int oldpos = (int)(s - PyString_AS_STRING(v));
2255 int needed = (targetsize - extrachars) + \
2256 (targetsize << 2);
2257 extrachars += needed;
2258 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002259 Py_DECREF(x);
2260 goto onError;
2261 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002262 s = PyString_AS_STRING(v) + oldpos;
2263 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002264 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002265 s += targetsize;
2266 extrachars -= targetsize;
2267 }
2268 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
2270 else {
2271 /* wrong return value */
2272 PyErr_SetString(PyExc_TypeError,
2273 "character mapping must return integer, None or unicode");
2274 Py_DECREF(x);
2275 goto onError;
2276 }
2277 Py_DECREF(x);
2278 }
2279 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2280 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2281 goto onError;
2282 return v;
2283
2284 onError:
2285 Py_DECREF(v);
2286 return NULL;
2287}
2288
2289PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2290 PyObject *mapping)
2291{
2292 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2293 PyErr_BadArgument();
2294 return NULL;
2295 }
2296 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2297 PyUnicode_GET_SIZE(unicode),
2298 mapping,
2299 NULL);
2300}
2301
2302static
2303int translate_error(const Py_UNICODE **source,
2304 Py_UNICODE **dest,
2305 const char *errors,
2306 const char *details)
2307{
2308 if ((errors == NULL) ||
2309 (strcmp(errors,"strict") == 0)) {
2310 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002311 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 details);
2313 return -1;
2314 }
2315 else if (strcmp(errors,"ignore") == 0) {
2316 return 0;
2317 }
2318 else if (strcmp(errors,"replace") == 0) {
2319 **dest = '?';
2320 (*dest)++;
2321 return 0;
2322 }
2323 else {
2324 PyErr_Format(PyExc_ValueError,
2325 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002326 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 errors);
2328 return -1;
2329 }
2330}
2331
2332PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2333 int size,
2334 PyObject *mapping,
2335 const char *errors)
2336{
2337 PyUnicodeObject *v;
2338 Py_UNICODE *p;
2339
2340 if (mapping == NULL) {
2341 PyErr_BadArgument();
2342 return NULL;
2343 }
2344
2345 /* Output will never be longer than input */
2346 v = _PyUnicode_New(size);
2347 if (v == NULL)
2348 goto onError;
2349 if (size == 0)
2350 goto done;
2351 p = PyUnicode_AS_UNICODE(v);
2352 while (size-- > 0) {
2353 Py_UNICODE ch = *s++;
2354 PyObject *w, *x;
2355
2356 /* Get mapping */
2357 w = PyInt_FromLong(ch);
2358 if (w == NULL)
2359 goto onError;
2360 x = PyObject_GetItem(mapping, w);
2361 Py_DECREF(w);
2362 if (x == NULL) {
2363 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2364 /* No mapping found: default to 1-1 mapping */
2365 PyErr_Clear();
2366 *p++ = ch;
2367 continue;
2368 }
2369 goto onError;
2370 }
2371
2372 /* Apply mapping */
2373 if (PyInt_Check(x))
2374 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2375 else if (x == Py_None) {
2376 /* undefined mapping */
2377 if (translate_error(&s, &p, errors,
2378 "character maps to <undefined>")) {
2379 Py_DECREF(x);
2380 goto onError;
2381 }
2382 }
2383 else if (PyUnicode_Check(x)) {
2384 if (PyUnicode_GET_SIZE(x) != 1) {
2385 /* 1-n mapping */
2386 PyErr_SetString(PyExc_NotImplementedError,
2387 "1-n mappings are currently not implemented");
2388 Py_DECREF(x);
2389 goto onError;
2390 }
2391 *p++ = *PyUnicode_AS_UNICODE(x);
2392 }
2393 else {
2394 /* wrong return value */
2395 PyErr_SetString(PyExc_TypeError,
2396 "translate mapping must return integer, None or unicode");
2397 Py_DECREF(x);
2398 goto onError;
2399 }
2400 Py_DECREF(x);
2401 }
2402 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002403 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405
2406 done:
2407 return (PyObject *)v;
2408
2409 onError:
2410 Py_XDECREF(v);
2411 return NULL;
2412}
2413
2414PyObject *PyUnicode_Translate(PyObject *str,
2415 PyObject *mapping,
2416 const char *errors)
2417{
2418 PyObject *result;
2419
2420 str = PyUnicode_FromObject(str);
2421 if (str == NULL)
2422 goto onError;
2423 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2424 PyUnicode_GET_SIZE(str),
2425 mapping,
2426 errors);
2427 Py_DECREF(str);
2428 return result;
2429
2430 onError:
2431 Py_XDECREF(str);
2432 return NULL;
2433}
2434
Guido van Rossum9e896b32000-04-05 20:11:21 +00002435/* --- Decimal Encoder ---------------------------------------------------- */
2436
2437int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2438 int length,
2439 char *output,
2440 const char *errors)
2441{
2442 Py_UNICODE *p, *end;
2443
2444 if (output == NULL) {
2445 PyErr_BadArgument();
2446 return -1;
2447 }
2448
2449 p = s;
2450 end = s + length;
2451 while (p < end) {
2452 register Py_UNICODE ch = *p++;
2453 int decimal;
2454
2455 if (Py_UNICODE_ISSPACE(ch)) {
2456 *output++ = ' ';
2457 continue;
2458 }
2459 decimal = Py_UNICODE_TODECIMAL(ch);
2460 if (decimal >= 0) {
2461 *output++ = '0' + decimal;
2462 continue;
2463 }
Guido van Rossumba477042000-04-06 18:18:10 +00002464 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002465 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002466 continue;
2467 }
2468 /* All other characters are considered invalid */
2469 if (errors == NULL || strcmp(errors, "strict") == 0) {
2470 PyErr_SetString(PyExc_ValueError,
2471 "invalid decimal Unicode string");
2472 goto onError;
2473 }
2474 else if (strcmp(errors, "ignore") == 0)
2475 continue;
2476 else if (strcmp(errors, "replace") == 0) {
2477 *output++ = '?';
2478 continue;
2479 }
2480 }
2481 /* 0-terminate the output string */
2482 *output++ = '\0';
2483 return 0;
2484
2485 onError:
2486 return -1;
2487}
2488
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489/* --- Helpers ------------------------------------------------------------ */
2490
2491static
2492int count(PyUnicodeObject *self,
2493 int start,
2494 int end,
2495 PyUnicodeObject *substring)
2496{
2497 int count = 0;
2498
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002499 if (start < 0)
2500 start += self->length;
2501 if (start < 0)
2502 start = 0;
2503 if (end > self->length)
2504 end = self->length;
2505 if (end < 0)
2506 end += self->length;
2507 if (end < 0)
2508 end = 0;
2509
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002510 if (substring->length == 0)
2511 return (end - start + 1);
2512
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 end -= substring->length;
2514
2515 while (start <= end)
2516 if (Py_UNICODE_MATCH(self, start, substring)) {
2517 count++;
2518 start += substring->length;
2519 } else
2520 start++;
2521
2522 return count;
2523}
2524
2525int PyUnicode_Count(PyObject *str,
2526 PyObject *substr,
2527 int start,
2528 int end)
2529{
2530 int result;
2531
2532 str = PyUnicode_FromObject(str);
2533 if (str == NULL)
2534 return -1;
2535 substr = PyUnicode_FromObject(substr);
2536 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002537 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538 return -1;
2539 }
2540
2541 result = count((PyUnicodeObject *)str,
2542 start, end,
2543 (PyUnicodeObject *)substr);
2544
2545 Py_DECREF(str);
2546 Py_DECREF(substr);
2547 return result;
2548}
2549
2550static
2551int findstring(PyUnicodeObject *self,
2552 PyUnicodeObject *substring,
2553 int start,
2554 int end,
2555 int direction)
2556{
2557 if (start < 0)
2558 start += self->length;
2559 if (start < 0)
2560 start = 0;
2561
2562 if (substring->length == 0)
2563 return start;
2564
2565 if (end > self->length)
2566 end = self->length;
2567 if (end < 0)
2568 end += self->length;
2569 if (end < 0)
2570 end = 0;
2571
2572 end -= substring->length;
2573
2574 if (direction < 0) {
2575 for (; end >= start; end--)
2576 if (Py_UNICODE_MATCH(self, end, substring))
2577 return end;
2578 } else {
2579 for (; start <= end; start++)
2580 if (Py_UNICODE_MATCH(self, start, substring))
2581 return start;
2582 }
2583
2584 return -1;
2585}
2586
2587int PyUnicode_Find(PyObject *str,
2588 PyObject *substr,
2589 int start,
2590 int end,
2591 int direction)
2592{
2593 int result;
2594
2595 str = PyUnicode_FromObject(str);
2596 if (str == NULL)
2597 return -1;
2598 substr = PyUnicode_FromObject(substr);
2599 if (substr == NULL) {
2600 Py_DECREF(substr);
2601 return -1;
2602 }
2603
2604 result = findstring((PyUnicodeObject *)str,
2605 (PyUnicodeObject *)substr,
2606 start, end, direction);
2607 Py_DECREF(str);
2608 Py_DECREF(substr);
2609 return result;
2610}
2611
2612static
2613int tailmatch(PyUnicodeObject *self,
2614 PyUnicodeObject *substring,
2615 int start,
2616 int end,
2617 int direction)
2618{
2619 if (start < 0)
2620 start += self->length;
2621 if (start < 0)
2622 start = 0;
2623
2624 if (substring->length == 0)
2625 return 1;
2626
2627 if (end > self->length)
2628 end = self->length;
2629 if (end < 0)
2630 end += self->length;
2631 if (end < 0)
2632 end = 0;
2633
2634 end -= substring->length;
2635 if (end < start)
2636 return 0;
2637
2638 if (direction > 0) {
2639 if (Py_UNICODE_MATCH(self, end, substring))
2640 return 1;
2641 } else {
2642 if (Py_UNICODE_MATCH(self, start, substring))
2643 return 1;
2644 }
2645
2646 return 0;
2647}
2648
2649int PyUnicode_Tailmatch(PyObject *str,
2650 PyObject *substr,
2651 int start,
2652 int end,
2653 int direction)
2654{
2655 int result;
2656
2657 str = PyUnicode_FromObject(str);
2658 if (str == NULL)
2659 return -1;
2660 substr = PyUnicode_FromObject(substr);
2661 if (substr == NULL) {
2662 Py_DECREF(substr);
2663 return -1;
2664 }
2665
2666 result = tailmatch((PyUnicodeObject *)str,
2667 (PyUnicodeObject *)substr,
2668 start, end, direction);
2669 Py_DECREF(str);
2670 Py_DECREF(substr);
2671 return result;
2672}
2673
2674static
2675const Py_UNICODE *findchar(const Py_UNICODE *s,
2676 int size,
2677 Py_UNICODE ch)
2678{
2679 /* like wcschr, but doesn't stop at NULL characters */
2680
2681 while (size-- > 0) {
2682 if (*s == ch)
2683 return s;
2684 s++;
2685 }
2686
2687 return NULL;
2688}
2689
2690/* Apply fixfct filter to the Unicode object self and return a
2691 reference to the modified object */
2692
2693static
2694PyObject *fixup(PyUnicodeObject *self,
2695 int (*fixfct)(PyUnicodeObject *s))
2696{
2697
2698 PyUnicodeObject *u;
2699
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002700 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 if (u == NULL)
2702 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002703
2704 Py_UNICODE_COPY(u->str, self->str, self->length);
2705
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 if (!fixfct(u)) {
2707 /* fixfct should return TRUE if it modified the buffer. If
2708 FALSE, return a reference to the original buffer instead
2709 (to save space, not time) */
2710 Py_INCREF(self);
2711 Py_DECREF(u);
2712 return (PyObject*) self;
2713 }
2714 return (PyObject*) u;
2715}
2716
2717static
2718int fixupper(PyUnicodeObject *self)
2719{
2720 int len = self->length;
2721 Py_UNICODE *s = self->str;
2722 int status = 0;
2723
2724 while (len-- > 0) {
2725 register Py_UNICODE ch;
2726
2727 ch = Py_UNICODE_TOUPPER(*s);
2728 if (ch != *s) {
2729 status = 1;
2730 *s = ch;
2731 }
2732 s++;
2733 }
2734
2735 return status;
2736}
2737
2738static
2739int fixlower(PyUnicodeObject *self)
2740{
2741 int len = self->length;
2742 Py_UNICODE *s = self->str;
2743 int status = 0;
2744
2745 while (len-- > 0) {
2746 register Py_UNICODE ch;
2747
2748 ch = Py_UNICODE_TOLOWER(*s);
2749 if (ch != *s) {
2750 status = 1;
2751 *s = ch;
2752 }
2753 s++;
2754 }
2755
2756 return status;
2757}
2758
2759static
2760int fixswapcase(PyUnicodeObject *self)
2761{
2762 int len = self->length;
2763 Py_UNICODE *s = self->str;
2764 int status = 0;
2765
2766 while (len-- > 0) {
2767 if (Py_UNICODE_ISUPPER(*s)) {
2768 *s = Py_UNICODE_TOLOWER(*s);
2769 status = 1;
2770 } else if (Py_UNICODE_ISLOWER(*s)) {
2771 *s = Py_UNICODE_TOUPPER(*s);
2772 status = 1;
2773 }
2774 s++;
2775 }
2776
2777 return status;
2778}
2779
2780static
2781int fixcapitalize(PyUnicodeObject *self)
2782{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002783 int len = self->length;
2784 Py_UNICODE *s = self->str;
2785 int status = 0;
2786
2787 if (len == 0)
2788 return 0;
2789 if (Py_UNICODE_ISLOWER(*s)) {
2790 *s = Py_UNICODE_TOUPPER(*s);
2791 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002793 s++;
2794 while (--len > 0) {
2795 if (Py_UNICODE_ISUPPER(*s)) {
2796 *s = Py_UNICODE_TOLOWER(*s);
2797 status = 1;
2798 }
2799 s++;
2800 }
2801 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802}
2803
2804static
2805int fixtitle(PyUnicodeObject *self)
2806{
2807 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2808 register Py_UNICODE *e;
2809 int previous_is_cased;
2810
2811 /* Shortcut for single character strings */
2812 if (PyUnicode_GET_SIZE(self) == 1) {
2813 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2814 if (*p != ch) {
2815 *p = ch;
2816 return 1;
2817 }
2818 else
2819 return 0;
2820 }
2821
2822 e = p + PyUnicode_GET_SIZE(self);
2823 previous_is_cased = 0;
2824 for (; p < e; p++) {
2825 register const Py_UNICODE ch = *p;
2826
2827 if (previous_is_cased)
2828 *p = Py_UNICODE_TOLOWER(ch);
2829 else
2830 *p = Py_UNICODE_TOTITLE(ch);
2831
2832 if (Py_UNICODE_ISLOWER(ch) ||
2833 Py_UNICODE_ISUPPER(ch) ||
2834 Py_UNICODE_ISTITLE(ch))
2835 previous_is_cased = 1;
2836 else
2837 previous_is_cased = 0;
2838 }
2839 return 1;
2840}
2841
2842PyObject *PyUnicode_Join(PyObject *separator,
2843 PyObject *seq)
2844{
2845 Py_UNICODE *sep;
2846 int seplen;
2847 PyUnicodeObject *res = NULL;
2848 int reslen = 0;
2849 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 int sz = 100;
2851 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002852 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853
Tim Peters2cfe3682001-05-05 05:36:48 +00002854 it = PyObject_GetIter(seq);
2855 if (it == NULL)
2856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857
2858 if (separator == NULL) {
2859 Py_UNICODE blank = ' ';
2860 sep = &blank;
2861 seplen = 1;
2862 }
2863 else {
2864 separator = PyUnicode_FromObject(separator);
2865 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002866 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 sep = PyUnicode_AS_UNICODE(separator);
2868 seplen = PyUnicode_GET_SIZE(separator);
2869 }
2870
2871 res = _PyUnicode_New(sz);
2872 if (res == NULL)
2873 goto onError;
2874 p = PyUnicode_AS_UNICODE(res);
2875 reslen = 0;
2876
Tim Peters2cfe3682001-05-05 05:36:48 +00002877 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002879 PyObject *item = PyIter_Next(it);
2880 if (item == NULL) {
2881 if (PyErr_Occurred())
2882 goto onError;
2883 break;
2884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 if (!PyUnicode_Check(item)) {
2886 PyObject *v;
2887 v = PyUnicode_FromObject(item);
2888 Py_DECREF(item);
2889 item = v;
2890 if (item == NULL)
2891 goto onError;
2892 }
2893 itemlen = PyUnicode_GET_SIZE(item);
2894 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002895 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 goto onError;
2897 sz *= 2;
2898 p = PyUnicode_AS_UNICODE(res) + reslen;
2899 }
2900 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002901 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 p += seplen;
2903 reslen += seplen;
2904 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002905 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 p += itemlen;
2907 reslen += itemlen;
2908 Py_DECREF(item);
2909 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002910 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 goto onError;
2912
2913 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002914 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 return (PyObject *)res;
2916
2917 onError:
2918 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002919 Py_XDECREF(res);
2920 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 return NULL;
2922}
2923
2924static
2925PyUnicodeObject *pad(PyUnicodeObject *self,
2926 int left,
2927 int right,
2928 Py_UNICODE fill)
2929{
2930 PyUnicodeObject *u;
2931
2932 if (left < 0)
2933 left = 0;
2934 if (right < 0)
2935 right = 0;
2936
2937 if (left == 0 && right == 0) {
2938 Py_INCREF(self);
2939 return self;
2940 }
2941
2942 u = _PyUnicode_New(left + self->length + right);
2943 if (u) {
2944 if (left)
2945 Py_UNICODE_FILL(u->str, fill, left);
2946 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2947 if (right)
2948 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2949 }
2950
2951 return u;
2952}
2953
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a `PyObject *str` scratch variable,
   a `list` object and an `onError` label; control jumps to onError if
   creating or appending the slice fails.  The local reference held in
   str is released on both the success and the failure path.

   Wrapped in do { } while (0) so it behaves as a single statement, and
   all arguments are parenthesized so expression arguments are safe. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2964
2965static
2966PyObject *split_whitespace(PyUnicodeObject *self,
2967 PyObject *list,
2968 int maxcount)
2969{
2970 register int i;
2971 register int j;
2972 int len = self->length;
2973 PyObject *str;
2974
2975 for (i = j = 0; i < len; ) {
2976 /* find a token */
2977 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2978 i++;
2979 j = i;
2980 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2981 i++;
2982 if (j < i) {
2983 if (maxcount-- <= 0)
2984 break;
2985 SPLIT_APPEND(self->str, j, i);
2986 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2987 i++;
2988 j = i;
2989 }
2990 }
2991 if (j < len) {
2992 SPLIT_APPEND(self->str, j, len);
2993 }
2994 return list;
2995
2996 onError:
2997 Py_DECREF(list);
2998 return NULL;
2999}
3000
3001PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003002 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003{
3004 register int i;
3005 register int j;
3006 int len;
3007 PyObject *list;
3008 PyObject *str;
3009 Py_UNICODE *data;
3010
3011 string = PyUnicode_FromObject(string);
3012 if (string == NULL)
3013 return NULL;
3014 data = PyUnicode_AS_UNICODE(string);
3015 len = PyUnicode_GET_SIZE(string);
3016
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 list = PyList_New(0);
3018 if (!list)
3019 goto onError;
3020
3021 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003022 int eol;
3023
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 /* Find a line and append it */
3025 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3026 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
3028 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003029 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 if (i < len) {
3031 if (data[i] == '\r' && i + 1 < len &&
3032 data[i+1] == '\n')
3033 i += 2;
3034 else
3035 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003036 if (keepends)
3037 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 }
Guido van Rossum86662912000-04-11 15:38:46 +00003039 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 j = i;
3041 }
3042 if (j < len) {
3043 SPLIT_APPEND(data, j, len);
3044 }
3045
3046 Py_DECREF(string);
3047 return list;
3048
3049 onError:
3050 Py_DECREF(list);
3051 Py_DECREF(string);
3052 return NULL;
3053}
3054
3055static
3056PyObject *split_char(PyUnicodeObject *self,
3057 PyObject *list,
3058 Py_UNICODE ch,
3059 int maxcount)
3060{
3061 register int i;
3062 register int j;
3063 int len = self->length;
3064 PyObject *str;
3065
3066 for (i = j = 0; i < len; ) {
3067 if (self->str[i] == ch) {
3068 if (maxcount-- <= 0)
3069 break;
3070 SPLIT_APPEND(self->str, j, i);
3071 i = j = i + 1;
3072 } else
3073 i++;
3074 }
3075 if (j <= len) {
3076 SPLIT_APPEND(self->str, j, len);
3077 }
3078 return list;
3079
3080 onError:
3081 Py_DECREF(list);
3082 return NULL;
3083}
3084
3085static
3086PyObject *split_substring(PyUnicodeObject *self,
3087 PyObject *list,
3088 PyUnicodeObject *substring,
3089 int maxcount)
3090{
3091 register int i;
3092 register int j;
3093 int len = self->length;
3094 int sublen = substring->length;
3095 PyObject *str;
3096
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003097 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 if (Py_UNICODE_MATCH(self, i, substring)) {
3099 if (maxcount-- <= 0)
3100 break;
3101 SPLIT_APPEND(self->str, j, i);
3102 i = j = i + sublen;
3103 } else
3104 i++;
3105 }
3106 if (j <= len) {
3107 SPLIT_APPEND(self->str, j, len);
3108 }
3109 return list;
3110
3111 onError:
3112 Py_DECREF(list);
3113 return NULL;
3114}
3115
3116#undef SPLIT_APPEND
3117
3118static
3119PyObject *split(PyUnicodeObject *self,
3120 PyUnicodeObject *substring,
3121 int maxcount)
3122{
3123 PyObject *list;
3124
3125 if (maxcount < 0)
3126 maxcount = INT_MAX;
3127
3128 list = PyList_New(0);
3129 if (!list)
3130 return NULL;
3131
3132 if (substring == NULL)
3133 return split_whitespace(self,list,maxcount);
3134
3135 else if (substring->length == 1)
3136 return split_char(self,list,substring->str[0],maxcount);
3137
3138 else if (substring->length == 0) {
3139 Py_DECREF(list);
3140 PyErr_SetString(PyExc_ValueError, "empty separator");
3141 return NULL;
3142 }
3143 else
3144 return split_substring(self,list,substring,maxcount);
3145}
3146
3147static
3148PyObject *strip(PyUnicodeObject *self,
3149 int left,
3150 int right)
3151{
3152 Py_UNICODE *p = self->str;
3153 int start = 0;
3154 int end = self->length;
3155
3156 if (left)
3157 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3158 start++;
3159
3160 if (right)
3161 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3162 end--;
3163
3164 if (start == 0 && end == self->length) {
3165 /* couldn't strip anything off, return original string */
3166 Py_INCREF(self);
3167 return (PyObject*) self;
3168 }
3169
3170 return (PyObject*) PyUnicode_FromUnicode(
3171 self->str + start,
3172 end - start
3173 );
3174}
3175
3176static
3177PyObject *replace(PyUnicodeObject *self,
3178 PyUnicodeObject *str1,
3179 PyUnicodeObject *str2,
3180 int maxcount)
3181{
3182 PyUnicodeObject *u;
3183
3184 if (maxcount < 0)
3185 maxcount = INT_MAX;
3186
3187 if (str1->length == 1 && str2->length == 1) {
3188 int i;
3189
3190 /* replace characters */
3191 if (!findchar(self->str, self->length, str1->str[0])) {
3192 /* nothing to replace, return original string */
3193 Py_INCREF(self);
3194 u = self;
3195 } else {
3196 Py_UNICODE u1 = str1->str[0];
3197 Py_UNICODE u2 = str2->str[0];
3198
3199 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003200 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 self->length
3202 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003203 if (u != NULL) {
3204 Py_UNICODE_COPY(u->str, self->str,
3205 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 for (i = 0; i < u->length; i++)
3207 if (u->str[i] == u1) {
3208 if (--maxcount < 0)
3209 break;
3210 u->str[i] = u2;
3211 }
3212 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214
3215 } else {
3216 int n, i;
3217 Py_UNICODE *p;
3218
3219 /* replace strings */
3220 n = count(self, 0, self->length, str1);
3221 if (n > maxcount)
3222 n = maxcount;
3223 if (n == 0) {
3224 /* nothing to replace, return original string */
3225 Py_INCREF(self);
3226 u = self;
3227 } else {
3228 u = _PyUnicode_New(
3229 self->length + n * (str2->length - str1->length));
3230 if (u) {
3231 i = 0;
3232 p = u->str;
3233 while (i <= self->length - str1->length)
3234 if (Py_UNICODE_MATCH(self, i, str1)) {
3235 /* replace string segment */
3236 Py_UNICODE_COPY(p, str2->str, str2->length);
3237 p += str2->length;
3238 i += str1->length;
3239 if (--n <= 0) {
3240 /* copy remaining part */
3241 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3242 break;
3243 }
3244 } else
3245 *p++ = self->str[i++];
3246 }
3247 }
3248 }
3249
3250 return (PyObject *) u;
3251}
3252
3253/* --- Unicode Object Methods --------------------------------------------- */
3254
3255static char title__doc__[] =
3256"S.title() -> unicode\n\
3257\n\
3258Return a titlecased version of S, i.e. words start with title case\n\
3259characters, all remaining cased characters have lower case.";
3260
3261static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003262unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 return fixup(self, fixtitle);
3265}
3266
3267static char capitalize__doc__[] =
3268"S.capitalize() -> unicode\n\
3269\n\
3270Return a capitalized version of S, i.e. make the first character\n\
3271have upper case.";
3272
3273static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003274unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 return fixup(self, fixcapitalize);
3277}
3278
3279#if 0
3280static char capwords__doc__[] =
3281"S.capwords() -> unicode\n\
3282\n\
3283Apply .capitalize() to all words in S and return the result with\n\
3284normalized whitespace (all whitespace strings are replaced by ' ').";
3285
3286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003287unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288{
3289 PyObject *list;
3290 PyObject *item;
3291 int i;
3292
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 /* Split into words */
3294 list = split(self, NULL, -1);
3295 if (!list)
3296 return NULL;
3297
3298 /* Capitalize each word */
3299 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3300 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3301 fixcapitalize);
3302 if (item == NULL)
3303 goto onError;
3304 Py_DECREF(PyList_GET_ITEM(list, i));
3305 PyList_SET_ITEM(list, i, item);
3306 }
3307
3308 /* Join the words to form a new string */
3309 item = PyUnicode_Join(NULL, list);
3310
3311onError:
3312 Py_DECREF(list);
3313 return (PyObject *)item;
3314}
3315#endif
3316
3317static char center__doc__[] =
3318"S.center(width) -> unicode\n\
3319\n\
3320Return S centered in a Unicode string of length width. Padding is done\n\
3321using spaces.";
3322
3323static PyObject *
3324unicode_center(PyUnicodeObject *self, PyObject *args)
3325{
3326 int marg, left;
3327 int width;
3328
3329 if (!PyArg_ParseTuple(args, "i:center", &width))
3330 return NULL;
3331
3332 if (self->length >= width) {
3333 Py_INCREF(self);
3334 return (PyObject*) self;
3335 }
3336
3337 marg = width - self->length;
3338 left = marg / 2 + (marg & width & 1);
3339
3340 return (PyObject*) pad(self, left, marg - left, ' ');
3341}
3342
Marc-André Lemburge5034372000-08-08 08:04:29 +00003343#if 0
3344
3345/* This code should go into some future Unicode collation support
3346 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003347 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003348
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003349/* speedy UTF-16 code point order comparison */
3350/* gleaned from: */
3351/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003353static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003354{
3355 0, 0, 0, 0, 0, 0, 0, 0,
3356 0, 0, 0, 0, 0, 0, 0, 0,
3357 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003358 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003359};
3360
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361static int
3362unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3363{
3364 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003365
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 Py_UNICODE *s1 = str1->str;
3367 Py_UNICODE *s2 = str2->str;
3368
3369 len1 = str1->length;
3370 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003371
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003373 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003374
3375 c1 = *s1++;
3376 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003377
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003378 if (c1 > (1<<11) * 26)
3379 c1 += utf16Fixup[c1>>11];
3380 if (c2 > (1<<11) * 26)
3381 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003382 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003383
3384 if (c1 != c2)
3385 return (c1 < c2) ? -1 : 1;
3386
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003387 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388 }
3389
3390 return (len1 < len2) ? -1 : (len1 != len2);
3391}
3392
Marc-André Lemburge5034372000-08-08 08:04:29 +00003393#else
3394
3395static int
3396unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3397{
3398 register int len1, len2;
3399
3400 Py_UNICODE *s1 = str1->str;
3401 Py_UNICODE *s2 = str2->str;
3402
3403 len1 = str1->length;
3404 len2 = str2->length;
3405
3406 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003407 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003408
Fredrik Lundh45714e92001-06-26 16:39:36 +00003409 c1 = *s1++;
3410 c2 = *s2++;
3411
3412 if (c1 != c2)
3413 return (c1 < c2) ? -1 : 1;
3414
Marc-André Lemburge5034372000-08-08 08:04:29 +00003415 len1--; len2--;
3416 }
3417
3418 return (len1 < len2) ? -1 : (len1 != len2);
3419}
3420
3421#endif
3422
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423int PyUnicode_Compare(PyObject *left,
3424 PyObject *right)
3425{
3426 PyUnicodeObject *u = NULL, *v = NULL;
3427 int result;
3428
3429 /* Coerce the two arguments */
3430 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3431 if (u == NULL)
3432 goto onError;
3433 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3434 if (v == NULL)
3435 goto onError;
3436
Thomas Wouters7e474022000-07-16 12:04:32 +00003437 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 if (v == u) {
3439 Py_DECREF(u);
3440 Py_DECREF(v);
3441 return 0;
3442 }
3443
3444 result = unicode_compare(u, v);
3445
3446 Py_DECREF(u);
3447 Py_DECREF(v);
3448 return result;
3449
3450onError:
3451 Py_XDECREF(u);
3452 Py_XDECREF(v);
3453 return -1;
3454}
3455
Guido van Rossum403d68b2000-03-13 15:55:09 +00003456int PyUnicode_Contains(PyObject *container,
3457 PyObject *element)
3458{
3459 PyUnicodeObject *u = NULL, *v = NULL;
3460 int result;
3461 register const Py_UNICODE *p, *e;
3462 register Py_UNICODE ch;
3463
3464 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003465 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003466 if (v == NULL) {
3467 PyErr_SetString(PyExc_TypeError,
3468 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003469 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003470 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003471 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3472 if (u == NULL) {
3473 Py_DECREF(v);
3474 goto onError;
3475 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003476
3477 /* Check v in u */
3478 if (PyUnicode_GET_SIZE(v) != 1) {
3479 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003480 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003481 goto onError;
3482 }
3483 ch = *PyUnicode_AS_UNICODE(v);
3484 p = PyUnicode_AS_UNICODE(u);
3485 e = p + PyUnicode_GET_SIZE(u);
3486 result = 0;
3487 while (p < e) {
3488 if (*p++ == ch) {
3489 result = 1;
3490 break;
3491 }
3492 }
3493
3494 Py_DECREF(u);
3495 Py_DECREF(v);
3496 return result;
3497
3498onError:
3499 Py_XDECREF(u);
3500 Py_XDECREF(v);
3501 return -1;
3502}
3503
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504/* Concat to string or Unicode object giving a new Unicode object. */
3505
3506PyObject *PyUnicode_Concat(PyObject *left,
3507 PyObject *right)
3508{
3509 PyUnicodeObject *u = NULL, *v = NULL, *w;
3510
3511 /* Coerce the two arguments */
3512 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3513 if (u == NULL)
3514 goto onError;
3515 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3516 if (v == NULL)
3517 goto onError;
3518
3519 /* Shortcuts */
3520 if (v == unicode_empty) {
3521 Py_DECREF(v);
3522 return (PyObject *)u;
3523 }
3524 if (u == unicode_empty) {
3525 Py_DECREF(u);
3526 return (PyObject *)v;
3527 }
3528
3529 /* Concat the two Unicode strings */
3530 w = _PyUnicode_New(u->length + v->length);
3531 if (w == NULL)
3532 goto onError;
3533 Py_UNICODE_COPY(w->str, u->str, u->length);
3534 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3535
3536 Py_DECREF(u);
3537 Py_DECREF(v);
3538 return (PyObject *)w;
3539
3540onError:
3541 Py_XDECREF(u);
3542 Py_XDECREF(v);
3543 return NULL;
3544}
3545
3546static char count__doc__[] =
3547"S.count(sub[, start[, end]]) -> int\n\
3548\n\
3549Return the number of occurrences of substring sub in Unicode string\n\
3550S[start:end]. Optional arguments start and end are\n\
3551interpreted as in slice notation.";
3552
3553static PyObject *
3554unicode_count(PyUnicodeObject *self, PyObject *args)
3555{
3556 PyUnicodeObject *substring;
3557 int start = 0;
3558 int end = INT_MAX;
3559 PyObject *result;
3560
Guido van Rossumb8872e62000-05-09 14:14:27 +00003561 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3562 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 return NULL;
3564
3565 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3566 (PyObject *)substring);
3567 if (substring == NULL)
3568 return NULL;
3569
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 if (start < 0)
3571 start += self->length;
3572 if (start < 0)
3573 start = 0;
3574 if (end > self->length)
3575 end = self->length;
3576 if (end < 0)
3577 end += self->length;
3578 if (end < 0)
3579 end = 0;
3580
3581 result = PyInt_FromLong((long) count(self, start, end, substring));
3582
3583 Py_DECREF(substring);
3584 return result;
3585}
3586
3587static char encode__doc__[] =
3588"S.encode([encoding[,errors]]) -> string\n\
3589\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003590Return an encoded string version of S. Default encoding is the current\n\
3591default string encoding. errors may be given to set a different error\n\
3592handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3593a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594
3595static PyObject *
3596unicode_encode(PyUnicodeObject *self, PyObject *args)
3597{
3598 char *encoding = NULL;
3599 char *errors = NULL;
3600 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3601 return NULL;
3602 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3603}
3604
3605static char expandtabs__doc__[] =
3606"S.expandtabs([tabsize]) -> unicode\n\
3607\n\
3608Return a copy of S where all tab characters are expanded using spaces.\n\
3609If tabsize is not given, a tab size of 8 characters is assumed.";
3610
3611static PyObject*
3612unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3613{
3614 Py_UNICODE *e;
3615 Py_UNICODE *p;
3616 Py_UNICODE *q;
3617 int i, j;
3618 PyUnicodeObject *u;
3619 int tabsize = 8;
3620
3621 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3622 return NULL;
3623
Thomas Wouters7e474022000-07-16 12:04:32 +00003624 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 i = j = 0;
3626 e = self->str + self->length;
3627 for (p = self->str; p < e; p++)
3628 if (*p == '\t') {
3629 if (tabsize > 0)
3630 j += tabsize - (j % tabsize);
3631 }
3632 else {
3633 j++;
3634 if (*p == '\n' || *p == '\r') {
3635 i += j;
3636 j = 0;
3637 }
3638 }
3639
3640 /* Second pass: create output string and fill it */
3641 u = _PyUnicode_New(i + j);
3642 if (!u)
3643 return NULL;
3644
3645 j = 0;
3646 q = u->str;
3647
3648 for (p = self->str; p < e; p++)
3649 if (*p == '\t') {
3650 if (tabsize > 0) {
3651 i = tabsize - (j % tabsize);
3652 j += i;
3653 while (i--)
3654 *q++ = ' ';
3655 }
3656 }
3657 else {
3658 j++;
3659 *q++ = *p;
3660 if (*p == '\n' || *p == '\r')
3661 j = 0;
3662 }
3663
3664 return (PyObject*) u;
3665}
3666
3667static char find__doc__[] =
3668"S.find(sub [,start [,end]]) -> int\n\
3669\n\
3670Return the lowest index in S where substring sub is found,\n\
3671such that sub is contained within s[start,end]. Optional\n\
3672arguments start and end are interpreted as in slice notation.\n\
3673\n\
3674Return -1 on failure.";
3675
3676static PyObject *
3677unicode_find(PyUnicodeObject *self, PyObject *args)
3678{
3679 PyUnicodeObject *substring;
3680 int start = 0;
3681 int end = INT_MAX;
3682 PyObject *result;
3683
Guido van Rossumb8872e62000-05-09 14:14:27 +00003684 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3685 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return NULL;
3687 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3688 (PyObject *)substring);
3689 if (substring == NULL)
3690 return NULL;
3691
3692 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3693
3694 Py_DECREF(substring);
3695 return result;
3696}
3697
3698static PyObject *
3699unicode_getitem(PyUnicodeObject *self, int index)
3700{
3701 if (index < 0 || index >= self->length) {
3702 PyErr_SetString(PyExc_IndexError, "string index out of range");
3703 return NULL;
3704 }
3705
3706 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3707}
3708
3709static long
3710unicode_hash(PyUnicodeObject *self)
3711{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003712 /* Since Unicode objects compare equal to their ASCII string
3713 counterparts, they should use the individual character values
3714 as basis for their hash value. This is needed to assure that
3715 strings and Unicode objects behave in the same way as
3716 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717
Fredrik Lundhdde61642000-07-10 18:27:47 +00003718 register int len;
3719 register Py_UNICODE *p;
3720 register long x;
3721
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722 if (self->hash != -1)
3723 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003724 len = PyUnicode_GET_SIZE(self);
3725 p = PyUnicode_AS_UNICODE(self);
3726 x = *p << 7;
3727 while (--len >= 0)
3728 x = (1000003*x) ^ *p++;
3729 x ^= PyUnicode_GET_SIZE(self);
3730 if (x == -1)
3731 x = -2;
3732 self->hash = x;
3733 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734}
3735
3736static char index__doc__[] =
3737"S.index(sub [,start [,end]]) -> int\n\
3738\n\
3739Like S.find() but raise ValueError when the substring is not found.";
3740
3741static PyObject *
3742unicode_index(PyUnicodeObject *self, PyObject *args)
3743{
3744 int result;
3745 PyUnicodeObject *substring;
3746 int start = 0;
3747 int end = INT_MAX;
3748
Guido van Rossumb8872e62000-05-09 14:14:27 +00003749 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3750 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 return NULL;
3752
3753 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3754 (PyObject *)substring);
3755 if (substring == NULL)
3756 return NULL;
3757
3758 result = findstring(self, substring, start, end, 1);
3759
3760 Py_DECREF(substring);
3761 if (result < 0) {
3762 PyErr_SetString(PyExc_ValueError, "substring not found");
3763 return NULL;
3764 }
3765 return PyInt_FromLong(result);
3766}
3767
3768static char islower__doc__[] =
3769"S.islower() -> int\n\
3770\n\
3771Return 1 if all cased characters in S are lowercase and there is\n\
3772at least one cased character in S, 0 otherwise.";
3773
3774static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003775unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776{
3777 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3778 register const Py_UNICODE *e;
3779 int cased;
3780
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 /* Shortcut for single character strings */
3782 if (PyUnicode_GET_SIZE(self) == 1)
3783 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3784
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003785 /* Special case for empty strings */
3786 if (PyString_GET_SIZE(self) == 0)
3787 return PyInt_FromLong(0);
3788
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 e = p + PyUnicode_GET_SIZE(self);
3790 cased = 0;
3791 for (; p < e; p++) {
3792 register const Py_UNICODE ch = *p;
3793
3794 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3795 return PyInt_FromLong(0);
3796 else if (!cased && Py_UNICODE_ISLOWER(ch))
3797 cased = 1;
3798 }
3799 return PyInt_FromLong(cased);
3800}
3801
3802static char isupper__doc__[] =
3803"S.isupper() -> int\n\
3804\n\
3805Return 1 if all cased characters in S are uppercase and there is\n\
3806at least one cased character in S, 0 otherwise.";
3807
3808static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003809unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810{
3811 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3812 register const Py_UNICODE *e;
3813 int cased;
3814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 /* Shortcut for single character strings */
3816 if (PyUnicode_GET_SIZE(self) == 1)
3817 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3818
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003819 /* Special case for empty strings */
3820 if (PyString_GET_SIZE(self) == 0)
3821 return PyInt_FromLong(0);
3822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 e = p + PyUnicode_GET_SIZE(self);
3824 cased = 0;
3825 for (; p < e; p++) {
3826 register const Py_UNICODE ch = *p;
3827
3828 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3829 return PyInt_FromLong(0);
3830 else if (!cased && Py_UNICODE_ISUPPER(ch))
3831 cased = 1;
3832 }
3833 return PyInt_FromLong(cased);
3834}
3835
3836static char istitle__doc__[] =
3837"S.istitle() -> int\n\
3838\n\
3839Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3840may only follow uncased characters and lowercase characters only cased\n\
3841ones. Return 0 otherwise.";
3842
3843static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003844unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845{
3846 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3847 register const Py_UNICODE *e;
3848 int cased, previous_is_cased;
3849
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 /* Shortcut for single character strings */
3851 if (PyUnicode_GET_SIZE(self) == 1)
3852 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3853 (Py_UNICODE_ISUPPER(*p) != 0));
3854
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003855 /* Special case for empty strings */
3856 if (PyString_GET_SIZE(self) == 0)
3857 return PyInt_FromLong(0);
3858
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 e = p + PyUnicode_GET_SIZE(self);
3860 cased = 0;
3861 previous_is_cased = 0;
3862 for (; p < e; p++) {
3863 register const Py_UNICODE ch = *p;
3864
3865 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3866 if (previous_is_cased)
3867 return PyInt_FromLong(0);
3868 previous_is_cased = 1;
3869 cased = 1;
3870 }
3871 else if (Py_UNICODE_ISLOWER(ch)) {
3872 if (!previous_is_cased)
3873 return PyInt_FromLong(0);
3874 previous_is_cased = 1;
3875 cased = 1;
3876 }
3877 else
3878 previous_is_cased = 0;
3879 }
3880 return PyInt_FromLong(cased);
3881}
3882
3883static char isspace__doc__[] =
3884"S.isspace() -> int\n\
3885\n\
3886Return 1 if there are only whitespace characters in S,\n\
38870 otherwise.";
3888
3889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003890unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891{
3892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3893 register const Py_UNICODE *e;
3894
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 /* Shortcut for single character strings */
3896 if (PyUnicode_GET_SIZE(self) == 1 &&
3897 Py_UNICODE_ISSPACE(*p))
3898 return PyInt_FromLong(1);
3899
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003900 /* Special case for empty strings */
3901 if (PyString_GET_SIZE(self) == 0)
3902 return PyInt_FromLong(0);
3903
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 e = p + PyUnicode_GET_SIZE(self);
3905 for (; p < e; p++) {
3906 if (!Py_UNICODE_ISSPACE(*p))
3907 return PyInt_FromLong(0);
3908 }
3909 return PyInt_FromLong(1);
3910}
3911
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003912static char isalpha__doc__[] =
3913"S.isalpha() -> int\n\
3914\n\
3915Return 1 if all characters in S are alphabetic\n\
3916and there is at least one character in S, 0 otherwise.";
3917
3918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003919unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003920{
3921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3922 register const Py_UNICODE *e;
3923
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003924 /* Shortcut for single character strings */
3925 if (PyUnicode_GET_SIZE(self) == 1 &&
3926 Py_UNICODE_ISALPHA(*p))
3927 return PyInt_FromLong(1);
3928
3929 /* Special case for empty strings */
3930 if (PyString_GET_SIZE(self) == 0)
3931 return PyInt_FromLong(0);
3932
3933 e = p + PyUnicode_GET_SIZE(self);
3934 for (; p < e; p++) {
3935 if (!Py_UNICODE_ISALPHA(*p))
3936 return PyInt_FromLong(0);
3937 }
3938 return PyInt_FromLong(1);
3939}
3940
3941static char isalnum__doc__[] =
3942"S.isalnum() -> int\n\
3943\n\
3944Return 1 if all characters in S are alphanumeric\n\
3945and there is at least one character in S, 0 otherwise.";
3946
3947static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003948unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003949{
3950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3951 register const Py_UNICODE *e;
3952
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003953 /* Shortcut for single character strings */
3954 if (PyUnicode_GET_SIZE(self) == 1 &&
3955 Py_UNICODE_ISALNUM(*p))
3956 return PyInt_FromLong(1);
3957
3958 /* Special case for empty strings */
3959 if (PyString_GET_SIZE(self) == 0)
3960 return PyInt_FromLong(0);
3961
3962 e = p + PyUnicode_GET_SIZE(self);
3963 for (; p < e; p++) {
3964 if (!Py_UNICODE_ISALNUM(*p))
3965 return PyInt_FromLong(0);
3966 }
3967 return PyInt_FromLong(1);
3968}
3969
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970static char isdecimal__doc__[] =
3971"S.isdecimal() -> int\n\
3972\n\
3973Return 1 if there are only decimal characters in S,\n\
39740 otherwise.";
3975
3976static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003977unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978{
3979 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3980 register const Py_UNICODE *e;
3981
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 /* Shortcut for single character strings */
3983 if (PyUnicode_GET_SIZE(self) == 1 &&
3984 Py_UNICODE_ISDECIMAL(*p))
3985 return PyInt_FromLong(1);
3986
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003987 /* Special case for empty strings */
3988 if (PyString_GET_SIZE(self) == 0)
3989 return PyInt_FromLong(0);
3990
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 e = p + PyUnicode_GET_SIZE(self);
3992 for (; p < e; p++) {
3993 if (!Py_UNICODE_ISDECIMAL(*p))
3994 return PyInt_FromLong(0);
3995 }
3996 return PyInt_FromLong(1);
3997}
3998
3999static char isdigit__doc__[] =
4000"S.isdigit() -> int\n\
4001\n\
4002Return 1 if there are only digit characters in S,\n\
40030 otherwise.";
4004
4005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004006unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007{
4008 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4009 register const Py_UNICODE *e;
4010
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 /* Shortcut for single character strings */
4012 if (PyUnicode_GET_SIZE(self) == 1 &&
4013 Py_UNICODE_ISDIGIT(*p))
4014 return PyInt_FromLong(1);
4015
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004016 /* Special case for empty strings */
4017 if (PyString_GET_SIZE(self) == 0)
4018 return PyInt_FromLong(0);
4019
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 e = p + PyUnicode_GET_SIZE(self);
4021 for (; p < e; p++) {
4022 if (!Py_UNICODE_ISDIGIT(*p))
4023 return PyInt_FromLong(0);
4024 }
4025 return PyInt_FromLong(1);
4026}
4027
4028static char isnumeric__doc__[] =
4029"S.isnumeric() -> int\n\
4030\n\
4031Return 1 if there are only numeric characters in S,\n\
40320 otherwise.";
4033
4034static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004035unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036{
4037 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4038 register const Py_UNICODE *e;
4039
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 /* Shortcut for single character strings */
4041 if (PyUnicode_GET_SIZE(self) == 1 &&
4042 Py_UNICODE_ISNUMERIC(*p))
4043 return PyInt_FromLong(1);
4044
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004045 /* Special case for empty strings */
4046 if (PyString_GET_SIZE(self) == 0)
4047 return PyInt_FromLong(0);
4048
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049 e = p + PyUnicode_GET_SIZE(self);
4050 for (; p < e; p++) {
4051 if (!Py_UNICODE_ISNUMERIC(*p))
4052 return PyInt_FromLong(0);
4053 }
4054 return PyInt_FromLong(1);
4055}
4056
4057static char join__doc__[] =
4058"S.join(sequence) -> unicode\n\
4059\n\
4060Return a string which is the concatenation of the strings in the\n\
4061sequence. The separator between elements is S.";
4062
4063static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004064unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004066 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067}
4068
4069static int
4070unicode_length(PyUnicodeObject *self)
4071{
4072 return self->length;
4073}
4074
4075static char ljust__doc__[] =
4076"S.ljust(width) -> unicode\n\
4077\n\
4078Return S left justified in a Unicode string of length width. Padding is\n\
4079done using spaces.";
4080
4081static PyObject *
4082unicode_ljust(PyUnicodeObject *self, PyObject *args)
4083{
4084 int width;
4085 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4086 return NULL;
4087
4088 if (self->length >= width) {
4089 Py_INCREF(self);
4090 return (PyObject*) self;
4091 }
4092
4093 return (PyObject*) pad(self, 0, width - self->length, ' ');
4094}
4095
4096static char lower__doc__[] =
4097"S.lower() -> unicode\n\
4098\n\
4099Return a copy of the string S converted to lowercase.";
4100
4101static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004102unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 return fixup(self, fixlower);
4105}
4106
4107static char lstrip__doc__[] =
4108"S.lstrip() -> unicode\n\
4109\n\
4110Return a copy of the string S with leading whitespace removed.";
4111
4112static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004113unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 return strip(self, 1, 0);
4116}
4117
4118static PyObject*
4119unicode_repeat(PyUnicodeObject *str, int len)
4120{
4121 PyUnicodeObject *u;
4122 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004123 int nchars;
4124 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
4126 if (len < 0)
4127 len = 0;
4128
4129 if (len == 1) {
4130 /* no repeat, return original string */
4131 Py_INCREF(str);
4132 return (PyObject*) str;
4133 }
Tim Peters8f422462000-09-09 06:13:41 +00004134
4135 /* ensure # of chars needed doesn't overflow int and # of bytes
4136 * needed doesn't overflow size_t
4137 */
4138 nchars = len * str->length;
4139 if (len && nchars / len != str->length) {
4140 PyErr_SetString(PyExc_OverflowError,
4141 "repeated string is too long");
4142 return NULL;
4143 }
4144 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4145 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4146 PyErr_SetString(PyExc_OverflowError,
4147 "repeated string is too long");
4148 return NULL;
4149 }
4150 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 if (!u)
4152 return NULL;
4153
4154 p = u->str;
4155
4156 while (len-- > 0) {
4157 Py_UNICODE_COPY(p, str->str, str->length);
4158 p += str->length;
4159 }
4160
4161 return (PyObject*) u;
4162}
4163
4164PyObject *PyUnicode_Replace(PyObject *obj,
4165 PyObject *subobj,
4166 PyObject *replobj,
4167 int maxcount)
4168{
4169 PyObject *self;
4170 PyObject *str1;
4171 PyObject *str2;
4172 PyObject *result;
4173
4174 self = PyUnicode_FromObject(obj);
4175 if (self == NULL)
4176 return NULL;
4177 str1 = PyUnicode_FromObject(subobj);
4178 if (str1 == NULL) {
4179 Py_DECREF(self);
4180 return NULL;
4181 }
4182 str2 = PyUnicode_FromObject(replobj);
4183 if (str2 == NULL) {
4184 Py_DECREF(self);
4185 Py_DECREF(str1);
4186 return NULL;
4187 }
4188 result = replace((PyUnicodeObject *)self,
4189 (PyUnicodeObject *)str1,
4190 (PyUnicodeObject *)str2,
4191 maxcount);
4192 Py_DECREF(self);
4193 Py_DECREF(str1);
4194 Py_DECREF(str2);
4195 return result;
4196}
4197
4198static char replace__doc__[] =
4199"S.replace (old, new[, maxsplit]) -> unicode\n\
4200\n\
4201Return a copy of S with all occurrences of substring\n\
4202old replaced by new. If the optional argument maxsplit is\n\
4203given, only the first maxsplit occurrences are replaced.";
4204
4205static PyObject*
4206unicode_replace(PyUnicodeObject *self, PyObject *args)
4207{
4208 PyUnicodeObject *str1;
4209 PyUnicodeObject *str2;
4210 int maxcount = -1;
4211 PyObject *result;
4212
4213 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4214 return NULL;
4215 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4216 if (str1 == NULL)
4217 return NULL;
4218 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4219 if (str2 == NULL)
4220 return NULL;
4221
4222 result = replace(self, str1, str2, maxcount);
4223
4224 Py_DECREF(str1);
4225 Py_DECREF(str2);
4226 return result;
4227}
4228
4229static
4230PyObject *unicode_repr(PyObject *unicode)
4231{
4232 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4233 PyUnicode_GET_SIZE(unicode),
4234 1);
4235}
4236
4237static char rfind__doc__[] =
4238"S.rfind(sub [,start [,end]]) -> int\n\
4239\n\
4240Return the highest index in S where substring sub is found,\n\
4241such that sub is contained within s[start,end]. Optional\n\
4242arguments start and end are interpreted as in slice notation.\n\
4243\n\
4244Return -1 on failure.";
4245
4246static PyObject *
4247unicode_rfind(PyUnicodeObject *self, PyObject *args)
4248{
4249 PyUnicodeObject *substring;
4250 int start = 0;
4251 int end = INT_MAX;
4252 PyObject *result;
4253
Guido van Rossumb8872e62000-05-09 14:14:27 +00004254 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4255 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 return NULL;
4257 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4258 (PyObject *)substring);
4259 if (substring == NULL)
4260 return NULL;
4261
4262 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4263
4264 Py_DECREF(substring);
4265 return result;
4266}
4267
4268static char rindex__doc__[] =
4269"S.rindex(sub [,start [,end]]) -> int\n\
4270\n\
4271Like S.rfind() but raise ValueError when the substring is not found.";
4272
4273static PyObject *
4274unicode_rindex(PyUnicodeObject *self, PyObject *args)
4275{
4276 int result;
4277 PyUnicodeObject *substring;
4278 int start = 0;
4279 int end = INT_MAX;
4280
Guido van Rossumb8872e62000-05-09 14:14:27 +00004281 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4282 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 return NULL;
4284 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4285 (PyObject *)substring);
4286 if (substring == NULL)
4287 return NULL;
4288
4289 result = findstring(self, substring, start, end, -1);
4290
4291 Py_DECREF(substring);
4292 if (result < 0) {
4293 PyErr_SetString(PyExc_ValueError, "substring not found");
4294 return NULL;
4295 }
4296 return PyInt_FromLong(result);
4297}
4298
4299static char rjust__doc__[] =
4300"S.rjust(width) -> unicode\n\
4301\n\
4302Return S right justified in a Unicode string of length width. Padding is\n\
4303done using spaces.";
4304
4305static PyObject *
4306unicode_rjust(PyUnicodeObject *self, PyObject *args)
4307{
4308 int width;
4309 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4310 return NULL;
4311
4312 if (self->length >= width) {
4313 Py_INCREF(self);
4314 return (PyObject*) self;
4315 }
4316
4317 return (PyObject*) pad(self, width - self->length, 0, ' ');
4318}
4319
4320static char rstrip__doc__[] =
4321"S.rstrip() -> unicode\n\
4322\n\
4323Return a copy of the string S with trailing whitespace removed.";
4324
4325static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004326unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 return strip(self, 0, 1);
4329}
4330
4331static PyObject*
4332unicode_slice(PyUnicodeObject *self, int start, int end)
4333{
4334 /* standard clamping */
4335 if (start < 0)
4336 start = 0;
4337 if (end < 0)
4338 end = 0;
4339 if (end > self->length)
4340 end = self->length;
4341 if (start == 0 && end == self->length) {
4342 /* full slice, return original string */
4343 Py_INCREF(self);
4344 return (PyObject*) self;
4345 }
4346 if (start > end)
4347 start = end;
4348 /* copy slice */
4349 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4350 end - start);
4351}
4352
4353PyObject *PyUnicode_Split(PyObject *s,
4354 PyObject *sep,
4355 int maxsplit)
4356{
4357 PyObject *result;
4358
4359 s = PyUnicode_FromObject(s);
4360 if (s == NULL)
4361 return NULL;
4362 if (sep != NULL) {
4363 sep = PyUnicode_FromObject(sep);
4364 if (sep == NULL) {
4365 Py_DECREF(s);
4366 return NULL;
4367 }
4368 }
4369
4370 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4371
4372 Py_DECREF(s);
4373 Py_XDECREF(sep);
4374 return result;
4375}
4376
4377static char split__doc__[] =
4378"S.split([sep [,maxsplit]]) -> list of strings\n\
4379\n\
4380Return a list of the words in S, using sep as the\n\
4381delimiter string. If maxsplit is given, at most maxsplit\n\
4382splits are done. If sep is not specified, any whitespace string\n\
4383is a separator.";
4384
4385static PyObject*
4386unicode_split(PyUnicodeObject *self, PyObject *args)
4387{
4388 PyObject *substring = Py_None;
4389 int maxcount = -1;
4390
4391 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4392 return NULL;
4393
4394 if (substring == Py_None)
4395 return split(self, NULL, maxcount);
4396 else if (PyUnicode_Check(substring))
4397 return split(self, (PyUnicodeObject *)substring, maxcount);
4398 else
4399 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4400}
4401
4402static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004403"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404\n\
4405Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004406Line breaks are not included in the resulting list unless keepends\n\
4407is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408
4409static PyObject*
4410unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4411{
Guido van Rossum86662912000-04-11 15:38:46 +00004412 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413
Guido van Rossum86662912000-04-11 15:38:46 +00004414 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 return NULL;
4416
Guido van Rossum86662912000-04-11 15:38:46 +00004417 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418}
4419
4420static
4421PyObject *unicode_str(PyUnicodeObject *self)
4422{
Fred Drakee4315f52000-05-09 19:53:39 +00004423 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424}
4425
4426static char strip__doc__[] =
4427"S.strip() -> unicode\n\
4428\n\
4429Return a copy of S with leading and trailing whitespace removed.";
4430
4431static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004432unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 return strip(self, 1, 1);
4435}
4436
4437static char swapcase__doc__[] =
4438"S.swapcase() -> unicode\n\
4439\n\
4440Return a copy of S with uppercase characters converted to lowercase\n\
4441and vice versa.";
4442
4443static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004444unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 return fixup(self, fixswapcase);
4447}
4448
4449static char translate__doc__[] =
4450"S.translate(table) -> unicode\n\
4451\n\
4452Return a copy of the string S, where all characters have been mapped\n\
4453through the given translation table, which must be a mapping of\n\
4454Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4455are left untouched. Characters mapped to None are deleted.";
4456
4457static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004458unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 return PyUnicode_TranslateCharmap(self->str,
4461 self->length,
4462 table,
4463 "ignore");
4464}
4465
4466static char upper__doc__[] =
4467"S.upper() -> unicode\n\
4468\n\
4469Return a copy of S converted to uppercase.";
4470
4471static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004472unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 return fixup(self, fixupper);
4475}
4476
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n"
"\n"
"Pad a numeric string x with zeros on the left, to fill a field\n"
"of the specified width. The string x is never truncated.";

/* Implements S.zfill(width) (currently compiled out).
 *
 * When S is already at least width long, S itself is returned with a new
 * reference.  Otherwise S is left-padded with '0' through pad(), and a
 * leading '+' or '-' of the original text is moved to the front of the
 * padded result.  Returns NULL if parsing or pad() fails.
 */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() can fail; do not touch u->str in that case. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4512
#if 0
/* Debugging aid (compiled out): reports the current value of
   unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4520
4521static char startswith__doc__[] =
4522"S.startswith(prefix[, start[, end]]) -> int\n\
4523\n\
4524Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4525optional start, test S beginning at that position. With optional end, stop\n\
4526comparing S at that position.";
4527
4528static PyObject *
4529unicode_startswith(PyUnicodeObject *self,
4530 PyObject *args)
4531{
4532 PyUnicodeObject *substring;
4533 int start = 0;
4534 int end = INT_MAX;
4535 PyObject *result;
4536
Guido van Rossumb8872e62000-05-09 14:14:27 +00004537 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4538 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 return NULL;
4540 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4541 (PyObject *)substring);
4542 if (substring == NULL)
4543 return NULL;
4544
4545 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4546
4547 Py_DECREF(substring);
4548 return result;
4549}
4550
4551
4552static char endswith__doc__[] =
4553"S.endswith(suffix[, start[, end]]) -> int\n\
4554\n\
4555Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4556optional start, test S beginning at that position. With optional end, stop\n\
4557comparing S at that position.";
4558
4559static PyObject *
4560unicode_endswith(PyUnicodeObject *self,
4561 PyObject *args)
4562{
4563 PyUnicodeObject *substring;
4564 int start = 0;
4565 int end = INT_MAX;
4566 PyObject *result;
4567
Guido van Rossumb8872e62000-05-09 14:14:27 +00004568 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4569 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 return NULL;
4571 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4572 (PyObject *)substring);
4573 if (substring == NULL)
4574 return NULL;
4575
4576 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4577
4578 Py_DECREF(substring);
4579 return result;
4580}
4581
4582
4583static PyMethodDef unicode_methods[] = {
4584
4585 /* Order is according to common usage: often used methods should
4586 appear first, since lookup is done sequentially. */
4587
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004588 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4589 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4590 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4591 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4592 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4593 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4594 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4595 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4596 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4597 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4598 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4599 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4600 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4601 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4602/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4603 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4604 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4605 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4606 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4607 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4608 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4609 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4610 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4611 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4612 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4613 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4614 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4615 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4616 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4617 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4618 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4619 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4620 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4621 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4622 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004624 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4625 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626#endif
4627
4628#if 0
4629 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004630 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631#endif
4632
4633 {NULL, NULL}
4634};
4635
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636static PySequenceMethods unicode_as_sequence = {
4637 (inquiry) unicode_length, /* sq_length */
4638 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4639 (intargfunc) unicode_repeat, /* sq_repeat */
4640 (intargfunc) unicode_getitem, /* sq_item */
4641 (intintargfunc) unicode_slice, /* sq_slice */
4642 0, /* sq_ass_item */
4643 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004644 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645};
4646
4647static int
4648unicode_buffer_getreadbuf(PyUnicodeObject *self,
4649 int index,
4650 const void **ptr)
4651{
4652 if (index != 0) {
4653 PyErr_SetString(PyExc_SystemError,
4654 "accessing non-existent unicode segment");
4655 return -1;
4656 }
4657 *ptr = (void *) self->str;
4658 return PyUnicode_GET_DATA_SIZE(self);
4659}
4660
4661static int
4662unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4663 const void **ptr)
4664{
4665 PyErr_SetString(PyExc_TypeError,
4666 "cannot use unicode as modifyable buffer");
4667 return -1;
4668}
4669
4670static int
4671unicode_buffer_getsegcount(PyUnicodeObject *self,
4672 int *lenp)
4673{
4674 if (lenp)
4675 *lenp = PyUnicode_GET_DATA_SIZE(self);
4676 return 1;
4677}
4678
4679static int
4680unicode_buffer_getcharbuf(PyUnicodeObject *self,
4681 int index,
4682 const void **ptr)
4683{
4684 PyObject *str;
4685
4686 if (index != 0) {
4687 PyErr_SetString(PyExc_SystemError,
4688 "accessing non-existent unicode segment");
4689 return -1;
4690 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004691 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 if (str == NULL)
4693 return -1;
4694 *ptr = (void *) PyString_AS_STRING(str);
4695 return PyString_GET_SIZE(str);
4696}
4697
4698/* Helpers for PyUnicode_Format() */
4699
4700static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004701getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702{
4703 int argidx = *p_argidx;
4704 if (argidx < arglen) {
4705 (*p_argidx)++;
4706 if (arglen < 0)
4707 return args;
4708 else
4709 return PyTuple_GetItem(args, argidx);
4710 }
4711 PyErr_SetString(PyExc_TypeError,
4712 "not enough arguments for format string");
4713 return NULL;
4714}
4715
/* Bit flags describing a conversion specification.  Names suggest the
   printf-style flag characters '-', '+', ' ', '#' and '0' (only F_ALT's
   link to '#' is visible in this part of the file). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
4721
4722static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724{
4725 register int i;
4726 int len;
4727 va_list va;
4728 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730
4731 /* First, format the string as char array, then expand to Py_UNICODE
4732 array. */
4733 charbuffer = (char *)buffer;
4734 len = vsprintf(charbuffer, format, va);
4735 for (i = len - 1; i >= 0; i--)
4736 buffer[i] = (Py_UNICODE) charbuffer[i];
4737
4738 va_end(va);
4739 return len;
4740}
4741
4742static int
4743formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004744 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 int flags,
4746 int prec,
4747 int type,
4748 PyObject *v)
4749{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004750 /* fmt = '%#.' + `prec` + `type`
4751 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 char fmt[20];
4753 double x;
4754
4755 x = PyFloat_AsDouble(v);
4756 if (x == -1.0 && PyErr_Occurred())
4757 return -1;
4758 if (prec < 0)
4759 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4761 type = 'g';
4762 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004763 /* worst case length calc to ensure no buffer overrun:
4764 fmt = %#.<prec>g
4765 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4766 for any double rep.)
4767 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4768 If prec=0 the effective precision is 1 (the leading digit is
4769 always given), therefore increase by one to 10+prec. */
4770 if (buflen <= (size_t)10 + (size_t)prec) {
4771 PyErr_SetString(PyExc_OverflowError,
4772 "formatted float is too long (precision too long?)");
4773 return -1;
4774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 return usprintf(buf, fmt, x);
4776}
4777
Tim Peters38fd5b62000-09-21 05:43:11 +00004778static PyObject*
4779formatlong(PyObject *val, int flags, int prec, int type)
4780{
4781 char *buf;
4782 int i, len;
4783 PyObject *str; /* temporary string object. */
4784 PyUnicodeObject *result;
4785
4786 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4787 if (!str)
4788 return NULL;
4789 result = _PyUnicode_New(len);
4790 for (i = 0; i < len; i++)
4791 result->str[i] = buf[i];
4792 result->str[len] = 0;
4793 Py_DECREF(str);
4794 return (PyObject*)result;
4795}
4796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797static int
4798formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004799 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 int flags,
4801 int prec,
4802 int type,
4803 PyObject *v)
4804{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004805 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004806 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4807 + 1 + 1 = 24*/
4808 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004810 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811
4812 x = PyInt_AsLong(v);
4813 if (x == -1 && PyErr_Occurred())
4814 return -1;
4815 if (prec < 0)
4816 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004817 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4818 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4819 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4820 PyErr_SetString(PyExc_OverflowError,
4821 "formatted integer is too long (precision too long?)");
4822 return -1;
4823 }
Tim Petersfff53252001-04-12 18:38:48 +00004824 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4825 * but we want it (for consistency with other %#x conversions, and
4826 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004827 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4828 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4829 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004830 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004831 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4832 /* Only way to know what the platform does is to try it. */
4833 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4834 if (fmt[1] != (char)type) {
4835 /* Supply our own leading 0x/0X -- needed under std C */
4836 use_native_c_format = 0;
4837 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4838 }
4839 }
4840 if (use_native_c_format)
4841 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 return usprintf(buf, fmt, x);
4843}
4844
4845static int
4846formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004847 size_t buflen,
4848 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004850 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004851 if (PyUnicode_Check(v)) {
4852 if (PyUnicode_GET_SIZE(v) != 1)
4853 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004857 else if (PyString_Check(v)) {
4858 if (PyString_GET_SIZE(v) != 1)
4859 goto onError;
4860 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862
4863 else {
4864 /* Integer input truncated to a character */
4865 long x;
4866 x = PyInt_AsLong(v);
4867 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 buf[0] = (char) x;
4870 }
4871 buf[1] = '\0';
4872 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004873
4874 onError:
4875 PyErr_SetString(PyExc_TypeError,
4876 "%c requires int or char");
4877 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878}
4879
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer in which floats, ints and chars are formatted.  XXX This is a
   magic number.  The formatting routines are handed the buffer length
   so they can refuse output that would not fit (formatchar() merely
   presumes room for two elements); a better solution may be to malloc
   a buffer of appropriate size for each format.  For now, the current
   solution is sufficient.

   The expansion is fully parenthesised so the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
4889
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890PyObject *PyUnicode_Format(PyObject *format,
4891 PyObject *args)
4892{
4893 Py_UNICODE *fmt, *res;
4894 int fmtcnt, rescnt, reslen, arglen, argidx;
4895 int args_owned = 0;
4896 PyUnicodeObject *result = NULL;
4897 PyObject *dict = NULL;
4898 PyObject *uformat;
4899
4900 if (format == NULL || args == NULL) {
4901 PyErr_BadInternalCall();
4902 return NULL;
4903 }
4904 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004905 if (uformat == NULL)
4906 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 fmt = PyUnicode_AS_UNICODE(uformat);
4908 fmtcnt = PyUnicode_GET_SIZE(uformat);
4909
4910 reslen = rescnt = fmtcnt + 100;
4911 result = _PyUnicode_New(reslen);
4912 if (result == NULL)
4913 goto onError;
4914 res = PyUnicode_AS_UNICODE(result);
4915
4916 if (PyTuple_Check(args)) {
4917 arglen = PyTuple_Size(args);
4918 argidx = 0;
4919 }
4920 else {
4921 arglen = -1;
4922 argidx = -2;
4923 }
4924 if (args->ob_type->tp_as_mapping)
4925 dict = args;
4926
4927 while (--fmtcnt >= 0) {
4928 if (*fmt != '%') {
4929 if (--rescnt < 0) {
4930 rescnt = fmtcnt + 100;
4931 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004932 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 return NULL;
4934 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4935 --rescnt;
4936 }
4937 *res++ = *fmt++;
4938 }
4939 else {
4940 /* Got a format specifier */
4941 int flags = 0;
4942 int width = -1;
4943 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944 Py_UNICODE c = '\0';
4945 Py_UNICODE fill;
4946 PyObject *v = NULL;
4947 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004948 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949 Py_UNICODE sign;
4950 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004951 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952
4953 fmt++;
4954 if (*fmt == '(') {
4955 Py_UNICODE *keystart;
4956 int keylen;
4957 PyObject *key;
4958 int pcount = 1;
4959
4960 if (dict == NULL) {
4961 PyErr_SetString(PyExc_TypeError,
4962 "format requires a mapping");
4963 goto onError;
4964 }
4965 ++fmt;
4966 --fmtcnt;
4967 keystart = fmt;
4968 /* Skip over balanced parentheses */
4969 while (pcount > 0 && --fmtcnt >= 0) {
4970 if (*fmt == ')')
4971 --pcount;
4972 else if (*fmt == '(')
4973 ++pcount;
4974 fmt++;
4975 }
4976 keylen = fmt - keystart - 1;
4977 if (fmtcnt < 0 || pcount > 0) {
4978 PyErr_SetString(PyExc_ValueError,
4979 "incomplete format key");
4980 goto onError;
4981 }
Fred Drakee4315f52000-05-09 19:53:39 +00004982 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 then looked up since Python uses strings to hold
4984 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004985 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 key = PyUnicode_EncodeUTF8(keystart,
4987 keylen,
4988 NULL);
4989 if (key == NULL)
4990 goto onError;
4991 if (args_owned) {
4992 Py_DECREF(args);
4993 args_owned = 0;
4994 }
4995 args = PyObject_GetItem(dict, key);
4996 Py_DECREF(key);
4997 if (args == NULL) {
4998 goto onError;
4999 }
5000 args_owned = 1;
5001 arglen = -1;
5002 argidx = -2;
5003 }
5004 while (--fmtcnt >= 0) {
5005 switch (c = *fmt++) {
5006 case '-': flags |= F_LJUST; continue;
5007 case '+': flags |= F_SIGN; continue;
5008 case ' ': flags |= F_BLANK; continue;
5009 case '#': flags |= F_ALT; continue;
5010 case '0': flags |= F_ZERO; continue;
5011 }
5012 break;
5013 }
5014 if (c == '*') {
5015 v = getnextarg(args, arglen, &argidx);
5016 if (v == NULL)
5017 goto onError;
5018 if (!PyInt_Check(v)) {
5019 PyErr_SetString(PyExc_TypeError,
5020 "* wants int");
5021 goto onError;
5022 }
5023 width = PyInt_AsLong(v);
5024 if (width < 0) {
5025 flags |= F_LJUST;
5026 width = -width;
5027 }
5028 if (--fmtcnt >= 0)
5029 c = *fmt++;
5030 }
5031 else if (c >= '0' && c <= '9') {
5032 width = c - '0';
5033 while (--fmtcnt >= 0) {
5034 c = *fmt++;
5035 if (c < '0' || c > '9')
5036 break;
5037 if ((width*10) / 10 != width) {
5038 PyErr_SetString(PyExc_ValueError,
5039 "width too big");
5040 goto onError;
5041 }
5042 width = width*10 + (c - '0');
5043 }
5044 }
5045 if (c == '.') {
5046 prec = 0;
5047 if (--fmtcnt >= 0)
5048 c = *fmt++;
5049 if (c == '*') {
5050 v = getnextarg(args, arglen, &argidx);
5051 if (v == NULL)
5052 goto onError;
5053 if (!PyInt_Check(v)) {
5054 PyErr_SetString(PyExc_TypeError,
5055 "* wants int");
5056 goto onError;
5057 }
5058 prec = PyInt_AsLong(v);
5059 if (prec < 0)
5060 prec = 0;
5061 if (--fmtcnt >= 0)
5062 c = *fmt++;
5063 }
5064 else if (c >= '0' && c <= '9') {
5065 prec = c - '0';
5066 while (--fmtcnt >= 0) {
5067 c = Py_CHARMASK(*fmt++);
5068 if (c < '0' || c > '9')
5069 break;
5070 if ((prec*10) / 10 != prec) {
5071 PyErr_SetString(PyExc_ValueError,
5072 "prec too big");
5073 goto onError;
5074 }
5075 prec = prec*10 + (c - '0');
5076 }
5077 }
5078 } /* prec */
5079 if (fmtcnt >= 0) {
5080 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 if (--fmtcnt >= 0)
5082 c = *fmt++;
5083 }
5084 }
5085 if (fmtcnt < 0) {
5086 PyErr_SetString(PyExc_ValueError,
5087 "incomplete format");
5088 goto onError;
5089 }
5090 if (c != '%') {
5091 v = getnextarg(args, arglen, &argidx);
5092 if (v == NULL)
5093 goto onError;
5094 }
5095 sign = 0;
5096 fill = ' ';
5097 switch (c) {
5098
5099 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005100 pbuf = formatbuf;
5101 /* presume that buffer length is at least 1 */
5102 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 len = 1;
5104 break;
5105
5106 case 's':
5107 case 'r':
5108 if (PyUnicode_Check(v) && c == 's') {
5109 temp = v;
5110 Py_INCREF(temp);
5111 }
5112 else {
5113 PyObject *unicode;
5114 if (c == 's')
5115 temp = PyObject_Str(v);
5116 else
5117 temp = PyObject_Repr(v);
5118 if (temp == NULL)
5119 goto onError;
5120 if (!PyString_Check(temp)) {
5121 /* XXX Note: this should never happen, since
5122 PyObject_Repr() and PyObject_Str() assure
5123 this */
5124 Py_DECREF(temp);
5125 PyErr_SetString(PyExc_TypeError,
5126 "%s argument has non-string str()");
5127 goto onError;
5128 }
Fred Drakee4315f52000-05-09 19:53:39 +00005129 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005131 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 "strict");
5133 Py_DECREF(temp);
5134 temp = unicode;
5135 if (temp == NULL)
5136 goto onError;
5137 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005138 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 len = PyUnicode_GET_SIZE(temp);
5140 if (prec >= 0 && len > prec)
5141 len = prec;
5142 break;
5143
5144 case 'i':
5145 case 'd':
5146 case 'u':
5147 case 'o':
5148 case 'x':
5149 case 'X':
5150 if (c == 'i')
5151 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005152 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005153 temp = formatlong(v, flags, prec, c);
5154 if (!temp)
5155 goto onError;
5156 pbuf = PyUnicode_AS_UNICODE(temp);
5157 len = PyUnicode_GET_SIZE(temp);
5158 /* unbounded ints can always produce
5159 a sign character! */
5160 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005162 else {
5163 pbuf = formatbuf;
5164 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5165 flags, prec, c, v);
5166 if (len < 0)
5167 goto onError;
5168 /* only d conversion is signed */
5169 sign = c == 'd';
5170 }
5171 if (flags & F_ZERO)
5172 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 break;
5174
5175 case 'e':
5176 case 'E':
5177 case 'f':
5178 case 'g':
5179 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005180 pbuf = formatbuf;
5181 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5182 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 if (len < 0)
5184 goto onError;
5185 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005186 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 fill = '0';
5188 break;
5189
5190 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005191 pbuf = formatbuf;
5192 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 if (len < 0)
5194 goto onError;
5195 break;
5196
5197 default:
5198 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005199 "unsupported format character '%c' (0x%x) "
5200 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005201 (31<=c && c<=126) ? c : '?',
5202 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 goto onError;
5204 }
5205 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005206 if (*pbuf == '-' || *pbuf == '+') {
5207 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 len--;
5209 }
5210 else if (flags & F_SIGN)
5211 sign = '+';
5212 else if (flags & F_BLANK)
5213 sign = ' ';
5214 else
5215 sign = 0;
5216 }
5217 if (width < len)
5218 width = len;
5219 if (rescnt < width + (sign != 0)) {
5220 reslen -= rescnt;
5221 rescnt = width + fmtcnt + 100;
5222 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005223 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 return NULL;
5225 res = PyUnicode_AS_UNICODE(result)
5226 + reslen - rescnt;
5227 }
5228 if (sign) {
5229 if (fill != ' ')
5230 *res++ = sign;
5231 rescnt--;
5232 if (width > len)
5233 width--;
5234 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005235 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5236 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005237 assert(pbuf[1] == c);
5238 if (fill != ' ') {
5239 *res++ = *pbuf++;
5240 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005241 }
Tim Petersfff53252001-04-12 18:38:48 +00005242 rescnt -= 2;
5243 width -= 2;
5244 if (width < 0)
5245 width = 0;
5246 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 if (width > len && !(flags & F_LJUST)) {
5249 do {
5250 --rescnt;
5251 *res++ = fill;
5252 } while (--width > len);
5253 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005254 if (fill == ' ') {
5255 if (sign)
5256 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005257 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005258 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005259 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005260 *res++ = *pbuf++;
5261 *res++ = *pbuf++;
5262 }
5263 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005264 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 res += len;
5266 rescnt -= len;
5267 while (--width >= len) {
5268 --rescnt;
5269 *res++ = ' ';
5270 }
5271 if (dict && (argidx < arglen) && c != '%') {
5272 PyErr_SetString(PyExc_TypeError,
5273 "not all arguments converted");
5274 goto onError;
5275 }
5276 Py_XDECREF(temp);
5277 } /* '%' */
5278 } /* until end */
5279 if (argidx < arglen && !dict) {
5280 PyErr_SetString(PyExc_TypeError,
5281 "not all arguments converted");
5282 goto onError;
5283 }
5284
5285 if (args_owned) {
5286 Py_DECREF(args);
5287 }
5288 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005289 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005290 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 return (PyObject *)result;
5292
5293 onError:
5294 Py_XDECREF(result);
5295 Py_DECREF(uformat);
5296 if (args_owned) {
5297 Py_DECREF(args);
5298 }
5299 return NULL;
5300}
5301
5302static PyBufferProcs unicode_as_buffer = {
5303 (getreadbufferproc) unicode_buffer_getreadbuf,
5304 (getwritebufferproc) unicode_buffer_getwritebuf,
5305 (getsegcountproc) unicode_buffer_getsegcount,
5306 (getcharbufferproc) unicode_buffer_getcharbuf,
5307};
5308
Guido van Rossume023fe02001-08-30 03:12:59 +00005309staticforward PyObject *
5310unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5311
Tim Peters6d6c1a32001-08-02 04:15:00 +00005312static PyObject *
5313unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5314{
5315 PyObject *x = NULL;
5316 static char *kwlist[] = {"string", "encoding", "errors", 0};
5317 char *encoding = NULL;
5318 char *errors = NULL;
5319
Guido van Rossume023fe02001-08-30 03:12:59 +00005320 if (type != &PyUnicode_Type)
5321 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005322 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5323 kwlist, &x, &encoding, &errors))
5324 return NULL;
5325 if (x == NULL)
5326 return (PyObject *)_PyUnicode_New(0);
5327 return PyUnicode_FromEncodedObject(x, encoding, errors);
5328}
5329
Guido van Rossume023fe02001-08-30 03:12:59 +00005330static PyObject *
5331unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5332{
5333 PyUnicodeObject *tmp, *new;
5334 int n;
5335
5336 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5337 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5338 if (tmp == NULL)
5339 return NULL;
5340 assert(PyUnicode_Check(tmp));
5341 new = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5342 if (new == NULL)
5343 return NULL;
5344 new->str = PyMem_NEW(Py_UNICODE, n+1);
5345 if (new->str == NULL) {
5346 _Py_ForgetReference((PyObject *)new);
5347 PyObject_DEL(new);
5348 return NULL;
5349 }
5350 Py_UNICODE_COPY(new->str, tmp->str, n+1);
5351 new->length = n;
5352 Py_DECREF(tmp);
5353 return (PyObject *)new;
5354}
5355
/* Docstring of the unicode type (installed as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5362
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363PyTypeObject PyUnicode_Type = {
5364 PyObject_HEAD_INIT(&PyType_Type)
5365 0, /* ob_size */
5366 "unicode", /* tp_name */
5367 sizeof(PyUnicodeObject), /* tp_size */
5368 0, /* tp_itemsize */
5369 /* Slots */
5370 (destructor)_PyUnicode_Free, /* tp_dealloc */
5371 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005372 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 0, /* tp_setattr */
5374 (cmpfunc) unicode_compare, /* tp_compare */
5375 (reprfunc) unicode_repr, /* tp_repr */
5376 0, /* tp_as_number */
5377 &unicode_as_sequence, /* tp_as_sequence */
5378 0, /* tp_as_mapping */
5379 (hashfunc) unicode_hash, /* tp_hash*/
5380 0, /* tp_call*/
5381 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005382 PyObject_GenericGetAttr, /* tp_getattro */
5383 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005385 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005386 unicode_doc, /* tp_doc */
5387 0, /* tp_traverse */
5388 0, /* tp_clear */
5389 0, /* tp_richcompare */
5390 0, /* tp_weaklistoffset */
5391 0, /* tp_iter */
5392 0, /* tp_iternext */
5393 unicode_methods, /* tp_methods */
5394 0, /* tp_members */
5395 0, /* tp_getset */
5396 0, /* tp_base */
5397 0, /* tp_dict */
5398 0, /* tp_descr_get */
5399 0, /* tp_descr_set */
5400 0, /* tp_dictoffset */
5401 0, /* tp_init */
5402 0, /* tp_alloc */
5403 unicode_new, /* tp_new */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404};
5405
5406/* Initialize the Unicode implementation */
5407
Thomas Wouters78890102000-07-22 19:25:51 +00005408void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005410 int i;
5411
Fred Drakee4315f52000-05-09 19:53:39 +00005412 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005413 unicode_freelist = NULL;
5414 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005416 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005417 for (i = 0; i < 256; i++)
5418 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419}
5420
5421/* Finalize the Unicode implementation */
5422
5423void
Thomas Wouters78890102000-07-22 19:25:51 +00005424_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005426 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005427 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005429 Py_XDECREF(unicode_empty);
5430 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005431
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005432 for (i = 0; i < 256; i++) {
5433 if (unicode_latin1[i]) {
5434 Py_DECREF(unicode_latin1[i]);
5435 unicode_latin1[i] = NULL;
5436 }
5437 }
5438
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005439 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 PyUnicodeObject *v = u;
5441 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005442 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005443 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005444 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005445 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005447 unicode_freelist = NULL;
5448 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449}