blob: c25c5ac9cb3e104886a65f0c39371cbd386e74fd [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   An object on the free list keeps its character buffer allocated as
   long as its length is below this limit; larger buffers are released
   when the object is put on the list. This saves malloc() calls for
   small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature. If Unicode objects cause
   core dumps, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
/* Convenience wrapper around PyUnicode_Resize() that accepts the
   address of any object pointer variable.
   For use inside unicodeobject.c only ! */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 const char *s;
402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
419 }
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
425 }
426 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000428 v = obj;
429 if (encoding) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoding Unicode is not supported");
432 return NULL;
433 }
434 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else if (PyString_Check(obj)) {
437 s = PyString_AS_STRING(obj);
438 len = PyString_GET_SIZE(obj);
439 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000440 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000444 PyErr_Format(PyExc_TypeError,
445 "coercing to Unicode: need string or buffer, "
446 "%.80s found",
447 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000448 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450
451 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 if (len == 0) {
453 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000454 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 else
457 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000458
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000460 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463 return v;
464
465 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000466 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000468 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470}
471
472PyObject *PyUnicode_Decode(const char *s,
473 int size,
474 const char *encoding,
475 const char *errors)
476{
477 PyObject *buffer = NULL, *unicode;
478
Fred Drakee4315f52000-05-09 19:53:39 +0000479 if (encoding == NULL)
480 encoding = PyUnicode_GetDefaultEncoding();
481
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000485 else if (strcmp(encoding, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s, size, errors);
487 else if (strcmp(encoding, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489
490 /* Decode via the codec registry */
491 buffer = PyBuffer_FromMemory((void *)s, size);
492 if (buffer == NULL)
493 goto onError;
494 unicode = PyCodec_Decode(buffer, encoding, errors);
495 if (unicode == NULL)
496 goto onError;
497 if (!PyUnicode_Check(unicode)) {
498 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000499 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 unicode->ob_type->tp_name);
501 Py_DECREF(unicode);
502 goto onError;
503 }
504 Py_DECREF(buffer);
505 return unicode;
506
507 onError:
508 Py_XDECREF(buffer);
509 return NULL;
510}
511
512PyObject *PyUnicode_Encode(const Py_UNICODE *s,
513 int size,
514 const char *encoding,
515 const char *errors)
516{
517 PyObject *v, *unicode;
518
519 unicode = PyUnicode_FromUnicode(s, size);
520 if (unicode == NULL)
521 return NULL;
522 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
523 Py_DECREF(unicode);
524 return v;
525}
526
527PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v;
532
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
536 }
Fred Drakee4315f52000-05-09 19:53:39 +0000537
538 if (encoding == NULL)
539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (errors == NULL) {
543 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000544 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000545 else if (strcmp(encoding, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode);
547 else if (strcmp(encoding, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode);
549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550
551 /* Encode via the codec registry */
552 v = PyCodec_Encode(unicode, encoding, errors);
553 if (v == NULL)
554 goto onError;
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v)) {
557 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000558 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 v->ob_type->tp_name);
560 Py_DECREF(v);
561 goto onError;
562 }
563 return v;
564
565 onError:
566 return NULL;
567}
568
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000569PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
570 const char *errors)
571{
572 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
573
574 if (v)
575 return v;
576 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
577 if (v && errors == NULL)
578 ((PyUnicodeObject *)unicode)->defenc = v;
579 return v;
580}
581
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
583{
584 if (!PyUnicode_Check(unicode)) {
585 PyErr_BadArgument();
586 goto onError;
587 }
588 return PyUnicode_AS_UNICODE(unicode);
589
590 onError:
591 return NULL;
592}
593
594int PyUnicode_GetSize(PyObject *unicode)
595{
596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
599 }
600 return PyUnicode_GET_SIZE(unicode);
601
602 onError:
603 return -1;
604}
605
Thomas Wouters78890102000-07-22 19:25:51 +0000606const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000607{
608 return unicode_default_encoding;
609}
610
611int PyUnicode_SetDefaultEncoding(const char *encoding)
612{
613 PyObject *v;
614
615 /* Make sure the encoding is valid. As side effect, this also
616 loads the encoding into the codec registry cache. */
617 v = _PyCodec_Lookup(encoding);
618 if (v == NULL)
619 goto onError;
620 Py_DECREF(v);
621 strncpy(unicode_default_encoding,
622 encoding,
623 sizeof(unicode_default_encoding));
624 return 0;
625
626 onError:
627 return -1;
628}
629
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630/* --- UTF-8 Codec -------------------------------------------------------- */
631
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero means the byte is not a legal lead byte. See RFC 2279 for
   details. The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
653
654static
655int utf8_decoding_error(const char **source,
656 Py_UNICODE **dest,
657 const char *errors,
658 const char *details)
659{
660 if ((errors == NULL) ||
661 (strcmp(errors,"strict") == 0)) {
662 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 details);
665 return -1;
666 }
667 else if (strcmp(errors,"ignore") == 0) {
668 (*source)++;
669 return 0;
670 }
671 else if (strcmp(errors,"replace") == 0) {
672 (*source)++;
673 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
674 (*dest)++;
675 return 0;
676 }
677 else {
678 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000679 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000680 errors);
681 return -1;
682 }
683}
684
Guido van Rossumd57fd912000-03-10 22:53:23 +0000685PyObject *PyUnicode_DecodeUTF8(const char *s,
686 int size,
687 const char *errors)
688{
689 int n;
690 const char *e;
691 PyUnicodeObject *unicode;
692 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000693 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694
695 /* Note: size will always be longer than the resulting Unicode
696 character count */
697 unicode = _PyUnicode_New(size);
698 if (!unicode)
699 return NULL;
700 if (size == 0)
701 return (PyObject *)unicode;
702
703 /* Unpack UTF-8 encoded data */
704 p = unicode->str;
705 e = s + size;
706
707 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000708 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000711 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712 s++;
713 continue;
714 }
715
716 n = utf8_code_length[ch];
717
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 if (s + n > e) {
719 errmsg = "unexpected end of data";
720 goto utf8Error;
721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722
723 switch (n) {
724
725 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000726 errmsg = "unexpected code byte";
727 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728
729 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000730 errmsg = "internal error";
731 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732
733 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000734 if ((s[1] & 0xc0) != 0x80) {
735 errmsg = "invalid data";
736 goto utf8Error;
737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000738 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000739 if (ch < 0x80) {
740 errmsg = "illegal encoding";
741 goto utf8Error;
742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000743 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 break;
746
747 case 3:
748 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000749 (s[2] & 0xc0) != 0x80) {
750 errmsg = "invalid data";
751 goto utf8Error;
752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000754 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
755 errmsg = "illegal encoding";
756 goto utf8Error;
757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000759 *p++ = (Py_UNICODE)ch;
760 break;
761
762 case 4:
763 if ((s[1] & 0xc0) != 0x80 ||
764 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000765 (s[3] & 0xc0) != 0x80) {
766 errmsg = "invalid data";
767 goto utf8Error;
768 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000769 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
770 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
771 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000772 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000773 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000774 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000775 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000776 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000777 errmsg = "illegal encoding";
778 goto utf8Error;
779 }
Fredrik Lundh8f455852001-06-27 18:59:43 +0000780#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000781 *p++ = (Py_UNICODE)ch;
782#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000783 /* compute and append the two surrogates: */
784
785 /* translate from 10000..10FFFF to 0..FFFF */
786 ch -= 0x10000;
787
788 /* high surrogate = top 10 bits added to D800 */
789 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
790
791 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000792 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000793#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794 break;
795
796 default:
797 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000798 errmsg = "unsupported Unicode code range";
799 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000800 }
801 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000802 continue;
803
804 utf8Error:
805 if (utf8_decoding_error(&s, &p, errors, errmsg))
806 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807 }
808
809 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000810 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 goto onError;
812
813 return (PyObject *)unicode;
814
815onError:
816 Py_DECREF(unicode);
817 return NULL;
818}
819
/* Error handler for the UTF-8 encoder. It is compiled out: it is not
   used anymore, now that the encoder supports UTF-16 surrogates. */
#if 0
/* Apply the requested error handling scheme to a UTF-8 encoding
   error. NULL or "strict" raise UnicodeError, "ignore" does nothing,
   "replace" writes '?' to *dest and advances it, and any other scheme
   raises ValueError. Returns 0 to continue and -1 on error. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000853
854PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
855 int size,
856 const char *errors)
857{
858 PyObject *v;
859 char *p;
860 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000861 Py_UCS4 ch2;
862 unsigned int cbAllocated = 3 * size;
863 unsigned int cbWritten = 0;
864 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000865
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000866 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867 if (v == NULL)
868 return NULL;
869 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000870 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871
872 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000873 while (i < size) {
874 Py_UCS4 ch = s[i++];
875 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000876 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000877 cbWritten++;
878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879 else if (ch < 0x0800) {
880 *p++ = 0xc0 | (ch >> 6);
881 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000882 cbWritten += 2;
883 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000884 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000885 /* Check for high surrogate */
886 if (0xD800 <= ch && ch <= 0xDBFF) {
887 if (i != size) {
888 ch2 = s[i];
889 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
890
891 if (cbWritten >= (cbAllocated - 4)) {
892 /* Provide enough room for some more
893 surrogates */
894 cbAllocated += 4*10;
895 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000896 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000897 }
898
899 /* combine the two values */
900 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
901
902 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000903 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000904 i++;
905 cbWritten += 4;
906 }
907 }
908 }
909 else {
910 *p++ = (char)(0xe0 | (ch >> 12));
911 cbWritten += 3;
912 }
913 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
914 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000915 } else {
916 *p++ = 0xf0 | (ch>>18);
917 *p++ = 0x80 | ((ch>>12) & 0x3f);
918 *p++ = 0x80 | ((ch>>6) & 0x3f);
919 *p++ = 0x80 | (ch & 0x3f);
920 cbWritten += 4;
921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000922 }
923 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000924 if (_PyString_Resize(&v, p - q))
925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926 return v;
927
928 onError:
929 Py_DECREF(v);
930 return NULL;
931}
932
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
934{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 if (!PyUnicode_Check(unicode)) {
936 PyErr_BadArgument();
937 return NULL;
938 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000939 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
940 PyUnicode_GET_SIZE(unicode),
941 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000942}
943
944/* --- UTF-16 Codec ------------------------------------------------------- */
945
946static
Tim Peters772747b2001-08-09 22:21:55 +0000947int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948 const char *errors,
949 const char *details)
950{
951 if ((errors == NULL) ||
952 (strcmp(errors,"strict") == 0)) {
953 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000954 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955 details);
956 return -1;
957 }
958 else if (strcmp(errors,"ignore") == 0) {
959 return 0;
960 }
961 else if (strcmp(errors,"replace") == 0) {
962 if (dest) {
963 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
964 (*dest)++;
965 }
966 return 0;
967 }
968 else {
969 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000970 "UTF-16 decoding error; "
971 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000972 errors);
973 return -1;
974 }
975}
976
Tim Peters772747b2001-08-09 22:21:55 +0000977PyObject *
978PyUnicode_DecodeUTF16(const char *s,
979 int size,
980 const char *errors,
981 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982{
983 PyUnicodeObject *unicode;
984 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +0000985 const unsigned char *q, *e;
986 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000987 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +0000988 /* Offsets from q for retrieving byte pairs in the right order. */
989#ifdef BYTEORDER_IS_LITTLE_ENDIAN
990 int ihi = 1, ilo = 0;
991#else
992 int ihi = 0, ilo = 1;
993#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994
995 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +0000996 if (size & 1) {
997 if (utf16_decoding_error(NULL, errors, "truncated data"))
998 return NULL;
999 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000 }
1001
1002 /* Note: size will always be longer than the resulting Unicode
1003 character count */
1004 unicode = _PyUnicode_New(size);
1005 if (!unicode)
1006 return NULL;
1007 if (size == 0)
1008 return (PyObject *)unicode;
1009
1010 /* Unpack UTF-16 encoded data */
1011 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001012 q = (unsigned char *)s;
1013 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001014
1015 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001016 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001018 /* Check for BOM marks (U+FEFF) in the input and adjust current
1019 byte order setting accordingly. In native mode, the leading BOM
1020 mark is skipped, in all other modes, it is copied to the output
1021 stream as-is (giving a ZWNBSP character). */
1022 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001023 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001024#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001025 if (bom == 0xFEFF) {
1026 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001027 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001028 }
1029 else if (bom == 0xFFFE) {
1030 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001031 bo = 1;
1032 }
1033#else
Tim Peters772747b2001-08-09 22:21:55 +00001034 if (bom == 0xFEFF) {
1035 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001036 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001037 }
1038 else if (bom == 0xFFFE) {
1039 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001040 bo = -1;
1041 }
1042#endif
1043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044
Tim Peters772747b2001-08-09 22:21:55 +00001045 if (bo == -1) {
1046 /* force LE */
1047 ihi = 1;
1048 ilo = 0;
1049 }
1050 else if (bo == 1) {
1051 /* force BE */
1052 ihi = 0;
1053 ilo = 1;
1054 }
1055
1056 while (q < e) {
1057 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1058 q += 2;
1059
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 if (ch < 0xD800 || ch > 0xDFFF) {
1061 *p++ = ch;
1062 continue;
1063 }
1064
1065 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001066 if (q >= e) {
1067 errmsg = "unexpected end of data";
1068 goto utf16Error;
1069 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001070 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001071 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1072 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001073 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001074#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001075 *p++ = ch;
1076 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001077#else
1078 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001079#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001080 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001081 }
1082 else {
1083 errmsg = "illegal UTF-16 surrogate";
1084 goto utf16Error;
1085 }
1086
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001088 errmsg = "illegal encoding";
1089 /* Fall through to report the error */
1090
1091 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001092 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001093 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 }
1095
1096 if (byteorder)
1097 *byteorder = bo;
1098
1099 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001100 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 goto onError;
1102
1103 return (PyObject *)unicode;
1104
1105onError:
1106 Py_DECREF(unicode);
1107 return NULL;
1108}
1109
Tim Peters772747b2001-08-09 22:21:55 +00001110PyObject *
1111PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1112 int size,
1113 const char *errors,
1114 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115{
1116 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001117 unsigned char *p;
1118 int i, pairs;
1119 /* Offsets from p for storing byte pairs in the right order. */
1120#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1121 int ihi = 1, ilo = 0;
1122#else
1123 int ihi = 0, ilo = 1;
1124#endif
1125
1126#define STORECHAR(CH) \
1127 do { \
1128 p[ihi] = ((CH) >> 8) & 0xff; \
1129 p[ilo] = (CH) & 0xff; \
1130 p += 2; \
1131 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001133 for (i = pairs = 0; i < size; i++)
1134 if (s[i] >= 0x10000)
1135 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001137 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 if (v == NULL)
1139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140
Tim Peters772747b2001-08-09 22:21:55 +00001141 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001143 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001144 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001145 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001146
1147 if (byteorder == -1) {
1148 /* force LE */
1149 ihi = 1;
1150 ilo = 0;
1151 }
1152 else if (byteorder == 1) {
1153 /* force BE */
1154 ihi = 0;
1155 ilo = 1;
1156 }
1157
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001158 while (size-- > 0) {
1159 Py_UNICODE ch = *s++;
1160 Py_UNICODE ch2 = 0;
1161 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001162 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1163 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
Tim Peters772747b2001-08-09 22:21:55 +00001165 STORECHAR(ch);
1166 if (ch2)
1167 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001170#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171}
1172
1173PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1174{
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 return NULL;
1178 }
1179 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1180 PyUnicode_GET_SIZE(unicode),
1181 NULL,
1182 0);
1183}
1184
1185/* --- Unicode Escape Codec ----------------------------------------------- */
1186
1187static
1188int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001189 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 const char *errors,
1191 const char *details)
1192{
1193 if ((errors == NULL) ||
1194 (strcmp(errors,"strict") == 0)) {
1195 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001196 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 details);
1198 return -1;
1199 }
1200 else if (strcmp(errors,"ignore") == 0) {
1201 return 0;
1202 }
1203 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001204 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 return 0;
1206 }
1207 else {
1208 PyErr_Format(PyExc_ValueError,
1209 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001210 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 errors);
1212 return -1;
1213 }
1214}
1215
Fredrik Lundh06d12682001-01-24 07:59:11 +00001216static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1219 int size,
1220 const char *errors)
1221{
1222 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001223 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001225 char* message;
1226 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1227
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 /* Escaped strings will always be longer than the resulting
1229 Unicode string, so we start with size here and then reduce the
1230 length after conversion to the true value. */
1231 v = _PyUnicode_New(size);
1232 if (v == NULL)
1233 goto onError;
1234 if (size == 0)
1235 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001236
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237 p = buf = PyUnicode_AS_UNICODE(v);
1238 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001239
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 while (s < end) {
1241 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001242 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001243 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244
1245 /* Non-escape characters are interpreted as Unicode ordinals */
1246 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001247 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 continue;
1249 }
1250
1251 /* \ - Escapes */
1252 s++;
1253 switch (*s++) {
1254
1255 /* \x escapes */
1256 case '\n': break;
1257 case '\\': *p++ = '\\'; break;
1258 case '\'': *p++ = '\''; break;
1259 case '\"': *p++ = '\"'; break;
1260 case 'b': *p++ = '\b'; break;
1261 case 'f': *p++ = '\014'; break; /* FF */
1262 case 't': *p++ = '\t'; break;
1263 case 'n': *p++ = '\n'; break;
1264 case 'r': *p++ = '\r'; break;
1265 case 'v': *p++ = '\013'; break; /* VT */
1266 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1267
1268 /* \OOO (octal) escapes */
1269 case '0': case '1': case '2': case '3':
1270 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001271 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001273 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001275 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001277 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278 break;
1279
Fredrik Lundhccc74732001-02-18 22:13:49 +00001280 /* hex escapes */
1281 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001283 digits = 2;
1284 message = "truncated \\xXX escape";
1285 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286
Fredrik Lundhccc74732001-02-18 22:13:49 +00001287 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001289 digits = 4;
1290 message = "truncated \\uXXXX escape";
1291 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292
Fredrik Lundhccc74732001-02-18 22:13:49 +00001293 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001294 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001295 digits = 8;
1296 message = "truncated \\UXXXXXXXX escape";
1297 hexescape:
1298 chr = 0;
1299 for (i = 0; i < digits; i++) {
1300 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001301 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001302 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001303 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001304 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001305 i++;
1306 break;
1307 }
1308 chr = (chr<<4) & ~0xF;
1309 if (c >= '0' && c <= '9')
1310 chr += c - '0';
1311 else if (c >= 'a' && c <= 'f')
1312 chr += 10 + c - 'a';
1313 else
1314 chr += 10 + c - 'A';
1315 }
1316 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001317 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001318 /* when we get here, chr is a 32-bit unicode character */
1319 if (chr <= 0xffff)
1320 /* UCS-2 character */
1321 *p++ = (Py_UNICODE) chr;
1322 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001323 /* UCS-4 character. Either store directly, or as
1324 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001325#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001326 *p++ = chr;
1327#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001328 chr -= 0x10000L;
1329 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001330 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001331#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001332 } else {
1333 if (unicodeescape_decoding_error(
1334 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001335 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001336 )
1337 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001338 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001339 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001340 break;
1341
1342 /* \N{name} */
1343 case 'N':
1344 message = "malformed \\N character escape";
1345 if (ucnhash_CAPI == NULL) {
1346 /* load the unicode data module */
1347 PyObject *m, *v;
1348 m = PyImport_ImportModule("unicodedata");
1349 if (m == NULL)
1350 goto ucnhashError;
1351 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1352 Py_DECREF(m);
1353 if (v == NULL)
1354 goto ucnhashError;
1355 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1356 Py_DECREF(v);
1357 if (ucnhash_CAPI == NULL)
1358 goto ucnhashError;
1359 }
1360 if (*s == '{') {
1361 const char *start = s+1;
1362 /* look for the closing brace */
1363 while (*s != '}' && s < end)
1364 s++;
1365 if (s > start && s < end && *s == '}') {
1366 /* found a name. look it up in the unicode database */
1367 message = "unknown Unicode character name";
1368 s++;
1369 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1370 goto store;
1371 }
1372 }
1373 if (unicodeescape_decoding_error(&s, &x, errors, message))
1374 goto onError;
1375 *p++ = x;
1376 break;
1377
1378 default:
1379 *p++ = '\\';
1380 *p++ = (unsigned char)s[-1];
1381 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 }
1383 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001384 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 return (PyObject *)v;
1387
Fredrik Lundhccc74732001-02-18 22:13:49 +00001388ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001389 PyErr_SetString(
1390 PyExc_UnicodeError,
1391 "\\N escapes not supported (can't load unicodedata module)"
1392 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001393 return NULL;
1394
Fredrik Lundhccc74732001-02-18 22:13:49 +00001395onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 Py_XDECREF(v);
1397 return NULL;
1398}
1399
1400/* Return a Unicode-Escape string version of the Unicode object.
1401
1402 If quotes is true, the string is enclosed in u"" or u'' quotes as
1403 appropriate.
1404
1405*/
1406
Barry Warsaw51ac5802000-03-20 16:36:48 +00001407static const Py_UNICODE *findchar(const Py_UNICODE *s,
1408 int size,
1409 Py_UNICODE ch);
1410
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411static
1412PyObject *unicodeescape_string(const Py_UNICODE *s,
1413 int size,
1414 int quotes)
1415{
1416 PyObject *repr;
1417 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001419 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420
1421 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1422 if (repr == NULL)
1423 return NULL;
1424
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001425 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426
1427 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001428 *p++ = 'u';
1429 *p++ = (findchar(s, size, '\'') &&
1430 !findchar(s, size, '"')) ? '"' : '\'';
1431 }
1432 while (size-- > 0) {
1433 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001434
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001436 if (quotes &&
1437 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 *p++ = '\\';
1439 *p++ = (char) ch;
1440 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001441
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001442#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001443 /* Map 21-bit characters to '\U00xxxxxx' */
1444 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001445 int offset = p - PyString_AS_STRING(repr);
1446
1447 /* Resize the string if necessary */
1448 if (offset + 12 > PyString_GET_SIZE(repr)) {
1449 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1450 goto onError;
1451 p = PyString_AS_STRING(repr) + offset;
1452 }
1453
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001454 *p++ = '\\';
1455 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001456 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1457 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1458 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1459 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1460 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1461 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1462 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001463 *p++ = hexdigit[ch & 0x0000000F];
1464 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001465 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001466#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001467 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1468 else if (ch >= 0xD800 && ch < 0xDC00) {
1469 Py_UNICODE ch2;
1470 Py_UCS4 ucs;
1471
1472 ch2 = *s++;
1473 size--;
1474 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1475 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1476 *p++ = '\\';
1477 *p++ = 'U';
1478 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1479 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1480 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1481 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1482 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1483 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1484 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1485 *p++ = hexdigit[ucs & 0x0000000F];
1486 continue;
1487 }
1488 /* Fall through: isolated surrogates are copied as-is */
1489 s--;
1490 size++;
1491 }
1492
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001494 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495 *p++ = '\\';
1496 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001497 *p++ = hexdigit[(ch >> 12) & 0x000F];
1498 *p++ = hexdigit[(ch >> 8) & 0x000F];
1499 *p++ = hexdigit[(ch >> 4) & 0x000F];
1500 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001502
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001503 /* Map special whitespace to '\t', \n', '\r' */
1504 else if (ch == '\t') {
1505 *p++ = '\\';
1506 *p++ = 't';
1507 }
1508 else if (ch == '\n') {
1509 *p++ = '\\';
1510 *p++ = 'n';
1511 }
1512 else if (ch == '\r') {
1513 *p++ = '\\';
1514 *p++ = 'r';
1515 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001516
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001517 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 else if (ch < ' ' || ch >= 128) {
1519 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001520 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001521 *p++ = hexdigit[(ch >> 4) & 0x000F];
1522 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001524
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525 /* Copy everything else as-is */
1526 else
1527 *p++ = (char) ch;
1528 }
1529 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001530 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531
1532 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001533 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535
1536 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001537
1538 onError:
1539 Py_DECREF(repr);
1540 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541}
1542
1543PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1544 int size)
1545{
1546 return unicodeescape_string(s, size, 0);
1547}
1548
1549PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1550{
1551 if (!PyUnicode_Check(unicode)) {
1552 PyErr_BadArgument();
1553 return NULL;
1554 }
1555 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1556 PyUnicode_GET_SIZE(unicode));
1557}
1558
1559/* --- Raw Unicode Escape Codec ------------------------------------------- */
1560
1561PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1562 int size,
1563 const char *errors)
1564{
1565 PyUnicodeObject *v;
1566 Py_UNICODE *p, *buf;
1567 const char *end;
1568 const char *bs;
1569
1570 /* Escaped strings will always be longer than the resulting
1571 Unicode string, so we start with size here and then reduce the
1572 length after conversion to the true value. */
1573 v = _PyUnicode_New(size);
1574 if (v == NULL)
1575 goto onError;
1576 if (size == 0)
1577 return (PyObject *)v;
1578 p = buf = PyUnicode_AS_UNICODE(v);
1579 end = s + size;
1580 while (s < end) {
1581 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001582 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 int i;
1584
1585 /* Non-escape characters are interpreted as Unicode ordinals */
1586 if (*s != '\\') {
1587 *p++ = (unsigned char)*s++;
1588 continue;
1589 }
1590
1591 /* \u-escapes are only interpreted iff the number of leading
1592 backslashes if odd */
1593 bs = s;
1594 for (;s < end;) {
1595 if (*s != '\\')
1596 break;
1597 *p++ = (unsigned char)*s++;
1598 }
1599 if (((s - bs) & 1) == 0 ||
1600 s >= end ||
1601 *s != 'u') {
1602 continue;
1603 }
1604 p--;
1605 s++;
1606
1607 /* \uXXXX with 4 hex digits */
1608 for (x = 0, i = 0; i < 4; i++) {
1609 c = (unsigned char)s[i];
1610 if (!isxdigit(c)) {
1611 if (unicodeescape_decoding_error(&s, &x, errors,
1612 "truncated \\uXXXX"))
1613 goto onError;
1614 i++;
1615 break;
1616 }
1617 x = (x<<4) & ~0xF;
1618 if (c >= '0' && c <= '9')
1619 x += c - '0';
1620 else if (c >= 'a' && c <= 'f')
1621 x += 10 + c - 'a';
1622 else
1623 x += 10 + c - 'A';
1624 }
1625 s += i;
1626 *p++ = x;
1627 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001629 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 return (PyObject *)v;
1631
1632 onError:
1633 Py_XDECREF(v);
1634 return NULL;
1635}
1636
1637PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1638 int size)
1639{
1640 PyObject *repr;
1641 char *p;
1642 char *q;
1643
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001644 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001645
1646 repr = PyString_FromStringAndSize(NULL, 6 * size);
1647 if (repr == NULL)
1648 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001649 if (size == 0)
1650 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
1652 p = q = PyString_AS_STRING(repr);
1653 while (size-- > 0) {
1654 Py_UNICODE ch = *s++;
1655 /* Map 16-bit characters to '\uxxxx' */
1656 if (ch >= 256) {
1657 *p++ = '\\';
1658 *p++ = 'u';
1659 *p++ = hexdigit[(ch >> 12) & 0xf];
1660 *p++ = hexdigit[(ch >> 8) & 0xf];
1661 *p++ = hexdigit[(ch >> 4) & 0xf];
1662 *p++ = hexdigit[ch & 15];
1663 }
1664 /* Copy everything else as-is */
1665 else
1666 *p++ = (char) ch;
1667 }
1668 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001669 if (_PyString_Resize(&repr, p - q))
1670 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671
1672 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001673
1674 onError:
1675 Py_DECREF(repr);
1676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001677}
1678
1679PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1680{
1681 if (!PyUnicode_Check(unicode)) {
1682 PyErr_BadArgument();
1683 return NULL;
1684 }
1685 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1686 PyUnicode_GET_SIZE(unicode));
1687}
1688
1689/* --- Latin-1 Codec ------------------------------------------------------ */
1690
1691PyObject *PyUnicode_DecodeLatin1(const char *s,
1692 int size,
1693 const char *errors)
1694{
1695 PyUnicodeObject *v;
1696 Py_UNICODE *p;
1697
1698 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001699 if (size == 1 && *(unsigned char*)s < 256) {
1700 Py_UNICODE r = *(unsigned char*)s;
1701 return PyUnicode_FromUnicode(&r, 1);
1702 }
1703
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 v = _PyUnicode_New(size);
1705 if (v == NULL)
1706 goto onError;
1707 if (size == 0)
1708 return (PyObject *)v;
1709 p = PyUnicode_AS_UNICODE(v);
1710 while (size-- > 0)
1711 *p++ = (unsigned char)*s++;
1712 return (PyObject *)v;
1713
1714 onError:
1715 Py_XDECREF(v);
1716 return NULL;
1717}
1718
1719static
1720int latin1_encoding_error(const Py_UNICODE **source,
1721 char **dest,
1722 const char *errors,
1723 const char *details)
1724{
1725 if ((errors == NULL) ||
1726 (strcmp(errors,"strict") == 0)) {
1727 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001728 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 details);
1730 return -1;
1731 }
1732 else if (strcmp(errors,"ignore") == 0) {
1733 return 0;
1734 }
1735 else if (strcmp(errors,"replace") == 0) {
1736 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001737 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 return 0;
1739 }
1740 else {
1741 PyErr_Format(PyExc_ValueError,
1742 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001743 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744 errors);
1745 return -1;
1746 }
1747}
1748
1749PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1750 int size,
1751 const char *errors)
1752{
1753 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001754 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 repr = PyString_FromStringAndSize(NULL, size);
1757 if (repr == NULL)
1758 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001759 if (size == 0)
1760 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761
1762 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001763 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 while (size-- > 0) {
1765 Py_UNICODE ch = *p++;
1766 if (ch >= 256) {
1767 if (latin1_encoding_error(&p, &s, errors,
1768 "ordinal not in range(256)"))
1769 goto onError;
1770 }
1771 else
1772 *s++ = (char)ch;
1773 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001774 /* Resize if error handling skipped some characters */
1775 if (s - start < PyString_GET_SIZE(repr))
1776 if (_PyString_Resize(&repr, s - start))
1777 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 return repr;
1779
1780 onError:
1781 Py_DECREF(repr);
1782 return NULL;
1783}
1784
1785PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1786{
1787 if (!PyUnicode_Check(unicode)) {
1788 PyErr_BadArgument();
1789 return NULL;
1790 }
1791 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1792 PyUnicode_GET_SIZE(unicode),
1793 NULL);
1794}
1795
1796/* --- 7-bit ASCII Codec -------------------------------------------------- */
1797
1798static
1799int ascii_decoding_error(const char **source,
1800 Py_UNICODE **dest,
1801 const char *errors,
1802 const char *details)
1803{
1804 if ((errors == NULL) ||
1805 (strcmp(errors,"strict") == 0)) {
1806 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001807 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 details);
1809 return -1;
1810 }
1811 else if (strcmp(errors,"ignore") == 0) {
1812 return 0;
1813 }
1814 else if (strcmp(errors,"replace") == 0) {
1815 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1816 (*dest)++;
1817 return 0;
1818 }
1819 else {
1820 PyErr_Format(PyExc_ValueError,
1821 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001822 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 errors);
1824 return -1;
1825 }
1826}
1827
1828PyObject *PyUnicode_DecodeASCII(const char *s,
1829 int size,
1830 const char *errors)
1831{
1832 PyUnicodeObject *v;
1833 Py_UNICODE *p;
1834
1835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001836 if (size == 1 && *(unsigned char*)s < 128) {
1837 Py_UNICODE r = *(unsigned char*)s;
1838 return PyUnicode_FromUnicode(&r, 1);
1839 }
1840
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 v = _PyUnicode_New(size);
1842 if (v == NULL)
1843 goto onError;
1844 if (size == 0)
1845 return (PyObject *)v;
1846 p = PyUnicode_AS_UNICODE(v);
1847 while (size-- > 0) {
1848 register unsigned char c;
1849
1850 c = (unsigned char)*s++;
1851 if (c < 128)
1852 *p++ = c;
1853 else if (ascii_decoding_error(&s, &p, errors,
1854 "ordinal not in range(128)"))
1855 goto onError;
1856 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001857 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001858 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001859 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 return (PyObject *)v;
1861
1862 onError:
1863 Py_XDECREF(v);
1864 return NULL;
1865}
1866
1867static
1868int ascii_encoding_error(const Py_UNICODE **source,
1869 char **dest,
1870 const char *errors,
1871 const char *details)
1872{
1873 if ((errors == NULL) ||
1874 (strcmp(errors,"strict") == 0)) {
1875 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001876 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877 details);
1878 return -1;
1879 }
1880 else if (strcmp(errors,"ignore") == 0) {
1881 return 0;
1882 }
1883 else if (strcmp(errors,"replace") == 0) {
1884 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001885 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 return 0;
1887 }
1888 else {
1889 PyErr_Format(PyExc_ValueError,
1890 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001891 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 errors);
1893 return -1;
1894 }
1895}
1896
1897PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1898 int size,
1899 const char *errors)
1900{
1901 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001902 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001903
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 repr = PyString_FromStringAndSize(NULL, size);
1905 if (repr == NULL)
1906 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001907 if (size == 0)
1908 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909
1910 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001911 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 while (size-- > 0) {
1913 Py_UNICODE ch = *p++;
1914 if (ch >= 128) {
1915 if (ascii_encoding_error(&p, &s, errors,
1916 "ordinal not in range(128)"))
1917 goto onError;
1918 }
1919 else
1920 *s++ = (char)ch;
1921 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001922 /* Resize if error handling skipped some characters */
1923 if (s - start < PyString_GET_SIZE(repr))
1924 if (_PyString_Resize(&repr, s - start))
1925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 return repr;
1927
1928 onError:
1929 Py_DECREF(repr);
1930 return NULL;
1931}
1932
1933PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1934{
1935 if (!PyUnicode_Check(unicode)) {
1936 PyErr_BadArgument();
1937 return NULL;
1938 }
1939 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1940 PyUnicode_GET_SIZE(unicode),
1941 NULL);
1942}
1943
Fredrik Lundh30831632001-06-26 15:11:00 +00001944#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001945
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001946/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001947
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001948PyObject *PyUnicode_DecodeMBCS(const char *s,
1949 int size,
1950 const char *errors)
1951{
1952 PyUnicodeObject *v;
1953 Py_UNICODE *p;
1954
1955 /* First get the size of the result */
1956 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001957 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001958 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1959
1960 v = _PyUnicode_New(usize);
1961 if (v == NULL)
1962 return NULL;
1963 if (usize == 0)
1964 return (PyObject *)v;
1965 p = PyUnicode_AS_UNICODE(v);
1966 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1967 Py_DECREF(v);
1968 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1969 }
1970
1971 return (PyObject *)v;
1972}
1973
1974PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1975 int size,
1976 const char *errors)
1977{
1978 PyObject *repr;
1979 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001980 DWORD mbcssize;
1981
1982 /* If there are no characters, bail now! */
1983 if (size==0)
1984 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001985
1986 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001987 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001988 if (mbcssize==0)
1989 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1990
1991 repr = PyString_FromStringAndSize(NULL, mbcssize);
1992 if (repr == NULL)
1993 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001994 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001995 return repr;
1996
1997 /* Do the conversion */
1998 s = PyString_AS_STRING(repr);
1999 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2000 Py_DECREF(repr);
2001 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2002 }
2003 return repr;
2004}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002005
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002006#endif /* MS_WIN32 */
2007
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008/* --- Character Mapping Codec -------------------------------------------- */
2009
2010static
2011int charmap_decoding_error(const char **source,
2012 Py_UNICODE **dest,
2013 const char *errors,
2014 const char *details)
2015{
2016 if ((errors == NULL) ||
2017 (strcmp(errors,"strict") == 0)) {
2018 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002019 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020 details);
2021 return -1;
2022 }
2023 else if (strcmp(errors,"ignore") == 0) {
2024 return 0;
2025 }
2026 else if (strcmp(errors,"replace") == 0) {
2027 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2028 (*dest)++;
2029 return 0;
2030 }
2031 else {
2032 PyErr_Format(PyExc_ValueError,
2033 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002034 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 errors);
2036 return -1;
2037 }
2038}
2039
2040PyObject *PyUnicode_DecodeCharmap(const char *s,
2041 int size,
2042 PyObject *mapping,
2043 const char *errors)
2044{
2045 PyUnicodeObject *v;
2046 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002047 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048
2049 /* Default to Latin-1 */
2050 if (mapping == NULL)
2051 return PyUnicode_DecodeLatin1(s, size, errors);
2052
2053 v = _PyUnicode_New(size);
2054 if (v == NULL)
2055 goto onError;
2056 if (size == 0)
2057 return (PyObject *)v;
2058 p = PyUnicode_AS_UNICODE(v);
2059 while (size-- > 0) {
2060 unsigned char ch = *s++;
2061 PyObject *w, *x;
2062
2063 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2064 w = PyInt_FromLong((long)ch);
2065 if (w == NULL)
2066 goto onError;
2067 x = PyObject_GetItem(mapping, w);
2068 Py_DECREF(w);
2069 if (x == NULL) {
2070 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002071 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002073 x = Py_None;
2074 Py_INCREF(x);
2075 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002076 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 }
2078
2079 /* Apply mapping */
2080 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002081 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 if (value < 0 || value > 65535) {
2083 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002084 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 Py_DECREF(x);
2086 goto onError;
2087 }
2088 *p++ = (Py_UNICODE)value;
2089 }
2090 else if (x == Py_None) {
2091 /* undefined mapping */
2092 if (charmap_decoding_error(&s, &p, errors,
2093 "character maps to <undefined>")) {
2094 Py_DECREF(x);
2095 goto onError;
2096 }
2097 }
2098 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002099 int targetsize = PyUnicode_GET_SIZE(x);
2100
2101 if (targetsize == 1)
2102 /* 1-1 mapping */
2103 *p++ = *PyUnicode_AS_UNICODE(x);
2104
2105 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002107 if (targetsize > extrachars) {
2108 /* resize first */
2109 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2110 int needed = (targetsize - extrachars) + \
2111 (targetsize << 2);
2112 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002113 if (_PyUnicode_Resize(&v,
2114 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002115 Py_DECREF(x);
2116 goto onError;
2117 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002118 p = PyUnicode_AS_UNICODE(v) + oldpos;
2119 }
2120 Py_UNICODE_COPY(p,
2121 PyUnicode_AS_UNICODE(x),
2122 targetsize);
2123 p += targetsize;
2124 extrachars -= targetsize;
2125 }
2126 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 }
2128 else {
2129 /* wrong return value */
2130 PyErr_SetString(PyExc_TypeError,
2131 "character mapping must return integer, None or unicode");
2132 Py_DECREF(x);
2133 goto onError;
2134 }
2135 Py_DECREF(x);
2136 }
2137 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002138 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 goto onError;
2140 return (PyObject *)v;
2141
2142 onError:
2143 Py_XDECREF(v);
2144 return NULL;
2145}
2146
2147static
2148int charmap_encoding_error(const Py_UNICODE **source,
2149 char **dest,
2150 const char *errors,
2151 const char *details)
2152{
2153 if ((errors == NULL) ||
2154 (strcmp(errors,"strict") == 0)) {
2155 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002156 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 details);
2158 return -1;
2159 }
2160 else if (strcmp(errors,"ignore") == 0) {
2161 return 0;
2162 }
2163 else if (strcmp(errors,"replace") == 0) {
2164 **dest = '?';
2165 (*dest)++;
2166 return 0;
2167 }
2168 else {
2169 PyErr_Format(PyExc_ValueError,
2170 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002171 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 errors);
2173 return -1;
2174 }
2175}
2176
2177PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2178 int size,
2179 PyObject *mapping,
2180 const char *errors)
2181{
2182 PyObject *v;
2183 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002184 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185
2186 /* Default to Latin-1 */
2187 if (mapping == NULL)
2188 return PyUnicode_EncodeLatin1(p, size, errors);
2189
2190 v = PyString_FromStringAndSize(NULL, size);
2191 if (v == NULL)
2192 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002193 if (size == 0)
2194 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 s = PyString_AS_STRING(v);
2196 while (size-- > 0) {
2197 Py_UNICODE ch = *p++;
2198 PyObject *w, *x;
2199
2200 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2201 w = PyInt_FromLong((long)ch);
2202 if (w == NULL)
2203 goto onError;
2204 x = PyObject_GetItem(mapping, w);
2205 Py_DECREF(w);
2206 if (x == NULL) {
2207 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002208 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002210 x = Py_None;
2211 Py_INCREF(x);
2212 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002213 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 }
2215
2216 /* Apply mapping */
2217 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002218 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 if (value < 0 || value > 255) {
2220 PyErr_SetString(PyExc_TypeError,
2221 "character mapping must be in range(256)");
2222 Py_DECREF(x);
2223 goto onError;
2224 }
2225 *s++ = (char)value;
2226 }
2227 else if (x == Py_None) {
2228 /* undefined mapping */
2229 if (charmap_encoding_error(&p, &s, errors,
2230 "character maps to <undefined>")) {
2231 Py_DECREF(x);
2232 goto onError;
2233 }
2234 }
2235 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002236 int targetsize = PyString_GET_SIZE(x);
2237
2238 if (targetsize == 1)
2239 /* 1-1 mapping */
2240 *s++ = *PyString_AS_STRING(x);
2241
2242 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002244 if (targetsize > extrachars) {
2245 /* resize first */
2246 int oldpos = (int)(s - PyString_AS_STRING(v));
2247 int needed = (targetsize - extrachars) + \
2248 (targetsize << 2);
2249 extrachars += needed;
2250 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002251 Py_DECREF(x);
2252 goto onError;
2253 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002254 s = PyString_AS_STRING(v) + oldpos;
2255 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002256 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002257 s += targetsize;
2258 extrachars -= targetsize;
2259 }
2260 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 }
2262 else {
2263 /* wrong return value */
2264 PyErr_SetString(PyExc_TypeError,
2265 "character mapping must return integer, None or unicode");
2266 Py_DECREF(x);
2267 goto onError;
2268 }
2269 Py_DECREF(x);
2270 }
2271 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2272 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2273 goto onError;
2274 return v;
2275
2276 onError:
2277 Py_DECREF(v);
2278 return NULL;
2279}
2280
2281PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2282 PyObject *mapping)
2283{
2284 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2285 PyErr_BadArgument();
2286 return NULL;
2287 }
2288 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2289 PyUnicode_GET_SIZE(unicode),
2290 mapping,
2291 NULL);
2292}
2293
2294static
2295int translate_error(const Py_UNICODE **source,
2296 Py_UNICODE **dest,
2297 const char *errors,
2298 const char *details)
2299{
2300 if ((errors == NULL) ||
2301 (strcmp(errors,"strict") == 0)) {
2302 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002303 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 details);
2305 return -1;
2306 }
2307 else if (strcmp(errors,"ignore") == 0) {
2308 return 0;
2309 }
2310 else if (strcmp(errors,"replace") == 0) {
2311 **dest = '?';
2312 (*dest)++;
2313 return 0;
2314 }
2315 else {
2316 PyErr_Format(PyExc_ValueError,
2317 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002318 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 errors);
2320 return -1;
2321 }
2322}
2323
2324PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2325 int size,
2326 PyObject *mapping,
2327 const char *errors)
2328{
2329 PyUnicodeObject *v;
2330 Py_UNICODE *p;
2331
2332 if (mapping == NULL) {
2333 PyErr_BadArgument();
2334 return NULL;
2335 }
2336
2337 /* Output will never be longer than input */
2338 v = _PyUnicode_New(size);
2339 if (v == NULL)
2340 goto onError;
2341 if (size == 0)
2342 goto done;
2343 p = PyUnicode_AS_UNICODE(v);
2344 while (size-- > 0) {
2345 Py_UNICODE ch = *s++;
2346 PyObject *w, *x;
2347
2348 /* Get mapping */
2349 w = PyInt_FromLong(ch);
2350 if (w == NULL)
2351 goto onError;
2352 x = PyObject_GetItem(mapping, w);
2353 Py_DECREF(w);
2354 if (x == NULL) {
2355 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2356 /* No mapping found: default to 1-1 mapping */
2357 PyErr_Clear();
2358 *p++ = ch;
2359 continue;
2360 }
2361 goto onError;
2362 }
2363
2364 /* Apply mapping */
2365 if (PyInt_Check(x))
2366 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2367 else if (x == Py_None) {
2368 /* undefined mapping */
2369 if (translate_error(&s, &p, errors,
2370 "character maps to <undefined>")) {
2371 Py_DECREF(x);
2372 goto onError;
2373 }
2374 }
2375 else if (PyUnicode_Check(x)) {
2376 if (PyUnicode_GET_SIZE(x) != 1) {
2377 /* 1-n mapping */
2378 PyErr_SetString(PyExc_NotImplementedError,
2379 "1-n mappings are currently not implemented");
2380 Py_DECREF(x);
2381 goto onError;
2382 }
2383 *p++ = *PyUnicode_AS_UNICODE(x);
2384 }
2385 else {
2386 /* wrong return value */
2387 PyErr_SetString(PyExc_TypeError,
2388 "translate mapping must return integer, None or unicode");
2389 Py_DECREF(x);
2390 goto onError;
2391 }
2392 Py_DECREF(x);
2393 }
2394 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002395 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002396 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397
2398 done:
2399 return (PyObject *)v;
2400
2401 onError:
2402 Py_XDECREF(v);
2403 return NULL;
2404}
2405
2406PyObject *PyUnicode_Translate(PyObject *str,
2407 PyObject *mapping,
2408 const char *errors)
2409{
2410 PyObject *result;
2411
2412 str = PyUnicode_FromObject(str);
2413 if (str == NULL)
2414 goto onError;
2415 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2416 PyUnicode_GET_SIZE(str),
2417 mapping,
2418 errors);
2419 Py_DECREF(str);
2420 return result;
2421
2422 onError:
2423 Py_XDECREF(str);
2424 return NULL;
2425}
2426
Guido van Rossum9e896b32000-04-05 20:11:21 +00002427/* --- Decimal Encoder ---------------------------------------------------- */
2428
2429int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2430 int length,
2431 char *output,
2432 const char *errors)
2433{
2434 Py_UNICODE *p, *end;
2435
2436 if (output == NULL) {
2437 PyErr_BadArgument();
2438 return -1;
2439 }
2440
2441 p = s;
2442 end = s + length;
2443 while (p < end) {
2444 register Py_UNICODE ch = *p++;
2445 int decimal;
2446
2447 if (Py_UNICODE_ISSPACE(ch)) {
2448 *output++ = ' ';
2449 continue;
2450 }
2451 decimal = Py_UNICODE_TODECIMAL(ch);
2452 if (decimal >= 0) {
2453 *output++ = '0' + decimal;
2454 continue;
2455 }
Guido van Rossumba477042000-04-06 18:18:10 +00002456 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002457 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002458 continue;
2459 }
2460 /* All other characters are considered invalid */
2461 if (errors == NULL || strcmp(errors, "strict") == 0) {
2462 PyErr_SetString(PyExc_ValueError,
2463 "invalid decimal Unicode string");
2464 goto onError;
2465 }
2466 else if (strcmp(errors, "ignore") == 0)
2467 continue;
2468 else if (strcmp(errors, "replace") == 0) {
2469 *output++ = '?';
2470 continue;
2471 }
2472 }
2473 /* 0-terminate the output string */
2474 *output++ = '\0';
2475 return 0;
2476
2477 onError:
2478 return -1;
2479}
2480
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481/* --- Helpers ------------------------------------------------------------ */
2482
2483static
2484int count(PyUnicodeObject *self,
2485 int start,
2486 int end,
2487 PyUnicodeObject *substring)
2488{
2489 int count = 0;
2490
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002491 if (start < 0)
2492 start += self->length;
2493 if (start < 0)
2494 start = 0;
2495 if (end > self->length)
2496 end = self->length;
2497 if (end < 0)
2498 end += self->length;
2499 if (end < 0)
2500 end = 0;
2501
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002502 if (substring->length == 0)
2503 return (end - start + 1);
2504
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 end -= substring->length;
2506
2507 while (start <= end)
2508 if (Py_UNICODE_MATCH(self, start, substring)) {
2509 count++;
2510 start += substring->length;
2511 } else
2512 start++;
2513
2514 return count;
2515}
2516
2517int PyUnicode_Count(PyObject *str,
2518 PyObject *substr,
2519 int start,
2520 int end)
2521{
2522 int result;
2523
2524 str = PyUnicode_FromObject(str);
2525 if (str == NULL)
2526 return -1;
2527 substr = PyUnicode_FromObject(substr);
2528 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002529 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 return -1;
2531 }
2532
2533 result = count((PyUnicodeObject *)str,
2534 start, end,
2535 (PyUnicodeObject *)substr);
2536
2537 Py_DECREF(str);
2538 Py_DECREF(substr);
2539 return result;
2540}
2541
2542static
2543int findstring(PyUnicodeObject *self,
2544 PyUnicodeObject *substring,
2545 int start,
2546 int end,
2547 int direction)
2548{
2549 if (start < 0)
2550 start += self->length;
2551 if (start < 0)
2552 start = 0;
2553
2554 if (substring->length == 0)
2555 return start;
2556
2557 if (end > self->length)
2558 end = self->length;
2559 if (end < 0)
2560 end += self->length;
2561 if (end < 0)
2562 end = 0;
2563
2564 end -= substring->length;
2565
2566 if (direction < 0) {
2567 for (; end >= start; end--)
2568 if (Py_UNICODE_MATCH(self, end, substring))
2569 return end;
2570 } else {
2571 for (; start <= end; start++)
2572 if (Py_UNICODE_MATCH(self, start, substring))
2573 return start;
2574 }
2575
2576 return -1;
2577}
2578
2579int PyUnicode_Find(PyObject *str,
2580 PyObject *substr,
2581 int start,
2582 int end,
2583 int direction)
2584{
2585 int result;
2586
2587 str = PyUnicode_FromObject(str);
2588 if (str == NULL)
2589 return -1;
2590 substr = PyUnicode_FromObject(substr);
2591 if (substr == NULL) {
2592 Py_DECREF(substr);
2593 return -1;
2594 }
2595
2596 result = findstring((PyUnicodeObject *)str,
2597 (PyUnicodeObject *)substr,
2598 start, end, direction);
2599 Py_DECREF(str);
2600 Py_DECREF(substr);
2601 return result;
2602}
2603
2604static
2605int tailmatch(PyUnicodeObject *self,
2606 PyUnicodeObject *substring,
2607 int start,
2608 int end,
2609 int direction)
2610{
2611 if (start < 0)
2612 start += self->length;
2613 if (start < 0)
2614 start = 0;
2615
2616 if (substring->length == 0)
2617 return 1;
2618
2619 if (end > self->length)
2620 end = self->length;
2621 if (end < 0)
2622 end += self->length;
2623 if (end < 0)
2624 end = 0;
2625
2626 end -= substring->length;
2627 if (end < start)
2628 return 0;
2629
2630 if (direction > 0) {
2631 if (Py_UNICODE_MATCH(self, end, substring))
2632 return 1;
2633 } else {
2634 if (Py_UNICODE_MATCH(self, start, substring))
2635 return 1;
2636 }
2637
2638 return 0;
2639}
2640
2641int PyUnicode_Tailmatch(PyObject *str,
2642 PyObject *substr,
2643 int start,
2644 int end,
2645 int direction)
2646{
2647 int result;
2648
2649 str = PyUnicode_FromObject(str);
2650 if (str == NULL)
2651 return -1;
2652 substr = PyUnicode_FromObject(substr);
2653 if (substr == NULL) {
2654 Py_DECREF(substr);
2655 return -1;
2656 }
2657
2658 result = tailmatch((PyUnicodeObject *)str,
2659 (PyUnicodeObject *)substr,
2660 start, end, direction);
2661 Py_DECREF(str);
2662 Py_DECREF(substr);
2663 return result;
2664}
2665
2666static
2667const Py_UNICODE *findchar(const Py_UNICODE *s,
2668 int size,
2669 Py_UNICODE ch)
2670{
2671 /* like wcschr, but doesn't stop at NULL characters */
2672
2673 while (size-- > 0) {
2674 if (*s == ch)
2675 return s;
2676 s++;
2677 }
2678
2679 return NULL;
2680}
2681
2682/* Apply fixfct filter to the Unicode object self and return a
2683 reference to the modified object */
2684
2685static
2686PyObject *fixup(PyUnicodeObject *self,
2687 int (*fixfct)(PyUnicodeObject *s))
2688{
2689
2690 PyUnicodeObject *u;
2691
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002692 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 if (u == NULL)
2694 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002695
2696 Py_UNICODE_COPY(u->str, self->str, self->length);
2697
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 if (!fixfct(u)) {
2699 /* fixfct should return TRUE if it modified the buffer. If
2700 FALSE, return a reference to the original buffer instead
2701 (to save space, not time) */
2702 Py_INCREF(self);
2703 Py_DECREF(u);
2704 return (PyObject*) self;
2705 }
2706 return (PyObject*) u;
2707}
2708
2709static
2710int fixupper(PyUnicodeObject *self)
2711{
2712 int len = self->length;
2713 Py_UNICODE *s = self->str;
2714 int status = 0;
2715
2716 while (len-- > 0) {
2717 register Py_UNICODE ch;
2718
2719 ch = Py_UNICODE_TOUPPER(*s);
2720 if (ch != *s) {
2721 status = 1;
2722 *s = ch;
2723 }
2724 s++;
2725 }
2726
2727 return status;
2728}
2729
2730static
2731int fixlower(PyUnicodeObject *self)
2732{
2733 int len = self->length;
2734 Py_UNICODE *s = self->str;
2735 int status = 0;
2736
2737 while (len-- > 0) {
2738 register Py_UNICODE ch;
2739
2740 ch = Py_UNICODE_TOLOWER(*s);
2741 if (ch != *s) {
2742 status = 1;
2743 *s = ch;
2744 }
2745 s++;
2746 }
2747
2748 return status;
2749}
2750
2751static
2752int fixswapcase(PyUnicodeObject *self)
2753{
2754 int len = self->length;
2755 Py_UNICODE *s = self->str;
2756 int status = 0;
2757
2758 while (len-- > 0) {
2759 if (Py_UNICODE_ISUPPER(*s)) {
2760 *s = Py_UNICODE_TOLOWER(*s);
2761 status = 1;
2762 } else if (Py_UNICODE_ISLOWER(*s)) {
2763 *s = Py_UNICODE_TOUPPER(*s);
2764 status = 1;
2765 }
2766 s++;
2767 }
2768
2769 return status;
2770}
2771
2772static
2773int fixcapitalize(PyUnicodeObject *self)
2774{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002775 int len = self->length;
2776 Py_UNICODE *s = self->str;
2777 int status = 0;
2778
2779 if (len == 0)
2780 return 0;
2781 if (Py_UNICODE_ISLOWER(*s)) {
2782 *s = Py_UNICODE_TOUPPER(*s);
2783 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002785 s++;
2786 while (--len > 0) {
2787 if (Py_UNICODE_ISUPPER(*s)) {
2788 *s = Py_UNICODE_TOLOWER(*s);
2789 status = 1;
2790 }
2791 s++;
2792 }
2793 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794}
2795
2796static
2797int fixtitle(PyUnicodeObject *self)
2798{
2799 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2800 register Py_UNICODE *e;
2801 int previous_is_cased;
2802
2803 /* Shortcut for single character strings */
2804 if (PyUnicode_GET_SIZE(self) == 1) {
2805 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2806 if (*p != ch) {
2807 *p = ch;
2808 return 1;
2809 }
2810 else
2811 return 0;
2812 }
2813
2814 e = p + PyUnicode_GET_SIZE(self);
2815 previous_is_cased = 0;
2816 for (; p < e; p++) {
2817 register const Py_UNICODE ch = *p;
2818
2819 if (previous_is_cased)
2820 *p = Py_UNICODE_TOLOWER(ch);
2821 else
2822 *p = Py_UNICODE_TOTITLE(ch);
2823
2824 if (Py_UNICODE_ISLOWER(ch) ||
2825 Py_UNICODE_ISUPPER(ch) ||
2826 Py_UNICODE_ISTITLE(ch))
2827 previous_is_cased = 1;
2828 else
2829 previous_is_cased = 0;
2830 }
2831 return 1;
2832}
2833
2834PyObject *PyUnicode_Join(PyObject *separator,
2835 PyObject *seq)
2836{
2837 Py_UNICODE *sep;
2838 int seplen;
2839 PyUnicodeObject *res = NULL;
2840 int reslen = 0;
2841 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 int sz = 100;
2843 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002844 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845
Tim Peters2cfe3682001-05-05 05:36:48 +00002846 it = PyObject_GetIter(seq);
2847 if (it == NULL)
2848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849
2850 if (separator == NULL) {
2851 Py_UNICODE blank = ' ';
2852 sep = &blank;
2853 seplen = 1;
2854 }
2855 else {
2856 separator = PyUnicode_FromObject(separator);
2857 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002858 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 sep = PyUnicode_AS_UNICODE(separator);
2860 seplen = PyUnicode_GET_SIZE(separator);
2861 }
2862
2863 res = _PyUnicode_New(sz);
2864 if (res == NULL)
2865 goto onError;
2866 p = PyUnicode_AS_UNICODE(res);
2867 reslen = 0;
2868
Tim Peters2cfe3682001-05-05 05:36:48 +00002869 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002871 PyObject *item = PyIter_Next(it);
2872 if (item == NULL) {
2873 if (PyErr_Occurred())
2874 goto onError;
2875 break;
2876 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 if (!PyUnicode_Check(item)) {
2878 PyObject *v;
2879 v = PyUnicode_FromObject(item);
2880 Py_DECREF(item);
2881 item = v;
2882 if (item == NULL)
2883 goto onError;
2884 }
2885 itemlen = PyUnicode_GET_SIZE(item);
2886 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002887 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 goto onError;
2889 sz *= 2;
2890 p = PyUnicode_AS_UNICODE(res) + reslen;
2891 }
2892 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002893 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 p += seplen;
2895 reslen += seplen;
2896 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002897 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 p += itemlen;
2899 reslen += itemlen;
2900 Py_DECREF(item);
2901 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002902 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 goto onError;
2904
2905 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002906 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 return (PyObject *)res;
2908
2909 onError:
2910 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002911 Py_XDECREF(res);
2912 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913 return NULL;
2914}
2915
2916static
2917PyUnicodeObject *pad(PyUnicodeObject *self,
2918 int left,
2919 int right,
2920 Py_UNICODE fill)
2921{
2922 PyUnicodeObject *u;
2923
2924 if (left < 0)
2925 left = 0;
2926 if (right < 0)
2927 right = 0;
2928
2929 if (left == 0 && right == 0) {
2930 Py_INCREF(self);
2931 return self;
2932 }
2933
2934 u = _PyUnicode_New(left + self->length + right);
2935 if (u) {
2936 if (left)
2937 Py_UNICODE_FILL(u->str, fill, left);
2938 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2939 if (right)
2940 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2941 }
2942
2943 return u;
2944}
2945
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The macro relies on names from the expanding function: a
   `PyObject *str` scratch variable, the target `list`, and an `onError`
   label that is jumped to when creating or appending the slice fails.
   The temporary reference held in `str` is always released.

   The body is wrapped in do { } while (0) so that the expansion is a
   single statement, and the arguments are parenthesized.  Note that
   `left` is evaluated twice. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2956
2957static
2958PyObject *split_whitespace(PyUnicodeObject *self,
2959 PyObject *list,
2960 int maxcount)
2961{
2962 register int i;
2963 register int j;
2964 int len = self->length;
2965 PyObject *str;
2966
2967 for (i = j = 0; i < len; ) {
2968 /* find a token */
2969 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2970 i++;
2971 j = i;
2972 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2973 i++;
2974 if (j < i) {
2975 if (maxcount-- <= 0)
2976 break;
2977 SPLIT_APPEND(self->str, j, i);
2978 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2979 i++;
2980 j = i;
2981 }
2982 }
2983 if (j < len) {
2984 SPLIT_APPEND(self->str, j, len);
2985 }
2986 return list;
2987
2988 onError:
2989 Py_DECREF(list);
2990 return NULL;
2991}
2992
2993PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002994 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995{
2996 register int i;
2997 register int j;
2998 int len;
2999 PyObject *list;
3000 PyObject *str;
3001 Py_UNICODE *data;
3002
3003 string = PyUnicode_FromObject(string);
3004 if (string == NULL)
3005 return NULL;
3006 data = PyUnicode_AS_UNICODE(string);
3007 len = PyUnicode_GET_SIZE(string);
3008
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 list = PyList_New(0);
3010 if (!list)
3011 goto onError;
3012
3013 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003014 int eol;
3015
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 /* Find a line and append it */
3017 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3018 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019
3020 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003021 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 if (i < len) {
3023 if (data[i] == '\r' && i + 1 < len &&
3024 data[i+1] == '\n')
3025 i += 2;
3026 else
3027 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003028 if (keepends)
3029 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
Guido van Rossum86662912000-04-11 15:38:46 +00003031 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 j = i;
3033 }
3034 if (j < len) {
3035 SPLIT_APPEND(data, j, len);
3036 }
3037
3038 Py_DECREF(string);
3039 return list;
3040
3041 onError:
3042 Py_DECREF(list);
3043 Py_DECREF(string);
3044 return NULL;
3045}
3046
3047static
3048PyObject *split_char(PyUnicodeObject *self,
3049 PyObject *list,
3050 Py_UNICODE ch,
3051 int maxcount)
3052{
3053 register int i;
3054 register int j;
3055 int len = self->length;
3056 PyObject *str;
3057
3058 for (i = j = 0; i < len; ) {
3059 if (self->str[i] == ch) {
3060 if (maxcount-- <= 0)
3061 break;
3062 SPLIT_APPEND(self->str, j, i);
3063 i = j = i + 1;
3064 } else
3065 i++;
3066 }
3067 if (j <= len) {
3068 SPLIT_APPEND(self->str, j, len);
3069 }
3070 return list;
3071
3072 onError:
3073 Py_DECREF(list);
3074 return NULL;
3075}
3076
3077static
3078PyObject *split_substring(PyUnicodeObject *self,
3079 PyObject *list,
3080 PyUnicodeObject *substring,
3081 int maxcount)
3082{
3083 register int i;
3084 register int j;
3085 int len = self->length;
3086 int sublen = substring->length;
3087 PyObject *str;
3088
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003089 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 if (Py_UNICODE_MATCH(self, i, substring)) {
3091 if (maxcount-- <= 0)
3092 break;
3093 SPLIT_APPEND(self->str, j, i);
3094 i = j = i + sublen;
3095 } else
3096 i++;
3097 }
3098 if (j <= len) {
3099 SPLIT_APPEND(self->str, j, len);
3100 }
3101 return list;
3102
3103 onError:
3104 Py_DECREF(list);
3105 return NULL;
3106}
3107
3108#undef SPLIT_APPEND
3109
3110static
3111PyObject *split(PyUnicodeObject *self,
3112 PyUnicodeObject *substring,
3113 int maxcount)
3114{
3115 PyObject *list;
3116
3117 if (maxcount < 0)
3118 maxcount = INT_MAX;
3119
3120 list = PyList_New(0);
3121 if (!list)
3122 return NULL;
3123
3124 if (substring == NULL)
3125 return split_whitespace(self,list,maxcount);
3126
3127 else if (substring->length == 1)
3128 return split_char(self,list,substring->str[0],maxcount);
3129
3130 else if (substring->length == 0) {
3131 Py_DECREF(list);
3132 PyErr_SetString(PyExc_ValueError, "empty separator");
3133 return NULL;
3134 }
3135 else
3136 return split_substring(self,list,substring,maxcount);
3137}
3138
3139static
3140PyObject *strip(PyUnicodeObject *self,
3141 int left,
3142 int right)
3143{
3144 Py_UNICODE *p = self->str;
3145 int start = 0;
3146 int end = self->length;
3147
3148 if (left)
3149 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3150 start++;
3151
3152 if (right)
3153 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3154 end--;
3155
3156 if (start == 0 && end == self->length) {
3157 /* couldn't strip anything off, return original string */
3158 Py_INCREF(self);
3159 return (PyObject*) self;
3160 }
3161
3162 return (PyObject*) PyUnicode_FromUnicode(
3163 self->str + start,
3164 end - start
3165 );
3166}
3167
3168static
3169PyObject *replace(PyUnicodeObject *self,
3170 PyUnicodeObject *str1,
3171 PyUnicodeObject *str2,
3172 int maxcount)
3173{
3174 PyUnicodeObject *u;
3175
3176 if (maxcount < 0)
3177 maxcount = INT_MAX;
3178
3179 if (str1->length == 1 && str2->length == 1) {
3180 int i;
3181
3182 /* replace characters */
3183 if (!findchar(self->str, self->length, str1->str[0])) {
3184 /* nothing to replace, return original string */
3185 Py_INCREF(self);
3186 u = self;
3187 } else {
3188 Py_UNICODE u1 = str1->str[0];
3189 Py_UNICODE u2 = str2->str[0];
3190
3191 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003192 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 self->length
3194 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003195 if (u != NULL) {
3196 Py_UNICODE_COPY(u->str, self->str,
3197 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 for (i = 0; i < u->length; i++)
3199 if (u->str[i] == u1) {
3200 if (--maxcount < 0)
3201 break;
3202 u->str[i] = u2;
3203 }
3204 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
3207 } else {
3208 int n, i;
3209 Py_UNICODE *p;
3210
3211 /* replace strings */
3212 n = count(self, 0, self->length, str1);
3213 if (n > maxcount)
3214 n = maxcount;
3215 if (n == 0) {
3216 /* nothing to replace, return original string */
3217 Py_INCREF(self);
3218 u = self;
3219 } else {
3220 u = _PyUnicode_New(
3221 self->length + n * (str2->length - str1->length));
3222 if (u) {
3223 i = 0;
3224 p = u->str;
3225 while (i <= self->length - str1->length)
3226 if (Py_UNICODE_MATCH(self, i, str1)) {
3227 /* replace string segment */
3228 Py_UNICODE_COPY(p, str2->str, str2->length);
3229 p += str2->length;
3230 i += str1->length;
3231 if (--n <= 0) {
3232 /* copy remaining part */
3233 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3234 break;
3235 }
3236 } else
3237 *p++ = self->str[i++];
3238 }
3239 }
3240 }
3241
3242 return (PyObject *) u;
3243}
3244
3245/* --- Unicode Object Methods --------------------------------------------- */
3246
3247static char title__doc__[] =
3248"S.title() -> unicode\n\
3249\n\
3250Return a titlecased version of S, i.e. words start with title case\n\
3251characters, all remaining cased characters have lower case.";
3252
3253static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003254unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 return fixup(self, fixtitle);
3257}
3258
3259static char capitalize__doc__[] =
3260"S.capitalize() -> unicode\n\
3261\n\
3262Return a capitalized version of S, i.e. make the first character\n\
3263have upper case.";
3264
3265static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003266unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 return fixup(self, fixcapitalize);
3269}
3270
3271#if 0
3272static char capwords__doc__[] =
3273"S.capwords() -> unicode\n\
3274\n\
3275Apply .capitalize() to all words in S and return the result with\n\
3276normalized whitespace (all whitespace strings are replaced by ' ').";
3277
3278static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003279unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280{
3281 PyObject *list;
3282 PyObject *item;
3283 int i;
3284
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 /* Split into words */
3286 list = split(self, NULL, -1);
3287 if (!list)
3288 return NULL;
3289
3290 /* Capitalize each word */
3291 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3292 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3293 fixcapitalize);
3294 if (item == NULL)
3295 goto onError;
3296 Py_DECREF(PyList_GET_ITEM(list, i));
3297 PyList_SET_ITEM(list, i, item);
3298 }
3299
3300 /* Join the words to form a new string */
3301 item = PyUnicode_Join(NULL, list);
3302
3303onError:
3304 Py_DECREF(list);
3305 return (PyObject *)item;
3306}
3307#endif
3308
3309static char center__doc__[] =
3310"S.center(width) -> unicode\n\
3311\n\
3312Return S centered in a Unicode string of length width. Padding is done\n\
3313using spaces.";
3314
3315static PyObject *
3316unicode_center(PyUnicodeObject *self, PyObject *args)
3317{
3318 int marg, left;
3319 int width;
3320
3321 if (!PyArg_ParseTuple(args, "i:center", &width))
3322 return NULL;
3323
3324 if (self->length >= width) {
3325 Py_INCREF(self);
3326 return (PyObject*) self;
3327 }
3328
3329 marg = width - self->length;
3330 left = marg / 2 + (marg & width & 1);
3331
3332 return (PyObject*) pad(self, left, marg - left, ' ');
3333}
3334
Marc-André Lemburge5034372000-08-08 08:04:29 +00003335#if 0
3336
/* This code should go into some future Unicode collation support
   module. The basic comparison should compare ordinals on a naive
   basis (this is what Java does and thus JPython too). */

/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Adjustment table indexed by the top five bits of a 16-bit code unit
   (c >> 11).  Entry 27 (units 0xD800-0xDFFF) adds 0x2000 and entries
   28-31 (units 0xE000-0xFFFF) subtract 0x800; every other entry is 0.
   After adjustment the 0xD800-0xDFFF units compare greater than the
   0xE000-0xFFFF units. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare str1 and str2 unit by unit after applying utf16Fixup to each
   unit.  Returns -1, 0 or 1.  If all compared units are equal, the
   shorter string sorts first.
   NOTE(review): the table has 32 entries, so this presumably assumes
   Py_UNICODE values never exceed 0xFFFF -- confirm before enabling. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    int len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: only units above that value can have
           a non-zero table entry, so smaller ones skip the lookup. */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* One string is a prefix of the other (or they are equal):
       order by remaining length. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
3384
Marc-André Lemburge5034372000-08-08 08:04:29 +00003385#else
3386
3387static int
3388unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3389{
3390 register int len1, len2;
3391
3392 Py_UNICODE *s1 = str1->str;
3393 Py_UNICODE *s2 = str2->str;
3394
3395 len1 = str1->length;
3396 len2 = str2->length;
3397
3398 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003399 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003400
Fredrik Lundh45714e92001-06-26 16:39:36 +00003401 c1 = *s1++;
3402 c2 = *s2++;
3403
3404 if (c1 != c2)
3405 return (c1 < c2) ? -1 : 1;
3406
Marc-André Lemburge5034372000-08-08 08:04:29 +00003407 len1--; len2--;
3408 }
3409
3410 return (len1 < len2) ? -1 : (len1 != len2);
3411}
3412
3413#endif
3414
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415int PyUnicode_Compare(PyObject *left,
3416 PyObject *right)
3417{
3418 PyUnicodeObject *u = NULL, *v = NULL;
3419 int result;
3420
3421 /* Coerce the two arguments */
3422 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3423 if (u == NULL)
3424 goto onError;
3425 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3426 if (v == NULL)
3427 goto onError;
3428
Thomas Wouters7e474022000-07-16 12:04:32 +00003429 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 if (v == u) {
3431 Py_DECREF(u);
3432 Py_DECREF(v);
3433 return 0;
3434 }
3435
3436 result = unicode_compare(u, v);
3437
3438 Py_DECREF(u);
3439 Py_DECREF(v);
3440 return result;
3441
3442onError:
3443 Py_XDECREF(u);
3444 Py_XDECREF(v);
3445 return -1;
3446}
3447
Guido van Rossum403d68b2000-03-13 15:55:09 +00003448int PyUnicode_Contains(PyObject *container,
3449 PyObject *element)
3450{
3451 PyUnicodeObject *u = NULL, *v = NULL;
3452 int result;
3453 register const Py_UNICODE *p, *e;
3454 register Py_UNICODE ch;
3455
3456 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003457 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003458 if (v == NULL) {
3459 PyErr_SetString(PyExc_TypeError,
3460 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003461 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003462 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003463 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3464 if (u == NULL) {
3465 Py_DECREF(v);
3466 goto onError;
3467 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003468
3469 /* Check v in u */
3470 if (PyUnicode_GET_SIZE(v) != 1) {
3471 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003472 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003473 goto onError;
3474 }
3475 ch = *PyUnicode_AS_UNICODE(v);
3476 p = PyUnicode_AS_UNICODE(u);
3477 e = p + PyUnicode_GET_SIZE(u);
3478 result = 0;
3479 while (p < e) {
3480 if (*p++ == ch) {
3481 result = 1;
3482 break;
3483 }
3484 }
3485
3486 Py_DECREF(u);
3487 Py_DECREF(v);
3488 return result;
3489
3490onError:
3491 Py_XDECREF(u);
3492 Py_XDECREF(v);
3493 return -1;
3494}
3495
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496/* Concat to string or Unicode object giving a new Unicode object. */
3497
3498PyObject *PyUnicode_Concat(PyObject *left,
3499 PyObject *right)
3500{
3501 PyUnicodeObject *u = NULL, *v = NULL, *w;
3502
3503 /* Coerce the two arguments */
3504 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3505 if (u == NULL)
3506 goto onError;
3507 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3508 if (v == NULL)
3509 goto onError;
3510
3511 /* Shortcuts */
3512 if (v == unicode_empty) {
3513 Py_DECREF(v);
3514 return (PyObject *)u;
3515 }
3516 if (u == unicode_empty) {
3517 Py_DECREF(u);
3518 return (PyObject *)v;
3519 }
3520
3521 /* Concat the two Unicode strings */
3522 w = _PyUnicode_New(u->length + v->length);
3523 if (w == NULL)
3524 goto onError;
3525 Py_UNICODE_COPY(w->str, u->str, u->length);
3526 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3527
3528 Py_DECREF(u);
3529 Py_DECREF(v);
3530 return (PyObject *)w;
3531
3532onError:
3533 Py_XDECREF(u);
3534 Py_XDECREF(v);
3535 return NULL;
3536}
3537
3538static char count__doc__[] =
3539"S.count(sub[, start[, end]]) -> int\n\
3540\n\
3541Return the number of occurrences of substring sub in Unicode string\n\
3542S[start:end]. Optional arguments start and end are\n\
3543interpreted as in slice notation.";
3544
3545static PyObject *
3546unicode_count(PyUnicodeObject *self, PyObject *args)
3547{
3548 PyUnicodeObject *substring;
3549 int start = 0;
3550 int end = INT_MAX;
3551 PyObject *result;
3552
Guido van Rossumb8872e62000-05-09 14:14:27 +00003553 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3554 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 return NULL;
3556
3557 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3558 (PyObject *)substring);
3559 if (substring == NULL)
3560 return NULL;
3561
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 if (start < 0)
3563 start += self->length;
3564 if (start < 0)
3565 start = 0;
3566 if (end > self->length)
3567 end = self->length;
3568 if (end < 0)
3569 end += self->length;
3570 if (end < 0)
3571 end = 0;
3572
3573 result = PyInt_FromLong((long) count(self, start, end, substring));
3574
3575 Py_DECREF(substring);
3576 return result;
3577}
3578
3579static char encode__doc__[] =
3580"S.encode([encoding[,errors]]) -> string\n\
3581\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003582Return an encoded string version of S. Default encoding is the current\n\
3583default string encoding. errors may be given to set a different error\n\
3584handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3585a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586
3587static PyObject *
3588unicode_encode(PyUnicodeObject *self, PyObject *args)
3589{
3590 char *encoding = NULL;
3591 char *errors = NULL;
3592 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3593 return NULL;
3594 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3595}
3596
3597static char expandtabs__doc__[] =
3598"S.expandtabs([tabsize]) -> unicode\n\
3599\n\
3600Return a copy of S where all tab characters are expanded using spaces.\n\
3601If tabsize is not given, a tab size of 8 characters is assumed.";
3602
3603static PyObject*
3604unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3605{
3606 Py_UNICODE *e;
3607 Py_UNICODE *p;
3608 Py_UNICODE *q;
3609 int i, j;
3610 PyUnicodeObject *u;
3611 int tabsize = 8;
3612
3613 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3614 return NULL;
3615
Thomas Wouters7e474022000-07-16 12:04:32 +00003616 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 i = j = 0;
3618 e = self->str + self->length;
3619 for (p = self->str; p < e; p++)
3620 if (*p == '\t') {
3621 if (tabsize > 0)
3622 j += tabsize - (j % tabsize);
3623 }
3624 else {
3625 j++;
3626 if (*p == '\n' || *p == '\r') {
3627 i += j;
3628 j = 0;
3629 }
3630 }
3631
3632 /* Second pass: create output string and fill it */
3633 u = _PyUnicode_New(i + j);
3634 if (!u)
3635 return NULL;
3636
3637 j = 0;
3638 q = u->str;
3639
3640 for (p = self->str; p < e; p++)
3641 if (*p == '\t') {
3642 if (tabsize > 0) {
3643 i = tabsize - (j % tabsize);
3644 j += i;
3645 while (i--)
3646 *q++ = ' ';
3647 }
3648 }
3649 else {
3650 j++;
3651 *q++ = *p;
3652 if (*p == '\n' || *p == '\r')
3653 j = 0;
3654 }
3655
3656 return (PyObject*) u;
3657}
3658
3659static char find__doc__[] =
3660"S.find(sub [,start [,end]]) -> int\n\
3661\n\
3662Return the lowest index in S where substring sub is found,\n\
3663such that sub is contained within s[start,end]. Optional\n\
3664arguments start and end are interpreted as in slice notation.\n\
3665\n\
3666Return -1 on failure.";
3667
3668static PyObject *
3669unicode_find(PyUnicodeObject *self, PyObject *args)
3670{
3671 PyUnicodeObject *substring;
3672 int start = 0;
3673 int end = INT_MAX;
3674 PyObject *result;
3675
Guido van Rossumb8872e62000-05-09 14:14:27 +00003676 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3677 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 return NULL;
3679 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3680 (PyObject *)substring);
3681 if (substring == NULL)
3682 return NULL;
3683
3684 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3685
3686 Py_DECREF(substring);
3687 return result;
3688}
3689
3690static PyObject *
3691unicode_getitem(PyUnicodeObject *self, int index)
3692{
3693 if (index < 0 || index >= self->length) {
3694 PyErr_SetString(PyExc_IndexError, "string index out of range");
3695 return NULL;
3696 }
3697
3698 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3699}
3700
3701static long
3702unicode_hash(PyUnicodeObject *self)
3703{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003704 /* Since Unicode objects compare equal to their ASCII string
3705 counterparts, they should use the individual character values
3706 as basis for their hash value. This is needed to assure that
3707 strings and Unicode objects behave in the same way as
3708 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709
Fredrik Lundhdde61642000-07-10 18:27:47 +00003710 register int len;
3711 register Py_UNICODE *p;
3712 register long x;
3713
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 if (self->hash != -1)
3715 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003716 len = PyUnicode_GET_SIZE(self);
3717 p = PyUnicode_AS_UNICODE(self);
3718 x = *p << 7;
3719 while (--len >= 0)
3720 x = (1000003*x) ^ *p++;
3721 x ^= PyUnicode_GET_SIZE(self);
3722 if (x == -1)
3723 x = -2;
3724 self->hash = x;
3725 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726}
3727
3728static char index__doc__[] =
3729"S.index(sub [,start [,end]]) -> int\n\
3730\n\
3731Like S.find() but raise ValueError when the substring is not found.";
3732
3733static PyObject *
3734unicode_index(PyUnicodeObject *self, PyObject *args)
3735{
3736 int result;
3737 PyUnicodeObject *substring;
3738 int start = 0;
3739 int end = INT_MAX;
3740
Guido van Rossumb8872e62000-05-09 14:14:27 +00003741 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3742 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 return NULL;
3744
3745 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3746 (PyObject *)substring);
3747 if (substring == NULL)
3748 return NULL;
3749
3750 result = findstring(self, substring, start, end, 1);
3751
3752 Py_DECREF(substring);
3753 if (result < 0) {
3754 PyErr_SetString(PyExc_ValueError, "substring not found");
3755 return NULL;
3756 }
3757 return PyInt_FromLong(result);
3758}
3759
3760static char islower__doc__[] =
3761"S.islower() -> int\n\
3762\n\
3763Return 1 if all cased characters in S are lowercase and there is\n\
3764at least one cased character in S, 0 otherwise.";
3765
3766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003767unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768{
3769 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3770 register const Py_UNICODE *e;
3771 int cased;
3772
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 /* Shortcut for single character strings */
3774 if (PyUnicode_GET_SIZE(self) == 1)
3775 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3776
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003777 /* Special case for empty strings */
3778 if (PyString_GET_SIZE(self) == 0)
3779 return PyInt_FromLong(0);
3780
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 e = p + PyUnicode_GET_SIZE(self);
3782 cased = 0;
3783 for (; p < e; p++) {
3784 register const Py_UNICODE ch = *p;
3785
3786 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3787 return PyInt_FromLong(0);
3788 else if (!cased && Py_UNICODE_ISLOWER(ch))
3789 cased = 1;
3790 }
3791 return PyInt_FromLong(cased);
3792}
3793
3794static char isupper__doc__[] =
3795"S.isupper() -> int\n\
3796\n\
3797Return 1 if all cased characters in S are uppercase and there is\n\
3798at least one cased character in S, 0 otherwise.";
3799
3800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003801unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802{
3803 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3804 register const Py_UNICODE *e;
3805 int cased;
3806
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807 /* Shortcut for single character strings */
3808 if (PyUnicode_GET_SIZE(self) == 1)
3809 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3810
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003811 /* Special case for empty strings */
3812 if (PyString_GET_SIZE(self) == 0)
3813 return PyInt_FromLong(0);
3814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 e = p + PyUnicode_GET_SIZE(self);
3816 cased = 0;
3817 for (; p < e; p++) {
3818 register const Py_UNICODE ch = *p;
3819
3820 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3821 return PyInt_FromLong(0);
3822 else if (!cased && Py_UNICODE_ISUPPER(ch))
3823 cased = 1;
3824 }
3825 return PyInt_FromLong(cased);
3826}
3827
3828static char istitle__doc__[] =
3829"S.istitle() -> int\n\
3830\n\
3831Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3832may only follow uncased characters and lowercase characters only cased\n\
3833ones. Return 0 otherwise.";
3834
3835static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003836unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837{
3838 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3839 register const Py_UNICODE *e;
3840 int cased, previous_is_cased;
3841
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 /* Shortcut for single character strings */
3843 if (PyUnicode_GET_SIZE(self) == 1)
3844 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3845 (Py_UNICODE_ISUPPER(*p) != 0));
3846
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003847 /* Special case for empty strings */
3848 if (PyString_GET_SIZE(self) == 0)
3849 return PyInt_FromLong(0);
3850
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851 e = p + PyUnicode_GET_SIZE(self);
3852 cased = 0;
3853 previous_is_cased = 0;
3854 for (; p < e; p++) {
3855 register const Py_UNICODE ch = *p;
3856
3857 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3858 if (previous_is_cased)
3859 return PyInt_FromLong(0);
3860 previous_is_cased = 1;
3861 cased = 1;
3862 }
3863 else if (Py_UNICODE_ISLOWER(ch)) {
3864 if (!previous_is_cased)
3865 return PyInt_FromLong(0);
3866 previous_is_cased = 1;
3867 cased = 1;
3868 }
3869 else
3870 previous_is_cased = 0;
3871 }
3872 return PyInt_FromLong(cased);
3873}
3874
3875static char isspace__doc__[] =
3876"S.isspace() -> int\n\
3877\n\
3878Return 1 if there are only whitespace characters in S,\n\
38790 otherwise.";
3880
3881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003882unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883{
3884 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3885 register const Py_UNICODE *e;
3886
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 /* Shortcut for single character strings */
3888 if (PyUnicode_GET_SIZE(self) == 1 &&
3889 Py_UNICODE_ISSPACE(*p))
3890 return PyInt_FromLong(1);
3891
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003892 /* Special case for empty strings */
3893 if (PyString_GET_SIZE(self) == 0)
3894 return PyInt_FromLong(0);
3895
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 e = p + PyUnicode_GET_SIZE(self);
3897 for (; p < e; p++) {
3898 if (!Py_UNICODE_ISSPACE(*p))
3899 return PyInt_FromLong(0);
3900 }
3901 return PyInt_FromLong(1);
3902}
3903
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003904static char isalpha__doc__[] =
3905"S.isalpha() -> int\n\
3906\n\
3907Return 1 if all characters in S are alphabetic\n\
3908and there is at least one character in S, 0 otherwise.";
3909
3910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003911unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003912{
3913 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3914 register const Py_UNICODE *e;
3915
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003916 /* Shortcut for single character strings */
3917 if (PyUnicode_GET_SIZE(self) == 1 &&
3918 Py_UNICODE_ISALPHA(*p))
3919 return PyInt_FromLong(1);
3920
3921 /* Special case for empty strings */
3922 if (PyString_GET_SIZE(self) == 0)
3923 return PyInt_FromLong(0);
3924
3925 e = p + PyUnicode_GET_SIZE(self);
3926 for (; p < e; p++) {
3927 if (!Py_UNICODE_ISALPHA(*p))
3928 return PyInt_FromLong(0);
3929 }
3930 return PyInt_FromLong(1);
3931}
3932
3933static char isalnum__doc__[] =
3934"S.isalnum() -> int\n\
3935\n\
3936Return 1 if all characters in S are alphanumeric\n\
3937and there is at least one character in S, 0 otherwise.";
3938
3939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003940unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003941{
3942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3943 register const Py_UNICODE *e;
3944
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003945 /* Shortcut for single character strings */
3946 if (PyUnicode_GET_SIZE(self) == 1 &&
3947 Py_UNICODE_ISALNUM(*p))
3948 return PyInt_FromLong(1);
3949
3950 /* Special case for empty strings */
3951 if (PyString_GET_SIZE(self) == 0)
3952 return PyInt_FromLong(0);
3953
3954 e = p + PyUnicode_GET_SIZE(self);
3955 for (; p < e; p++) {
3956 if (!Py_UNICODE_ISALNUM(*p))
3957 return PyInt_FromLong(0);
3958 }
3959 return PyInt_FromLong(1);
3960}
3961
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962static char isdecimal__doc__[] =
3963"S.isdecimal() -> int\n\
3964\n\
3965Return 1 if there are only decimal characters in S,\n\
39660 otherwise.";
3967
3968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003969unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970{
3971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3972 register const Py_UNICODE *e;
3973
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 /* Shortcut for single character strings */
3975 if (PyUnicode_GET_SIZE(self) == 1 &&
3976 Py_UNICODE_ISDECIMAL(*p))
3977 return PyInt_FromLong(1);
3978
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003979 /* Special case for empty strings */
3980 if (PyString_GET_SIZE(self) == 0)
3981 return PyInt_FromLong(0);
3982
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 e = p + PyUnicode_GET_SIZE(self);
3984 for (; p < e; p++) {
3985 if (!Py_UNICODE_ISDECIMAL(*p))
3986 return PyInt_FromLong(0);
3987 }
3988 return PyInt_FromLong(1);
3989}
3990
3991static char isdigit__doc__[] =
3992"S.isdigit() -> int\n\
3993\n\
3994Return 1 if there are only digit characters in S,\n\
39950 otherwise.";
3996
3997static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003998unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999{
4000 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4001 register const Py_UNICODE *e;
4002
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 /* Shortcut for single character strings */
4004 if (PyUnicode_GET_SIZE(self) == 1 &&
4005 Py_UNICODE_ISDIGIT(*p))
4006 return PyInt_FromLong(1);
4007
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004008 /* Special case for empty strings */
4009 if (PyString_GET_SIZE(self) == 0)
4010 return PyInt_FromLong(0);
4011
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 e = p + PyUnicode_GET_SIZE(self);
4013 for (; p < e; p++) {
4014 if (!Py_UNICODE_ISDIGIT(*p))
4015 return PyInt_FromLong(0);
4016 }
4017 return PyInt_FromLong(1);
4018}
4019
4020static char isnumeric__doc__[] =
4021"S.isnumeric() -> int\n\
4022\n\
4023Return 1 if there are only numeric characters in S,\n\
40240 otherwise.";
4025
4026static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004027unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028{
4029 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4030 register const Py_UNICODE *e;
4031
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032 /* Shortcut for single character strings */
4033 if (PyUnicode_GET_SIZE(self) == 1 &&
4034 Py_UNICODE_ISNUMERIC(*p))
4035 return PyInt_FromLong(1);
4036
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004037 /* Special case for empty strings */
4038 if (PyString_GET_SIZE(self) == 0)
4039 return PyInt_FromLong(0);
4040
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 e = p + PyUnicode_GET_SIZE(self);
4042 for (; p < e; p++) {
4043 if (!Py_UNICODE_ISNUMERIC(*p))
4044 return PyInt_FromLong(0);
4045 }
4046 return PyInt_FromLong(1);
4047}
4048
4049static char join__doc__[] =
4050"S.join(sequence) -> unicode\n\
4051\n\
4052Return a string which is the concatenation of the strings in the\n\
4053sequence. The separator between elements is S.";
4054
4055static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004056unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004058 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059}
4060
4061static int
4062unicode_length(PyUnicodeObject *self)
4063{
4064 return self->length;
4065}
4066
4067static char ljust__doc__[] =
4068"S.ljust(width) -> unicode\n\
4069\n\
4070Return S left justified in a Unicode string of length width. Padding is\n\
4071done using spaces.";
4072
4073static PyObject *
4074unicode_ljust(PyUnicodeObject *self, PyObject *args)
4075{
4076 int width;
4077 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4078 return NULL;
4079
4080 if (self->length >= width) {
4081 Py_INCREF(self);
4082 return (PyObject*) self;
4083 }
4084
4085 return (PyObject*) pad(self, 0, width - self->length, ' ');
4086}
4087
4088static char lower__doc__[] =
4089"S.lower() -> unicode\n\
4090\n\
4091Return a copy of the string S converted to lowercase.";
4092
4093static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004094unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 return fixup(self, fixlower);
4097}
4098
4099static char lstrip__doc__[] =
4100"S.lstrip() -> unicode\n\
4101\n\
4102Return a copy of the string S with leading whitespace removed.";
4103
4104static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004105unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 return strip(self, 1, 0);
4108}
4109
4110static PyObject*
4111unicode_repeat(PyUnicodeObject *str, int len)
4112{
4113 PyUnicodeObject *u;
4114 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004115 int nchars;
4116 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117
4118 if (len < 0)
4119 len = 0;
4120
4121 if (len == 1) {
4122 /* no repeat, return original string */
4123 Py_INCREF(str);
4124 return (PyObject*) str;
4125 }
Tim Peters8f422462000-09-09 06:13:41 +00004126
4127 /* ensure # of chars needed doesn't overflow int and # of bytes
4128 * needed doesn't overflow size_t
4129 */
4130 nchars = len * str->length;
4131 if (len && nchars / len != str->length) {
4132 PyErr_SetString(PyExc_OverflowError,
4133 "repeated string is too long");
4134 return NULL;
4135 }
4136 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4137 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4138 PyErr_SetString(PyExc_OverflowError,
4139 "repeated string is too long");
4140 return NULL;
4141 }
4142 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 if (!u)
4144 return NULL;
4145
4146 p = u->str;
4147
4148 while (len-- > 0) {
4149 Py_UNICODE_COPY(p, str->str, str->length);
4150 p += str->length;
4151 }
4152
4153 return (PyObject*) u;
4154}
4155
4156PyObject *PyUnicode_Replace(PyObject *obj,
4157 PyObject *subobj,
4158 PyObject *replobj,
4159 int maxcount)
4160{
4161 PyObject *self;
4162 PyObject *str1;
4163 PyObject *str2;
4164 PyObject *result;
4165
4166 self = PyUnicode_FromObject(obj);
4167 if (self == NULL)
4168 return NULL;
4169 str1 = PyUnicode_FromObject(subobj);
4170 if (str1 == NULL) {
4171 Py_DECREF(self);
4172 return NULL;
4173 }
4174 str2 = PyUnicode_FromObject(replobj);
4175 if (str2 == NULL) {
4176 Py_DECREF(self);
4177 Py_DECREF(str1);
4178 return NULL;
4179 }
4180 result = replace((PyUnicodeObject *)self,
4181 (PyUnicodeObject *)str1,
4182 (PyUnicodeObject *)str2,
4183 maxcount);
4184 Py_DECREF(self);
4185 Py_DECREF(str1);
4186 Py_DECREF(str2);
4187 return result;
4188}
4189
4190static char replace__doc__[] =
4191"S.replace (old, new[, maxsplit]) -> unicode\n\
4192\n\
4193Return a copy of S with all occurrences of substring\n\
4194old replaced by new. If the optional argument maxsplit is\n\
4195given, only the first maxsplit occurrences are replaced.";
4196
4197static PyObject*
4198unicode_replace(PyUnicodeObject *self, PyObject *args)
4199{
4200 PyUnicodeObject *str1;
4201 PyUnicodeObject *str2;
4202 int maxcount = -1;
4203 PyObject *result;
4204
4205 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4206 return NULL;
4207 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4208 if (str1 == NULL)
4209 return NULL;
4210 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4211 if (str2 == NULL)
4212 return NULL;
4213
4214 result = replace(self, str1, str2, maxcount);
4215
4216 Py_DECREF(str1);
4217 Py_DECREF(str2);
4218 return result;
4219}
4220
4221static
4222PyObject *unicode_repr(PyObject *unicode)
4223{
4224 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4225 PyUnicode_GET_SIZE(unicode),
4226 1);
4227}
4228
4229static char rfind__doc__[] =
4230"S.rfind(sub [,start [,end]]) -> int\n\
4231\n\
4232Return the highest index in S where substring sub is found,\n\
4233such that sub is contained within s[start,end]. Optional\n\
4234arguments start and end are interpreted as in slice notation.\n\
4235\n\
4236Return -1 on failure.";
4237
4238static PyObject *
4239unicode_rfind(PyUnicodeObject *self, PyObject *args)
4240{
4241 PyUnicodeObject *substring;
4242 int start = 0;
4243 int end = INT_MAX;
4244 PyObject *result;
4245
Guido van Rossumb8872e62000-05-09 14:14:27 +00004246 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4247 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248 return NULL;
4249 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4250 (PyObject *)substring);
4251 if (substring == NULL)
4252 return NULL;
4253
4254 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4255
4256 Py_DECREF(substring);
4257 return result;
4258}
4259
4260static char rindex__doc__[] =
4261"S.rindex(sub [,start [,end]]) -> int\n\
4262\n\
4263Like S.rfind() but raise ValueError when the substring is not found.";
4264
4265static PyObject *
4266unicode_rindex(PyUnicodeObject *self, PyObject *args)
4267{
4268 int result;
4269 PyUnicodeObject *substring;
4270 int start = 0;
4271 int end = INT_MAX;
4272
Guido van Rossumb8872e62000-05-09 14:14:27 +00004273 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4274 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275 return NULL;
4276 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4277 (PyObject *)substring);
4278 if (substring == NULL)
4279 return NULL;
4280
4281 result = findstring(self, substring, start, end, -1);
4282
4283 Py_DECREF(substring);
4284 if (result < 0) {
4285 PyErr_SetString(PyExc_ValueError, "substring not found");
4286 return NULL;
4287 }
4288 return PyInt_FromLong(result);
4289}
4290
4291static char rjust__doc__[] =
4292"S.rjust(width) -> unicode\n\
4293\n\
4294Return S right justified in a Unicode string of length width. Padding is\n\
4295done using spaces.";
4296
4297static PyObject *
4298unicode_rjust(PyUnicodeObject *self, PyObject *args)
4299{
4300 int width;
4301 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4302 return NULL;
4303
4304 if (self->length >= width) {
4305 Py_INCREF(self);
4306 return (PyObject*) self;
4307 }
4308
4309 return (PyObject*) pad(self, width - self->length, 0, ' ');
4310}
4311
4312static char rstrip__doc__[] =
4313"S.rstrip() -> unicode\n\
4314\n\
4315Return a copy of the string S with trailing whitespace removed.";
4316
4317static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004318unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 return strip(self, 0, 1);
4321}
4322
4323static PyObject*
4324unicode_slice(PyUnicodeObject *self, int start, int end)
4325{
4326 /* standard clamping */
4327 if (start < 0)
4328 start = 0;
4329 if (end < 0)
4330 end = 0;
4331 if (end > self->length)
4332 end = self->length;
4333 if (start == 0 && end == self->length) {
4334 /* full slice, return original string */
4335 Py_INCREF(self);
4336 return (PyObject*) self;
4337 }
4338 if (start > end)
4339 start = end;
4340 /* copy slice */
4341 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4342 end - start);
4343}
4344
4345PyObject *PyUnicode_Split(PyObject *s,
4346 PyObject *sep,
4347 int maxsplit)
4348{
4349 PyObject *result;
4350
4351 s = PyUnicode_FromObject(s);
4352 if (s == NULL)
4353 return NULL;
4354 if (sep != NULL) {
4355 sep = PyUnicode_FromObject(sep);
4356 if (sep == NULL) {
4357 Py_DECREF(s);
4358 return NULL;
4359 }
4360 }
4361
4362 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4363
4364 Py_DECREF(s);
4365 Py_XDECREF(sep);
4366 return result;
4367}
4368
4369static char split__doc__[] =
4370"S.split([sep [,maxsplit]]) -> list of strings\n\
4371\n\
4372Return a list of the words in S, using sep as the\n\
4373delimiter string. If maxsplit is given, at most maxsplit\n\
4374splits are done. If sep is not specified, any whitespace string\n\
4375is a separator.";
4376
4377static PyObject*
4378unicode_split(PyUnicodeObject *self, PyObject *args)
4379{
4380 PyObject *substring = Py_None;
4381 int maxcount = -1;
4382
4383 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4384 return NULL;
4385
4386 if (substring == Py_None)
4387 return split(self, NULL, maxcount);
4388 else if (PyUnicode_Check(substring))
4389 return split(self, (PyUnicodeObject *)substring, maxcount);
4390 else
4391 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4392}
4393
4394static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004395"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396\n\
4397Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004398Line breaks are not included in the resulting list unless keepends\n\
4399is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400
4401static PyObject*
4402unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4403{
Guido van Rossum86662912000-04-11 15:38:46 +00004404 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405
Guido van Rossum86662912000-04-11 15:38:46 +00004406 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407 return NULL;
4408
Guido van Rossum86662912000-04-11 15:38:46 +00004409 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410}
4411
4412static
4413PyObject *unicode_str(PyUnicodeObject *self)
4414{
Fred Drakee4315f52000-05-09 19:53:39 +00004415 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416}
4417
4418static char strip__doc__[] =
4419"S.strip() -> unicode\n\
4420\n\
4421Return a copy of S with leading and trailing whitespace removed.";
4422
4423static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004424unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 return strip(self, 1, 1);
4427}
4428
4429static char swapcase__doc__[] =
4430"S.swapcase() -> unicode\n\
4431\n\
4432Return a copy of S with uppercase characters converted to lowercase\n\
4433and vice versa.";
4434
4435static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004436unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 return fixup(self, fixswapcase);
4439}
4440
4441static char translate__doc__[] =
4442"S.translate(table) -> unicode\n\
4443\n\
4444Return a copy of the string S, where all characters have been mapped\n\
4445through the given translation table, which must be a mapping of\n\
4446Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4447are left untouched. Characters mapped to None are deleted.";
4448
4449static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004450unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 return PyUnicode_TranslateCharmap(self->str,
4453 self->length,
4454 table,
4455 "ignore");
4456}
4457
4458static char upper__doc__[] =
4459"S.upper() -> unicode\n\
4460\n\
4461Return a copy of S converted to uppercase.";
4462
4463static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004464unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 return fixup(self, fixupper);
4467}
4468
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implements S.zfill(width)  (currently compiled out).

   args must hold one int.  A string already at least width characters
   long is returned unchanged with a new reference.  Otherwise the
   string is left-padded with '0' via pad(), and a leading '+' or '-'
   sign is moved in front of the padding.  Returns NULL if argument
   parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* pad() result is dereferenced below, so bail out if it failed. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4504
#if 0
/* Debugging helper (compiled out): returns the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4512
4513static char startswith__doc__[] =
4514"S.startswith(prefix[, start[, end]]) -> int\n\
4515\n\
4516Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4517optional start, test S beginning at that position. With optional end, stop\n\
4518comparing S at that position.";
4519
4520static PyObject *
4521unicode_startswith(PyUnicodeObject *self,
4522 PyObject *args)
4523{
4524 PyUnicodeObject *substring;
4525 int start = 0;
4526 int end = INT_MAX;
4527 PyObject *result;
4528
Guido van Rossumb8872e62000-05-09 14:14:27 +00004529 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4530 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 return NULL;
4532 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4533 (PyObject *)substring);
4534 if (substring == NULL)
4535 return NULL;
4536
4537 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4538
4539 Py_DECREF(substring);
4540 return result;
4541}
4542
4543
4544static char endswith__doc__[] =
4545"S.endswith(suffix[, start[, end]]) -> int\n\
4546\n\
4547Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4548optional start, test S beginning at that position. With optional end, stop\n\
4549comparing S at that position.";
4550
4551static PyObject *
4552unicode_endswith(PyUnicodeObject *self,
4553 PyObject *args)
4554{
4555 PyUnicodeObject *substring;
4556 int start = 0;
4557 int end = INT_MAX;
4558 PyObject *result;
4559
Guido van Rossumb8872e62000-05-09 14:14:27 +00004560 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4561 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 return NULL;
4563 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4564 (PyObject *)substring);
4565 if (substring == NULL)
4566 return NULL;
4567
4568 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4569
4570 Py_DECREF(substring);
4571 return result;
4572}
4573
4574
4575static PyMethodDef unicode_methods[] = {
4576
4577 /* Order is according to common usage: often used methods should
4578 appear first, since lookup is done sequentially. */
4579
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004580 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4581 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4582 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4583 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4584 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4585 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4586 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4587 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4588 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4589 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4590 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4591 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4592 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4593 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4594/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4595 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4596 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4597 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4598 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4599 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4600 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4601 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4602 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4603 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4604 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4605 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4606 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4607 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4608 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4609 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4610 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4611 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4612 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4613 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4614 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004616 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4617 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618#endif
4619
4620#if 0
4621 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004622 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623#endif
4624
4625 {NULL, NULL}
4626};
4627
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628static PySequenceMethods unicode_as_sequence = {
4629 (inquiry) unicode_length, /* sq_length */
4630 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4631 (intargfunc) unicode_repeat, /* sq_repeat */
4632 (intargfunc) unicode_getitem, /* sq_item */
4633 (intintargfunc) unicode_slice, /* sq_slice */
4634 0, /* sq_ass_item */
4635 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004636 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637};
4638
4639static int
4640unicode_buffer_getreadbuf(PyUnicodeObject *self,
4641 int index,
4642 const void **ptr)
4643{
4644 if (index != 0) {
4645 PyErr_SetString(PyExc_SystemError,
4646 "accessing non-existent unicode segment");
4647 return -1;
4648 }
4649 *ptr = (void *) self->str;
4650 return PyUnicode_GET_DATA_SIZE(self);
4651}
4652
4653static int
4654unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4655 const void **ptr)
4656{
4657 PyErr_SetString(PyExc_TypeError,
4658 "cannot use unicode as modifyable buffer");
4659 return -1;
4660}
4661
4662static int
4663unicode_buffer_getsegcount(PyUnicodeObject *self,
4664 int *lenp)
4665{
4666 if (lenp)
4667 *lenp = PyUnicode_GET_DATA_SIZE(self);
4668 return 1;
4669}
4670
4671static int
4672unicode_buffer_getcharbuf(PyUnicodeObject *self,
4673 int index,
4674 const void **ptr)
4675{
4676 PyObject *str;
4677
4678 if (index != 0) {
4679 PyErr_SetString(PyExc_SystemError,
4680 "accessing non-existent unicode segment");
4681 return -1;
4682 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004683 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 if (str == NULL)
4685 return -1;
4686 *ptr = (void *) PyString_AS_STRING(str);
4687 return PyString_GET_SIZE(str);
4688}
4689
4690/* Helpers for PyUnicode_Format() */
4691
4692static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004693getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694{
4695 int argidx = *p_argidx;
4696 if (argidx < arglen) {
4697 (*p_argidx)++;
4698 if (arglen < 0)
4699 return args;
4700 else
4701 return PyTuple_GetItem(args, argidx);
4702 }
4703 PyErr_SetString(PyExc_TypeError,
4704 "not enough arguments for format string");
4705 return NULL;
4706}
4707
/* Bit flags for format conversions; each occupies its own bit so they
   can be OR-ed together in a single int. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
4713
4714static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716{
4717 register int i;
4718 int len;
4719 va_list va;
4720 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722
4723 /* First, format the string as char array, then expand to Py_UNICODE
4724 array. */
4725 charbuffer = (char *)buffer;
4726 len = vsprintf(charbuffer, format, va);
4727 for (i = len - 1; i >= 0; i--)
4728 buffer[i] = (Py_UNICODE) charbuffer[i];
4729
4730 va_end(va);
4731 return len;
4732}
4733
4734static int
4735formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004736 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 int flags,
4738 int prec,
4739 int type,
4740 PyObject *v)
4741{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004742 /* fmt = '%#.' + `prec` + `type`
4743 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 char fmt[20];
4745 double x;
4746
4747 x = PyFloat_AsDouble(v);
4748 if (x == -1.0 && PyErr_Occurred())
4749 return -1;
4750 if (prec < 0)
4751 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4753 type = 'g';
4754 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004755 /* worst case length calc to ensure no buffer overrun:
4756 fmt = %#.<prec>g
4757 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4758 for any double rep.)
4759 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4760 If prec=0 the effective precision is 1 (the leading digit is
4761 always given), therefore increase by one to 10+prec. */
4762 if (buflen <= (size_t)10 + (size_t)prec) {
4763 PyErr_SetString(PyExc_OverflowError,
4764 "formatted float is too long (precision too long?)");
4765 return -1;
4766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 return usprintf(buf, fmt, x);
4768}
4769
Tim Peters38fd5b62000-09-21 05:43:11 +00004770static PyObject*
4771formatlong(PyObject *val, int flags, int prec, int type)
4772{
4773 char *buf;
4774 int i, len;
4775 PyObject *str; /* temporary string object. */
4776 PyUnicodeObject *result;
4777
4778 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4779 if (!str)
4780 return NULL;
4781 result = _PyUnicode_New(len);
4782 for (i = 0; i < len; i++)
4783 result->str[i] = buf[i];
4784 result->str[len] = 0;
4785 Py_DECREF(str);
4786 return (PyObject*)result;
4787}
4788
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789static int
4790formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004791 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 int flags,
4793 int prec,
4794 int type,
4795 PyObject *v)
4796{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004797 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004798 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4799 + 1 + 1 = 24*/
4800 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004802 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803
4804 x = PyInt_AsLong(v);
4805 if (x == -1 && PyErr_Occurred())
4806 return -1;
4807 if (prec < 0)
4808 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004809 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4810 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4811 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4812 PyErr_SetString(PyExc_OverflowError,
4813 "formatted integer is too long (precision too long?)");
4814 return -1;
4815 }
Tim Petersfff53252001-04-12 18:38:48 +00004816 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4817 * but we want it (for consistency with other %#x conversions, and
4818 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004819 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4820 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4821 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004822 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004823 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4824 /* Only way to know what the platform does is to try it. */
4825 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4826 if (fmt[1] != (char)type) {
4827 /* Supply our own leading 0x/0X -- needed under std C */
4828 use_native_c_format = 0;
4829 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4830 }
4831 }
4832 if (use_native_c_format)
4833 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 return usprintf(buf, fmt, x);
4835}
4836
4837static int
4838formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004839 size_t buflen,
4840 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004842 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004843 if (PyUnicode_Check(v)) {
4844 if (PyUnicode_GET_SIZE(v) != 1)
4845 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004849 else if (PyString_Check(v)) {
4850 if (PyString_GET_SIZE(v) != 1)
4851 goto onError;
4852 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
4855 else {
4856 /* Integer input truncated to a character */
4857 long x;
4858 x = PyInt_AsLong(v);
4859 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 buf[0] = (char) x;
4862 }
4863 buf[1] = '\0';
4864 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004865
4866 onError:
4867 PyErr_SetString(PyExc_TypeError,
4868 "%c requires int or char");
4869 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870}
4871
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used (e.g. as the operand
   of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
4881
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882PyObject *PyUnicode_Format(PyObject *format,
4883 PyObject *args)
4884{
4885 Py_UNICODE *fmt, *res;
4886 int fmtcnt, rescnt, reslen, arglen, argidx;
4887 int args_owned = 0;
4888 PyUnicodeObject *result = NULL;
4889 PyObject *dict = NULL;
4890 PyObject *uformat;
4891
4892 if (format == NULL || args == NULL) {
4893 PyErr_BadInternalCall();
4894 return NULL;
4895 }
4896 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004897 if (uformat == NULL)
4898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 fmt = PyUnicode_AS_UNICODE(uformat);
4900 fmtcnt = PyUnicode_GET_SIZE(uformat);
4901
4902 reslen = rescnt = fmtcnt + 100;
4903 result = _PyUnicode_New(reslen);
4904 if (result == NULL)
4905 goto onError;
4906 res = PyUnicode_AS_UNICODE(result);
4907
4908 if (PyTuple_Check(args)) {
4909 arglen = PyTuple_Size(args);
4910 argidx = 0;
4911 }
4912 else {
4913 arglen = -1;
4914 argidx = -2;
4915 }
4916 if (args->ob_type->tp_as_mapping)
4917 dict = args;
4918
4919 while (--fmtcnt >= 0) {
4920 if (*fmt != '%') {
4921 if (--rescnt < 0) {
4922 rescnt = fmtcnt + 100;
4923 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004924 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 return NULL;
4926 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4927 --rescnt;
4928 }
4929 *res++ = *fmt++;
4930 }
4931 else {
4932 /* Got a format specifier */
4933 int flags = 0;
4934 int width = -1;
4935 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 Py_UNICODE c = '\0';
4937 Py_UNICODE fill;
4938 PyObject *v = NULL;
4939 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004940 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 Py_UNICODE sign;
4942 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004943 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944
4945 fmt++;
4946 if (*fmt == '(') {
4947 Py_UNICODE *keystart;
4948 int keylen;
4949 PyObject *key;
4950 int pcount = 1;
4951
4952 if (dict == NULL) {
4953 PyErr_SetString(PyExc_TypeError,
4954 "format requires a mapping");
4955 goto onError;
4956 }
4957 ++fmt;
4958 --fmtcnt;
4959 keystart = fmt;
4960 /* Skip over balanced parentheses */
4961 while (pcount > 0 && --fmtcnt >= 0) {
4962 if (*fmt == ')')
4963 --pcount;
4964 else if (*fmt == '(')
4965 ++pcount;
4966 fmt++;
4967 }
4968 keylen = fmt - keystart - 1;
4969 if (fmtcnt < 0 || pcount > 0) {
4970 PyErr_SetString(PyExc_ValueError,
4971 "incomplete format key");
4972 goto onError;
4973 }
Fred Drakee4315f52000-05-09 19:53:39 +00004974 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 then looked up since Python uses strings to hold
4976 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004977 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 key = PyUnicode_EncodeUTF8(keystart,
4979 keylen,
4980 NULL);
4981 if (key == NULL)
4982 goto onError;
4983 if (args_owned) {
4984 Py_DECREF(args);
4985 args_owned = 0;
4986 }
4987 args = PyObject_GetItem(dict, key);
4988 Py_DECREF(key);
4989 if (args == NULL) {
4990 goto onError;
4991 }
4992 args_owned = 1;
4993 arglen = -1;
4994 argidx = -2;
4995 }
4996 while (--fmtcnt >= 0) {
4997 switch (c = *fmt++) {
4998 case '-': flags |= F_LJUST; continue;
4999 case '+': flags |= F_SIGN; continue;
5000 case ' ': flags |= F_BLANK; continue;
5001 case '#': flags |= F_ALT; continue;
5002 case '0': flags |= F_ZERO; continue;
5003 }
5004 break;
5005 }
5006 if (c == '*') {
5007 v = getnextarg(args, arglen, &argidx);
5008 if (v == NULL)
5009 goto onError;
5010 if (!PyInt_Check(v)) {
5011 PyErr_SetString(PyExc_TypeError,
5012 "* wants int");
5013 goto onError;
5014 }
5015 width = PyInt_AsLong(v);
5016 if (width < 0) {
5017 flags |= F_LJUST;
5018 width = -width;
5019 }
5020 if (--fmtcnt >= 0)
5021 c = *fmt++;
5022 }
5023 else if (c >= '0' && c <= '9') {
5024 width = c - '0';
5025 while (--fmtcnt >= 0) {
5026 c = *fmt++;
5027 if (c < '0' || c > '9')
5028 break;
5029 if ((width*10) / 10 != width) {
5030 PyErr_SetString(PyExc_ValueError,
5031 "width too big");
5032 goto onError;
5033 }
5034 width = width*10 + (c - '0');
5035 }
5036 }
5037 if (c == '.') {
5038 prec = 0;
5039 if (--fmtcnt >= 0)
5040 c = *fmt++;
5041 if (c == '*') {
5042 v = getnextarg(args, arglen, &argidx);
5043 if (v == NULL)
5044 goto onError;
5045 if (!PyInt_Check(v)) {
5046 PyErr_SetString(PyExc_TypeError,
5047 "* wants int");
5048 goto onError;
5049 }
5050 prec = PyInt_AsLong(v);
5051 if (prec < 0)
5052 prec = 0;
5053 if (--fmtcnt >= 0)
5054 c = *fmt++;
5055 }
5056 else if (c >= '0' && c <= '9') {
5057 prec = c - '0';
5058 while (--fmtcnt >= 0) {
5059 c = Py_CHARMASK(*fmt++);
5060 if (c < '0' || c > '9')
5061 break;
5062 if ((prec*10) / 10 != prec) {
5063 PyErr_SetString(PyExc_ValueError,
5064 "prec too big");
5065 goto onError;
5066 }
5067 prec = prec*10 + (c - '0');
5068 }
5069 }
5070 } /* prec */
5071 if (fmtcnt >= 0) {
5072 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 if (--fmtcnt >= 0)
5074 c = *fmt++;
5075 }
5076 }
5077 if (fmtcnt < 0) {
5078 PyErr_SetString(PyExc_ValueError,
5079 "incomplete format");
5080 goto onError;
5081 }
5082 if (c != '%') {
5083 v = getnextarg(args, arglen, &argidx);
5084 if (v == NULL)
5085 goto onError;
5086 }
5087 sign = 0;
5088 fill = ' ';
5089 switch (c) {
5090
5091 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005092 pbuf = formatbuf;
5093 /* presume that buffer length is at least 1 */
5094 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 len = 1;
5096 break;
5097
5098 case 's':
5099 case 'r':
5100 if (PyUnicode_Check(v) && c == 's') {
5101 temp = v;
5102 Py_INCREF(temp);
5103 }
5104 else {
5105 PyObject *unicode;
5106 if (c == 's')
5107 temp = PyObject_Str(v);
5108 else
5109 temp = PyObject_Repr(v);
5110 if (temp == NULL)
5111 goto onError;
5112 if (!PyString_Check(temp)) {
5113 /* XXX Note: this should never happen, since
5114 PyObject_Repr() and PyObject_Str() assure
5115 this */
5116 Py_DECREF(temp);
5117 PyErr_SetString(PyExc_TypeError,
5118 "%s argument has non-string str()");
5119 goto onError;
5120 }
Fred Drakee4315f52000-05-09 19:53:39 +00005121 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005123 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 "strict");
5125 Py_DECREF(temp);
5126 temp = unicode;
5127 if (temp == NULL)
5128 goto onError;
5129 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005130 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 len = PyUnicode_GET_SIZE(temp);
5132 if (prec >= 0 && len > prec)
5133 len = prec;
5134 break;
5135
5136 case 'i':
5137 case 'd':
5138 case 'u':
5139 case 'o':
5140 case 'x':
5141 case 'X':
5142 if (c == 'i')
5143 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005144 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005145 temp = formatlong(v, flags, prec, c);
5146 if (!temp)
5147 goto onError;
5148 pbuf = PyUnicode_AS_UNICODE(temp);
5149 len = PyUnicode_GET_SIZE(temp);
5150 /* unbounded ints can always produce
5151 a sign character! */
5152 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005154 else {
5155 pbuf = formatbuf;
5156 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5157 flags, prec, c, v);
5158 if (len < 0)
5159 goto onError;
5160 /* only d conversion is signed */
5161 sign = c == 'd';
5162 }
5163 if (flags & F_ZERO)
5164 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 break;
5166
5167 case 'e':
5168 case 'E':
5169 case 'f':
5170 case 'g':
5171 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005172 pbuf = formatbuf;
5173 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5174 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 if (len < 0)
5176 goto onError;
5177 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005178 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 fill = '0';
5180 break;
5181
5182 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005183 pbuf = formatbuf;
5184 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 if (len < 0)
5186 goto onError;
5187 break;
5188
5189 default:
5190 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005191 "unsupported format character '%c' (0x%x) "
5192 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005193 (31<=c && c<=126) ? c : '?',
5194 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 goto onError;
5196 }
5197 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005198 if (*pbuf == '-' || *pbuf == '+') {
5199 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 len--;
5201 }
5202 else if (flags & F_SIGN)
5203 sign = '+';
5204 else if (flags & F_BLANK)
5205 sign = ' ';
5206 else
5207 sign = 0;
5208 }
5209 if (width < len)
5210 width = len;
5211 if (rescnt < width + (sign != 0)) {
5212 reslen -= rescnt;
5213 rescnt = width + fmtcnt + 100;
5214 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005215 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 return NULL;
5217 res = PyUnicode_AS_UNICODE(result)
5218 + reslen - rescnt;
5219 }
5220 if (sign) {
5221 if (fill != ' ')
5222 *res++ = sign;
5223 rescnt--;
5224 if (width > len)
5225 width--;
5226 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005227 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5228 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005229 assert(pbuf[1] == c);
5230 if (fill != ' ') {
5231 *res++ = *pbuf++;
5232 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005233 }
Tim Petersfff53252001-04-12 18:38:48 +00005234 rescnt -= 2;
5235 width -= 2;
5236 if (width < 0)
5237 width = 0;
5238 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 if (width > len && !(flags & F_LJUST)) {
5241 do {
5242 --rescnt;
5243 *res++ = fill;
5244 } while (--width > len);
5245 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005246 if (fill == ' ') {
5247 if (sign)
5248 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005249 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005250 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005251 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005252 *res++ = *pbuf++;
5253 *res++ = *pbuf++;
5254 }
5255 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005256 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 res += len;
5258 rescnt -= len;
5259 while (--width >= len) {
5260 --rescnt;
5261 *res++ = ' ';
5262 }
5263 if (dict && (argidx < arglen) && c != '%') {
5264 PyErr_SetString(PyExc_TypeError,
5265 "not all arguments converted");
5266 goto onError;
5267 }
5268 Py_XDECREF(temp);
5269 } /* '%' */
5270 } /* until end */
5271 if (argidx < arglen && !dict) {
5272 PyErr_SetString(PyExc_TypeError,
5273 "not all arguments converted");
5274 goto onError;
5275 }
5276
5277 if (args_owned) {
5278 Py_DECREF(args);
5279 }
5280 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005281 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005282 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 return (PyObject *)result;
5284
5285 onError:
5286 Py_XDECREF(result);
5287 Py_DECREF(uformat);
5288 if (args_owned) {
5289 Py_DECREF(args);
5290 }
5291 return NULL;
5292}
5293
5294static PyBufferProcs unicode_as_buffer = {
5295 (getreadbufferproc) unicode_buffer_getreadbuf,
5296 (getwritebufferproc) unicode_buffer_getwritebuf,
5297 (getsegcountproc) unicode_buffer_getsegcount,
5298 (getcharbufferproc) unicode_buffer_getcharbuf,
5299};
5300
Tim Peters6d6c1a32001-08-02 04:15:00 +00005301static PyObject *
5302unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5303{
5304 PyObject *x = NULL;
5305 static char *kwlist[] = {"string", "encoding", "errors", 0};
5306 char *encoding = NULL;
5307 char *errors = NULL;
5308
5309 assert(type == &PyUnicode_Type);
5310 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5311 kwlist, &x, &encoding, &errors))
5312 return NULL;
5313 if (x == NULL)
5314 return (PyObject *)_PyUnicode_New(0);
5315 return PyUnicode_FromEncodedObject(x, encoding, errors);
5316}
5317
/* Documentation string installed in the tp_doc slot of PyUnicode_Type.
   Written as adjacent string literals, one per output line; the
   compiler concatenates them into a single string. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5324
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325PyTypeObject PyUnicode_Type = {
5326 PyObject_HEAD_INIT(&PyType_Type)
5327 0, /* ob_size */
5328 "unicode", /* tp_name */
5329 sizeof(PyUnicodeObject), /* tp_size */
5330 0, /* tp_itemsize */
5331 /* Slots */
5332 (destructor)_PyUnicode_Free, /* tp_dealloc */
5333 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005334 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 0, /* tp_setattr */
5336 (cmpfunc) unicode_compare, /* tp_compare */
5337 (reprfunc) unicode_repr, /* tp_repr */
5338 0, /* tp_as_number */
5339 &unicode_as_sequence, /* tp_as_sequence */
5340 0, /* tp_as_mapping */
5341 (hashfunc) unicode_hash, /* tp_hash*/
5342 0, /* tp_call*/
5343 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005344 PyObject_GenericGetAttr, /* tp_getattro */
5345 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 &unicode_as_buffer, /* tp_as_buffer */
5347 Py_TPFLAGS_DEFAULT, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005348 unicode_doc, /* tp_doc */
5349 0, /* tp_traverse */
5350 0, /* tp_clear */
5351 0, /* tp_richcompare */
5352 0, /* tp_weaklistoffset */
5353 0, /* tp_iter */
5354 0, /* tp_iternext */
5355 unicode_methods, /* tp_methods */
5356 0, /* tp_members */
5357 0, /* tp_getset */
5358 0, /* tp_base */
5359 0, /* tp_dict */
5360 0, /* tp_descr_get */
5361 0, /* tp_descr_set */
5362 0, /* tp_dictoffset */
5363 0, /* tp_init */
5364 0, /* tp_alloc */
5365 unicode_new, /* tp_new */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366};
5367
5368/* Initialize the Unicode implementation */
5369
Thomas Wouters78890102000-07-22 19:25:51 +00005370void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005372 int i;
5373
Fred Drakee4315f52000-05-09 19:53:39 +00005374 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005375 unicode_freelist = NULL;
5376 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005378 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005379 for (i = 0; i < 256; i++)
5380 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381}
5382
5383/* Finalize the Unicode implementation */
5384
5385void
Thomas Wouters78890102000-07-22 19:25:51 +00005386_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005388 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005389 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005391 Py_XDECREF(unicode_empty);
5392 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005393
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005394 for (i = 0; i < 256; i++) {
5395 if (unicode_latin1[i]) {
5396 Py_DECREF(unicode_latin1[i]);
5397 unicode_latin1[i] = NULL;
5398 }
5399 }
5400
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005401 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 PyUnicodeObject *v = u;
5403 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005404 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005405 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005406 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005407 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005409 unicode_freelist = NULL;
5410 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411}