blob: 5080eb86661132f63dfa12e808b3b67be80dbc31 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one extra Py_UNICODE element (not merely one byte) so
   that the string is always U+0000 terminated -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
287/* Internal API for use in unicodeobject.c only ! */
288#define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 const char *s;
402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
419 }
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
425 }
426 if (PyUnicode_Check(obj)) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000427 if (encoding) {
Tim Peters78e0fc72001-09-11 03:07:38 +0000428 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000429 "decoding Unicode is not supported");
Tim Peters78e0fc72001-09-11 03:07:38 +0000430 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000431 }
Tim Peters78e0fc72001-09-11 03:07:38 +0000432 if (PyUnicode_CheckExact(obj)) {
433 Py_INCREF(obj);
434 v = obj;
435 }
436 else {
437 /* For a subclass of unicode, return a true unicode object
438 with the same string value. */
439 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
440 PyUnicode_GET_SIZE(obj));
441 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000442 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443 }
444 else if (PyString_Check(obj)) {
445 s = PyString_AS_STRING(obj);
446 len = PyString_GET_SIZE(obj);
447 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000448 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
449 /* Overwrite the error message with something more useful in
450 case of a TypeError. */
451 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000452 PyErr_Format(PyExc_TypeError,
453 "coercing to Unicode: need string or buffer, "
454 "%.80s found",
455 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000457 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000458
459 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 if (len == 0) {
461 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000464 else
465 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000466
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000468 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 return v;
472
473 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478}
479
480PyObject *PyUnicode_Decode(const char *s,
481 int size,
482 const char *encoding,
483 const char *errors)
484{
485 PyObject *buffer = NULL, *unicode;
486
Fred Drakee4315f52000-05-09 19:53:39 +0000487 if (encoding == NULL)
488 encoding = PyUnicode_GetDefaultEncoding();
489
490 /* Shortcuts for common default encodings */
491 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000492 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000493 else if (strcmp(encoding, "latin-1") == 0)
494 return PyUnicode_DecodeLatin1(s, size, errors);
495 else if (strcmp(encoding, "ascii") == 0)
496 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497
498 /* Decode via the codec registry */
499 buffer = PyBuffer_FromMemory((void *)s, size);
500 if (buffer == NULL)
501 goto onError;
502 unicode = PyCodec_Decode(buffer, encoding, errors);
503 if (unicode == NULL)
504 goto onError;
505 if (!PyUnicode_Check(unicode)) {
506 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000507 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode->ob_type->tp_name);
509 Py_DECREF(unicode);
510 goto onError;
511 }
512 Py_DECREF(buffer);
513 return unicode;
514
515 onError:
516 Py_XDECREF(buffer);
517 return NULL;
518}
519
520PyObject *PyUnicode_Encode(const Py_UNICODE *s,
521 int size,
522 const char *encoding,
523 const char *errors)
524{
525 PyObject *v, *unicode;
526
527 unicode = PyUnicode_FromUnicode(s, size);
528 if (unicode == NULL)
529 return NULL;
530 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
531 Py_DECREF(unicode);
532 return v;
533}
534
535PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
536 const char *encoding,
537 const char *errors)
538{
539 PyObject *v;
540
541 if (!PyUnicode_Check(unicode)) {
542 PyErr_BadArgument();
543 goto onError;
544 }
Fred Drakee4315f52000-05-09 19:53:39 +0000545
546 if (encoding == NULL)
547 encoding = PyUnicode_GetDefaultEncoding();
548
549 /* Shortcuts for common default encodings */
550 if (errors == NULL) {
551 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000552 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000553 else if (strcmp(encoding, "latin-1") == 0)
554 return PyUnicode_AsLatin1String(unicode);
555 else if (strcmp(encoding, "ascii") == 0)
556 return PyUnicode_AsASCIIString(unicode);
557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558
559 /* Encode via the codec registry */
560 v = PyCodec_Encode(unicode, encoding, errors);
561 if (v == NULL)
562 goto onError;
563 /* XXX Should we really enforce this ? */
564 if (!PyString_Check(v)) {
565 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000566 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 v->ob_type->tp_name);
568 Py_DECREF(v);
569 goto onError;
570 }
571 return v;
572
573 onError:
574 return NULL;
575}
576
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000577PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
578 const char *errors)
579{
580 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
581
582 if (v)
583 return v;
584 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
585 if (v && errors == NULL)
586 ((PyUnicodeObject *)unicode)->defenc = v;
587 return v;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
591{
592 if (!PyUnicode_Check(unicode)) {
593 PyErr_BadArgument();
594 goto onError;
595 }
596 return PyUnicode_AS_UNICODE(unicode);
597
598 onError:
599 return NULL;
600}
601
602int PyUnicode_GetSize(PyObject *unicode)
603{
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_BadArgument();
606 goto onError;
607 }
608 return PyUnicode_GET_SIZE(unicode);
609
610 onError:
611 return -1;
612}
613
Thomas Wouters78890102000-07-22 19:25:51 +0000614const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000615{
616 return unicode_default_encoding;
617}
618
619int PyUnicode_SetDefaultEncoding(const char *encoding)
620{
621 PyObject *v;
622
623 /* Make sure the encoding is valid. As side effect, this also
624 loads the encoding into the codec registry cache. */
625 v = _PyCodec_Lookup(encoding);
626 if (v == NULL)
627 goto onError;
628 Py_DECREF(v);
629 strncpy(unicode_default_encoding,
630 encoding,
631 sizeof(unicode_default_encoding));
632 return 0;
633
634 onError:
635 return -1;
636}
637
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638/* --- UTF-8 Codec -------------------------------------------------------- */
639
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero marks a byte that is not a legal
   prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six bytes;
       0xFE and 0xFF are not valid */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
661
662static
663int utf8_decoding_error(const char **source,
664 Py_UNICODE **dest,
665 const char *errors,
666 const char *details)
667{
668 if ((errors == NULL) ||
669 (strcmp(errors,"strict") == 0)) {
670 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000671 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 details);
673 return -1;
674 }
675 else if (strcmp(errors,"ignore") == 0) {
676 (*source)++;
677 return 0;
678 }
679 else if (strcmp(errors,"replace") == 0) {
680 (*source)++;
681 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
682 (*dest)++;
683 return 0;
684 }
685 else {
686 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000687 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 errors);
689 return -1;
690 }
691}
692
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693PyObject *PyUnicode_DecodeUTF8(const char *s,
694 int size,
695 const char *errors)
696{
697 int n;
698 const char *e;
699 PyUnicodeObject *unicode;
700 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000701 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000702
703 /* Note: size will always be longer than the resulting Unicode
704 character count */
705 unicode = _PyUnicode_New(size);
706 if (!unicode)
707 return NULL;
708 if (size == 0)
709 return (PyObject *)unicode;
710
711 /* Unpack UTF-8 encoded data */
712 p = unicode->str;
713 e = s + size;
714
715 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000716 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717
718 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000719 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720 s++;
721 continue;
722 }
723
724 n = utf8_code_length[ch];
725
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000726 if (s + n > e) {
727 errmsg = "unexpected end of data";
728 goto utf8Error;
729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
731 switch (n) {
732
733 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000734 errmsg = "unexpected code byte";
735 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000736
737 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000738 errmsg = "internal error";
739 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740
741 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000742 if ((s[1] & 0xc0) != 0x80) {
743 errmsg = "invalid data";
744 goto utf8Error;
745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000746 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000747 if (ch < 0x80) {
748 errmsg = "illegal encoding";
749 goto utf8Error;
750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000751 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000752 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753 break;
754
755 case 3:
756 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000757 (s[2] & 0xc0) != 0x80) {
758 errmsg = "invalid data";
759 goto utf8Error;
760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000762 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
763 errmsg = "illegal encoding";
764 goto utf8Error;
765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000766 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000767 *p++ = (Py_UNICODE)ch;
768 break;
769
770 case 4:
771 if ((s[1] & 0xc0) != 0x80 ||
772 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000773 (s[3] & 0xc0) != 0x80) {
774 errmsg = "invalid data";
775 goto utf8Error;
776 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000777 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
778 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
779 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000780 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000781 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000782 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000783 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000784 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000785 errmsg = "illegal encoding";
786 goto utf8Error;
787 }
Fredrik Lundh8f455852001-06-27 18:59:43 +0000788#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000789 *p++ = (Py_UNICODE)ch;
790#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000791 /* compute and append the two surrogates: */
792
793 /* translate from 10000..10FFFF to 0..FFFF */
794 ch -= 0x10000;
795
796 /* high surrogate = top 10 bits added to D800 */
797 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
798
799 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000800 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000801#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802 break;
803
804 default:
805 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000806 errmsg = "unsupported Unicode code range";
807 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 }
809 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000810 continue;
811
812 utf8Error:
813 if (utf8_decoding_error(&s, &p, errors, errmsg))
814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
817 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000818 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819 goto onError;
820
821 return (PyObject *)unicode;
822
823onError:
824 Py_DECREF(unicode);
825 return NULL;
826}
827
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000828/* Not used anymore, now that the encoder supports UTF-16
829 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000830#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831static
832int utf8_encoding_error(const Py_UNICODE **source,
833 char **dest,
834 const char *errors,
835 const char *details)
836{
837 if ((errors == NULL) ||
838 (strcmp(errors,"strict") == 0)) {
839 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000840 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000841 details);
842 return -1;
843 }
844 else if (strcmp(errors,"ignore") == 0) {
845 return 0;
846 }
847 else if (strcmp(errors,"replace") == 0) {
848 **dest = '?';
849 (*dest)++;
850 return 0;
851 }
852 else {
853 PyErr_Format(PyExc_ValueError,
854 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000855 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000856 errors);
857 return -1;
858 }
859}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000860#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000861
862PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
863 int size,
864 const char *errors)
865{
866 PyObject *v;
867 char *p;
868 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000869 Py_UCS4 ch2;
870 unsigned int cbAllocated = 3 * size;
871 unsigned int cbWritten = 0;
872 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000874 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875 if (v == NULL)
876 return NULL;
877 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000878 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879
880 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000881 while (i < size) {
882 Py_UCS4 ch = s[i++];
883 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000885 cbWritten++;
886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000887 else if (ch < 0x0800) {
888 *p++ = 0xc0 | (ch >> 6);
889 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000890 cbWritten += 2;
891 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000892 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000893 /* Check for high surrogate */
894 if (0xD800 <= ch && ch <= 0xDBFF) {
895 if (i != size) {
896 ch2 = s[i];
897 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
898
899 if (cbWritten >= (cbAllocated - 4)) {
900 /* Provide enough room for some more
901 surrogates */
902 cbAllocated += 4*10;
903 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000904 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000905 }
906
907 /* combine the two values */
908 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
909
910 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000911 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000912 i++;
913 cbWritten += 4;
914 }
915 }
916 }
917 else {
918 *p++ = (char)(0xe0 | (ch >> 12));
919 cbWritten += 3;
920 }
921 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
922 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000923 } else {
924 *p++ = 0xf0 | (ch>>18);
925 *p++ = 0x80 | ((ch>>12) & 0x3f);
926 *p++ = 0x80 | ((ch>>6) & 0x3f);
927 *p++ = 0x80 | (ch & 0x3f);
928 cbWritten += 4;
929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930 }
931 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000932 if (_PyString_Resize(&v, p - q))
933 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 return v;
935
936 onError:
937 Py_DECREF(v);
938 return NULL;
939}
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
942{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000943 if (!PyUnicode_Check(unicode)) {
944 PyErr_BadArgument();
945 return NULL;
946 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000947 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
948 PyUnicode_GET_SIZE(unicode),
949 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950}
951
952/* --- UTF-16 Codec ------------------------------------------------------- */
953
954static
Tim Peters772747b2001-08-09 22:21:55 +0000955int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000956 const char *errors,
957 const char *details)
958{
959 if ((errors == NULL) ||
960 (strcmp(errors,"strict") == 0)) {
961 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000962 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 details);
964 return -1;
965 }
966 else if (strcmp(errors,"ignore") == 0) {
967 return 0;
968 }
969 else if (strcmp(errors,"replace") == 0) {
970 if (dest) {
971 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
972 (*dest)++;
973 }
974 return 0;
975 }
976 else {
977 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000978 "UTF-16 decoding error; "
979 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980 errors);
981 return -1;
982 }
983}
984
Tim Peters772747b2001-08-09 22:21:55 +0000985PyObject *
986PyUnicode_DecodeUTF16(const char *s,
987 int size,
988 const char *errors,
989 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000990{
991 PyUnicodeObject *unicode;
992 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +0000993 const unsigned char *q, *e;
994 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000995 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +0000996 /* Offsets from q for retrieving byte pairs in the right order. */
997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
998 int ihi = 1, ilo = 0;
999#else
1000 int ihi = 0, ilo = 1;
1001#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002
1003 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001004 if (size & 1) {
1005 if (utf16_decoding_error(NULL, errors, "truncated data"))
1006 return NULL;
1007 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009
1010 /* Note: size will always be longer than the resulting Unicode
1011 character count */
1012 unicode = _PyUnicode_New(size);
1013 if (!unicode)
1014 return NULL;
1015 if (size == 0)
1016 return (PyObject *)unicode;
1017
1018 /* Unpack UTF-16 encoded data */
1019 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001020 q = (unsigned char *)s;
1021 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022
1023 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001024 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001026 /* Check for BOM marks (U+FEFF) in the input and adjust current
1027 byte order setting accordingly. In native mode, the leading BOM
1028 mark is skipped, in all other modes, it is copied to the output
1029 stream as-is (giving a ZWNBSP character). */
1030 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001031 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001032#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001033 if (bom == 0xFEFF) {
1034 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001035 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001036 }
1037 else if (bom == 0xFFFE) {
1038 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001039 bo = 1;
1040 }
1041#else
Tim Peters772747b2001-08-09 22:21:55 +00001042 if (bom == 0xFEFF) {
1043 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001044 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001045 }
1046 else if (bom == 0xFFFE) {
1047 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001048 bo = -1;
1049 }
1050#endif
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052
Tim Peters772747b2001-08-09 22:21:55 +00001053 if (bo == -1) {
1054 /* force LE */
1055 ihi = 1;
1056 ilo = 0;
1057 }
1058 else if (bo == 1) {
1059 /* force BE */
1060 ihi = 0;
1061 ilo = 1;
1062 }
1063
1064 while (q < e) {
1065 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1066 q += 2;
1067
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 if (ch < 0xD800 || ch > 0xDFFF) {
1069 *p++ = ch;
1070 continue;
1071 }
1072
1073 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001074 if (q >= e) {
1075 errmsg = "unexpected end of data";
1076 goto utf16Error;
1077 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001078 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001079 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1080 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001081 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001082#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001083 *p++ = ch;
1084 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001085#else
1086 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001087#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001088 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001089 }
1090 else {
1091 errmsg = "illegal UTF-16 surrogate";
1092 goto utf16Error;
1093 }
1094
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001096 errmsg = "illegal encoding";
1097 /* Fall through to report the error */
1098
1099 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001100 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001101 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 }
1103
1104 if (byteorder)
1105 *byteorder = bo;
1106
1107 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001108 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 goto onError;
1110
1111 return (PyObject *)unicode;
1112
1113onError:
1114 Py_DECREF(unicode);
1115 return NULL;
1116}
1117
Tim Peters772747b2001-08-09 22:21:55 +00001118PyObject *
1119PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1120 int size,
1121 const char *errors,
1122 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123{
1124 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001125 unsigned char *p;
1126 int i, pairs;
1127 /* Offsets from p for storing byte pairs in the right order. */
1128#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1129 int ihi = 1, ilo = 0;
1130#else
1131 int ihi = 0, ilo = 1;
1132#endif
1133
1134#define STORECHAR(CH) \
1135 do { \
1136 p[ihi] = ((CH) >> 8) & 0xff; \
1137 p[ilo] = (CH) & 0xff; \
1138 p += 2; \
1139 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001141 for (i = pairs = 0; i < size; i++)
1142 if (s[i] >= 0x10000)
1143 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001145 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146 if (v == NULL)
1147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
Tim Peters772747b2001-08-09 22:21:55 +00001149 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001151 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001152 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001153 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001154
1155 if (byteorder == -1) {
1156 /* force LE */
1157 ihi = 1;
1158 ilo = 0;
1159 }
1160 else if (byteorder == 1) {
1161 /* force BE */
1162 ihi = 0;
1163 ilo = 1;
1164 }
1165
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001166 while (size-- > 0) {
1167 Py_UNICODE ch = *s++;
1168 Py_UNICODE ch2 = 0;
1169 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001170 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1171 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 }
Tim Peters772747b2001-08-09 22:21:55 +00001173 STORECHAR(ch);
1174 if (ch2)
1175 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001178#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179}
1180
1181PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1182{
1183 if (!PyUnicode_Check(unicode)) {
1184 PyErr_BadArgument();
1185 return NULL;
1186 }
1187 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1188 PyUnicode_GET_SIZE(unicode),
1189 NULL,
1190 0);
1191}
1192
1193/* --- Unicode Escape Codec ----------------------------------------------- */
1194
1195static
1196int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001197 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 const char *errors,
1199 const char *details)
1200{
1201 if ((errors == NULL) ||
1202 (strcmp(errors,"strict") == 0)) {
1203 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001204 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 details);
1206 return -1;
1207 }
1208 else if (strcmp(errors,"ignore") == 0) {
1209 return 0;
1210 }
1211 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001212 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 return 0;
1214 }
1215 else {
1216 PyErr_Format(PyExc_ValueError,
1217 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001218 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 errors);
1220 return -1;
1221 }
1222}
1223
Fredrik Lundh06d12682001-01-24 07:59:11 +00001224static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1227 int size,
1228 const char *errors)
1229{
1230 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001231 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001233 char* message;
1234 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1235
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 /* Escaped strings will always be longer than the resulting
1237 Unicode string, so we start with size here and then reduce the
1238 length after conversion to the true value. */
1239 v = _PyUnicode_New(size);
1240 if (v == NULL)
1241 goto onError;
1242 if (size == 0)
1243 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001244
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 p = buf = PyUnicode_AS_UNICODE(v);
1246 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 while (s < end) {
1249 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001250 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001251 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252
1253 /* Non-escape characters are interpreted as Unicode ordinals */
1254 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001255 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 continue;
1257 }
1258
1259 /* \ - Escapes */
1260 s++;
1261 switch (*s++) {
1262
1263 /* \x escapes */
1264 case '\n': break;
1265 case '\\': *p++ = '\\'; break;
1266 case '\'': *p++ = '\''; break;
1267 case '\"': *p++ = '\"'; break;
1268 case 'b': *p++ = '\b'; break;
1269 case 'f': *p++ = '\014'; break; /* FF */
1270 case 't': *p++ = '\t'; break;
1271 case 'n': *p++ = '\n'; break;
1272 case 'r': *p++ = '\r'; break;
1273 case 'v': *p++ = '\013'; break; /* VT */
1274 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1275
1276 /* \OOO (octal) escapes */
1277 case '0': case '1': case '2': case '3':
1278 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001279 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001281 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001283 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001285 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 break;
1287
Fredrik Lundhccc74732001-02-18 22:13:49 +00001288 /* hex escapes */
1289 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001291 digits = 2;
1292 message = "truncated \\xXX escape";
1293 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294
Fredrik Lundhccc74732001-02-18 22:13:49 +00001295 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001297 digits = 4;
1298 message = "truncated \\uXXXX escape";
1299 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
Fredrik Lundhccc74732001-02-18 22:13:49 +00001301 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001302 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001303 digits = 8;
1304 message = "truncated \\UXXXXXXXX escape";
1305 hexescape:
1306 chr = 0;
1307 for (i = 0; i < digits; i++) {
1308 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001309 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001310 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001311 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001312 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001313 i++;
1314 break;
1315 }
1316 chr = (chr<<4) & ~0xF;
1317 if (c >= '0' && c <= '9')
1318 chr += c - '0';
1319 else if (c >= 'a' && c <= 'f')
1320 chr += 10 + c - 'a';
1321 else
1322 chr += 10 + c - 'A';
1323 }
1324 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001325 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001326 /* when we get here, chr is a 32-bit unicode character */
1327 if (chr <= 0xffff)
1328 /* UCS-2 character */
1329 *p++ = (Py_UNICODE) chr;
1330 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001331 /* UCS-4 character. Either store directly, or as
1332 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001333#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001334 *p++ = chr;
1335#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001336 chr -= 0x10000L;
1337 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001338 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001339#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001340 } else {
1341 if (unicodeescape_decoding_error(
1342 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001343 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001344 )
1345 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001346 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001347 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001348 break;
1349
1350 /* \N{name} */
1351 case 'N':
1352 message = "malformed \\N character escape";
1353 if (ucnhash_CAPI == NULL) {
1354 /* load the unicode data module */
1355 PyObject *m, *v;
1356 m = PyImport_ImportModule("unicodedata");
1357 if (m == NULL)
1358 goto ucnhashError;
1359 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1360 Py_DECREF(m);
1361 if (v == NULL)
1362 goto ucnhashError;
1363 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1364 Py_DECREF(v);
1365 if (ucnhash_CAPI == NULL)
1366 goto ucnhashError;
1367 }
1368 if (*s == '{') {
1369 const char *start = s+1;
1370 /* look for the closing brace */
1371 while (*s != '}' && s < end)
1372 s++;
1373 if (s > start && s < end && *s == '}') {
1374 /* found a name. look it up in the unicode database */
1375 message = "unknown Unicode character name";
1376 s++;
1377 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1378 goto store;
1379 }
1380 }
1381 if (unicodeescape_decoding_error(&s, &x, errors, message))
1382 goto onError;
1383 *p++ = x;
1384 break;
1385
1386 default:
1387 *p++ = '\\';
1388 *p++ = (unsigned char)s[-1];
1389 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
1391 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001392 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001393 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 return (PyObject *)v;
1395
Fredrik Lundhccc74732001-02-18 22:13:49 +00001396ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001397 PyErr_SetString(
1398 PyExc_UnicodeError,
1399 "\\N escapes not supported (can't load unicodedata module)"
1400 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001401 return NULL;
1402
Fredrik Lundhccc74732001-02-18 22:13:49 +00001403onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404 Py_XDECREF(v);
1405 return NULL;
1406}
1407
1408/* Return a Unicode-Escape string version of the Unicode object.
1409
1410 If quotes is true, the string is enclosed in u"" or u'' quotes as
1411 appropriate.
1412
1413*/
1414
Barry Warsaw51ac5802000-03-20 16:36:48 +00001415static const Py_UNICODE *findchar(const Py_UNICODE *s,
1416 int size,
1417 Py_UNICODE ch);
1418
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419static
1420PyObject *unicodeescape_string(const Py_UNICODE *s,
1421 int size,
1422 int quotes)
1423{
1424 PyObject *repr;
1425 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001427 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001428
1429 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1430 if (repr == NULL)
1431 return NULL;
1432
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001433 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434
1435 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436 *p++ = 'u';
1437 *p++ = (findchar(s, size, '\'') &&
1438 !findchar(s, size, '"')) ? '"' : '\'';
1439 }
1440 while (size-- > 0) {
1441 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001442
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001444 if (quotes &&
1445 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 *p++ = '\\';
1447 *p++ = (char) ch;
1448 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001449
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001450#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001451 /* Map 21-bit characters to '\U00xxxxxx' */
1452 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001453 int offset = p - PyString_AS_STRING(repr);
1454
1455 /* Resize the string if necessary */
1456 if (offset + 12 > PyString_GET_SIZE(repr)) {
1457 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1458 goto onError;
1459 p = PyString_AS_STRING(repr) + offset;
1460 }
1461
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001462 *p++ = '\\';
1463 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001464 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1465 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1466 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1467 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1468 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1469 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1470 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001471 *p++ = hexdigit[ch & 0x0000000F];
1472 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001473 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001474#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001475 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1476 else if (ch >= 0xD800 && ch < 0xDC00) {
1477 Py_UNICODE ch2;
1478 Py_UCS4 ucs;
1479
1480 ch2 = *s++;
1481 size--;
1482 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1483 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1484 *p++ = '\\';
1485 *p++ = 'U';
1486 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1487 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1488 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1489 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1490 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1491 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1492 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1493 *p++ = hexdigit[ucs & 0x0000000F];
1494 continue;
1495 }
1496 /* Fall through: isolated surrogates are copied as-is */
1497 s--;
1498 size++;
1499 }
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001502 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 *p++ = '\\';
1504 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001505 *p++ = hexdigit[(ch >> 12) & 0x000F];
1506 *p++ = hexdigit[(ch >> 8) & 0x000F];
1507 *p++ = hexdigit[(ch >> 4) & 0x000F];
1508 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001510
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001511 /* Map special whitespace to '\t', \n', '\r' */
1512 else if (ch == '\t') {
1513 *p++ = '\\';
1514 *p++ = 't';
1515 }
1516 else if (ch == '\n') {
1517 *p++ = '\\';
1518 *p++ = 'n';
1519 }
1520 else if (ch == '\r') {
1521 *p++ = '\\';
1522 *p++ = 'r';
1523 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001524
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001525 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 else if (ch < ' ' || ch >= 128) {
1527 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001528 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001529 *p++ = hexdigit[(ch >> 4) & 0x000F];
1530 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001532
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 /* Copy everything else as-is */
1534 else
1535 *p++ = (char) ch;
1536 }
1537 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001538 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539
1540 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001541 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001542 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543
1544 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001545
1546 onError:
1547 Py_DECREF(repr);
1548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549}
1550
1551PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1552 int size)
1553{
1554 return unicodeescape_string(s, size, 0);
1555}
1556
1557PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1558{
1559 if (!PyUnicode_Check(unicode)) {
1560 PyErr_BadArgument();
1561 return NULL;
1562 }
1563 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1564 PyUnicode_GET_SIZE(unicode));
1565}
1566
1567/* --- Raw Unicode Escape Codec ------------------------------------------- */
1568
1569PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1570 int size,
1571 const char *errors)
1572{
1573 PyUnicodeObject *v;
1574 Py_UNICODE *p, *buf;
1575 const char *end;
1576 const char *bs;
1577
1578 /* Escaped strings will always be longer than the resulting
1579 Unicode string, so we start with size here and then reduce the
1580 length after conversion to the true value. */
1581 v = _PyUnicode_New(size);
1582 if (v == NULL)
1583 goto onError;
1584 if (size == 0)
1585 return (PyObject *)v;
1586 p = buf = PyUnicode_AS_UNICODE(v);
1587 end = s + size;
1588 while (s < end) {
1589 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001590 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 int i;
1592
1593 /* Non-escape characters are interpreted as Unicode ordinals */
1594 if (*s != '\\') {
1595 *p++ = (unsigned char)*s++;
1596 continue;
1597 }
1598
1599 /* \u-escapes are only interpreted iff the number of leading
1600 backslashes if odd */
1601 bs = s;
1602 for (;s < end;) {
1603 if (*s != '\\')
1604 break;
1605 *p++ = (unsigned char)*s++;
1606 }
1607 if (((s - bs) & 1) == 0 ||
1608 s >= end ||
1609 *s != 'u') {
1610 continue;
1611 }
1612 p--;
1613 s++;
1614
1615 /* \uXXXX with 4 hex digits */
1616 for (x = 0, i = 0; i < 4; i++) {
1617 c = (unsigned char)s[i];
1618 if (!isxdigit(c)) {
1619 if (unicodeescape_decoding_error(&s, &x, errors,
1620 "truncated \\uXXXX"))
1621 goto onError;
1622 i++;
1623 break;
1624 }
1625 x = (x<<4) & ~0xF;
1626 if (c >= '0' && c <= '9')
1627 x += c - '0';
1628 else if (c >= 'a' && c <= 'f')
1629 x += 10 + c - 'a';
1630 else
1631 x += 10 + c - 'A';
1632 }
1633 s += i;
1634 *p++ = x;
1635 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001637 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 return (PyObject *)v;
1639
1640 onError:
1641 Py_XDECREF(v);
1642 return NULL;
1643}
1644
1645PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1646 int size)
1647{
1648 PyObject *repr;
1649 char *p;
1650 char *q;
1651
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001652 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653
1654 repr = PyString_FromStringAndSize(NULL, 6 * size);
1655 if (repr == NULL)
1656 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001657 if (size == 0)
1658 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659
1660 p = q = PyString_AS_STRING(repr);
1661 while (size-- > 0) {
1662 Py_UNICODE ch = *s++;
1663 /* Map 16-bit characters to '\uxxxx' */
1664 if (ch >= 256) {
1665 *p++ = '\\';
1666 *p++ = 'u';
1667 *p++ = hexdigit[(ch >> 12) & 0xf];
1668 *p++ = hexdigit[(ch >> 8) & 0xf];
1669 *p++ = hexdigit[(ch >> 4) & 0xf];
1670 *p++ = hexdigit[ch & 15];
1671 }
1672 /* Copy everything else as-is */
1673 else
1674 *p++ = (char) ch;
1675 }
1676 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001677 if (_PyString_Resize(&repr, p - q))
1678 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679
1680 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001681
1682 onError:
1683 Py_DECREF(repr);
1684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685}
1686
1687PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1688{
1689 if (!PyUnicode_Check(unicode)) {
1690 PyErr_BadArgument();
1691 return NULL;
1692 }
1693 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1694 PyUnicode_GET_SIZE(unicode));
1695}
1696
1697/* --- Latin-1 Codec ------------------------------------------------------ */
1698
1699PyObject *PyUnicode_DecodeLatin1(const char *s,
1700 int size,
1701 const char *errors)
1702{
1703 PyUnicodeObject *v;
1704 Py_UNICODE *p;
1705
1706 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001707 if (size == 1 && *(unsigned char*)s < 256) {
1708 Py_UNICODE r = *(unsigned char*)s;
1709 return PyUnicode_FromUnicode(&r, 1);
1710 }
1711
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 v = _PyUnicode_New(size);
1713 if (v == NULL)
1714 goto onError;
1715 if (size == 0)
1716 return (PyObject *)v;
1717 p = PyUnicode_AS_UNICODE(v);
1718 while (size-- > 0)
1719 *p++ = (unsigned char)*s++;
1720 return (PyObject *)v;
1721
1722 onError:
1723 Py_XDECREF(v);
1724 return NULL;
1725}
1726
1727static
1728int latin1_encoding_error(const Py_UNICODE **source,
1729 char **dest,
1730 const char *errors,
1731 const char *details)
1732{
1733 if ((errors == NULL) ||
1734 (strcmp(errors,"strict") == 0)) {
1735 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001736 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 details);
1738 return -1;
1739 }
1740 else if (strcmp(errors,"ignore") == 0) {
1741 return 0;
1742 }
1743 else if (strcmp(errors,"replace") == 0) {
1744 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001745 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746 return 0;
1747 }
1748 else {
1749 PyErr_Format(PyExc_ValueError,
1750 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001751 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752 errors);
1753 return -1;
1754 }
1755}
1756
1757PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1758 int size,
1759 const char *errors)
1760{
1761 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001762 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001763
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 repr = PyString_FromStringAndSize(NULL, size);
1765 if (repr == NULL)
1766 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001767 if (size == 0)
1768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769
1770 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001771 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 while (size-- > 0) {
1773 Py_UNICODE ch = *p++;
1774 if (ch >= 256) {
1775 if (latin1_encoding_error(&p, &s, errors,
1776 "ordinal not in range(256)"))
1777 goto onError;
1778 }
1779 else
1780 *s++ = (char)ch;
1781 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001782 /* Resize if error handling skipped some characters */
1783 if (s - start < PyString_GET_SIZE(repr))
1784 if (_PyString_Resize(&repr, s - start))
1785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 return repr;
1787
1788 onError:
1789 Py_DECREF(repr);
1790 return NULL;
1791}
1792
1793PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1794{
1795 if (!PyUnicode_Check(unicode)) {
1796 PyErr_BadArgument();
1797 return NULL;
1798 }
1799 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1800 PyUnicode_GET_SIZE(unicode),
1801 NULL);
1802}
1803
1804/* --- 7-bit ASCII Codec -------------------------------------------------- */
1805
1806static
1807int ascii_decoding_error(const char **source,
1808 Py_UNICODE **dest,
1809 const char *errors,
1810 const char *details)
1811{
1812 if ((errors == NULL) ||
1813 (strcmp(errors,"strict") == 0)) {
1814 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001815 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 details);
1817 return -1;
1818 }
1819 else if (strcmp(errors,"ignore") == 0) {
1820 return 0;
1821 }
1822 else if (strcmp(errors,"replace") == 0) {
1823 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1824 (*dest)++;
1825 return 0;
1826 }
1827 else {
1828 PyErr_Format(PyExc_ValueError,
1829 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001830 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 errors);
1832 return -1;
1833 }
1834}
1835
1836PyObject *PyUnicode_DecodeASCII(const char *s,
1837 int size,
1838 const char *errors)
1839{
1840 PyUnicodeObject *v;
1841 Py_UNICODE *p;
1842
1843 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001844 if (size == 1 && *(unsigned char*)s < 128) {
1845 Py_UNICODE r = *(unsigned char*)s;
1846 return PyUnicode_FromUnicode(&r, 1);
1847 }
1848
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 v = _PyUnicode_New(size);
1850 if (v == NULL)
1851 goto onError;
1852 if (size == 0)
1853 return (PyObject *)v;
1854 p = PyUnicode_AS_UNICODE(v);
1855 while (size-- > 0) {
1856 register unsigned char c;
1857
1858 c = (unsigned char)*s++;
1859 if (c < 128)
1860 *p++ = c;
1861 else if (ascii_decoding_error(&s, &p, errors,
1862 "ordinal not in range(128)"))
1863 goto onError;
1864 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001865 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001866 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001867 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 return (PyObject *)v;
1869
1870 onError:
1871 Py_XDECREF(v);
1872 return NULL;
1873}
1874
1875static
1876int ascii_encoding_error(const Py_UNICODE **source,
1877 char **dest,
1878 const char *errors,
1879 const char *details)
1880{
1881 if ((errors == NULL) ||
1882 (strcmp(errors,"strict") == 0)) {
1883 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001884 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 details);
1886 return -1;
1887 }
1888 else if (strcmp(errors,"ignore") == 0) {
1889 return 0;
1890 }
1891 else if (strcmp(errors,"replace") == 0) {
1892 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001893 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894 return 0;
1895 }
1896 else {
1897 PyErr_Format(PyExc_ValueError,
1898 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001899 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 errors);
1901 return -1;
1902 }
1903}
1904
1905PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1906 int size,
1907 const char *errors)
1908{
1909 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001910 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001911
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 repr = PyString_FromStringAndSize(NULL, size);
1913 if (repr == NULL)
1914 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001915 if (size == 0)
1916 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917
1918 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001919 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 while (size-- > 0) {
1921 Py_UNICODE ch = *p++;
1922 if (ch >= 128) {
1923 if (ascii_encoding_error(&p, &s, errors,
1924 "ordinal not in range(128)"))
1925 goto onError;
1926 }
1927 else
1928 *s++ = (char)ch;
1929 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001930 /* Resize if error handling skipped some characters */
1931 if (s - start < PyString_GET_SIZE(repr))
1932 if (_PyString_Resize(&repr, s - start))
1933 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 return repr;
1935
1936 onError:
1937 Py_DECREF(repr);
1938 return NULL;
1939}
1940
1941PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1942{
1943 if (!PyUnicode_Check(unicode)) {
1944 PyErr_BadArgument();
1945 return NULL;
1946 }
1947 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1948 PyUnicode_GET_SIZE(unicode),
1949 NULL);
1950}
1951
Fredrik Lundh30831632001-06-26 15:11:00 +00001952#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001953
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001954/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001955
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001956PyObject *PyUnicode_DecodeMBCS(const char *s,
1957 int size,
1958 const char *errors)
1959{
1960 PyUnicodeObject *v;
1961 Py_UNICODE *p;
1962
1963 /* First get the size of the result */
1964 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001965 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001966 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1967
1968 v = _PyUnicode_New(usize);
1969 if (v == NULL)
1970 return NULL;
1971 if (usize == 0)
1972 return (PyObject *)v;
1973 p = PyUnicode_AS_UNICODE(v);
1974 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1975 Py_DECREF(v);
1976 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1977 }
1978
1979 return (PyObject *)v;
1980}
1981
1982PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1983 int size,
1984 const char *errors)
1985{
1986 PyObject *repr;
1987 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001988 DWORD mbcssize;
1989
1990 /* If there are no characters, bail now! */
1991 if (size==0)
1992 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001993
1994 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001995 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001996 if (mbcssize==0)
1997 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1998
1999 repr = PyString_FromStringAndSize(NULL, mbcssize);
2000 if (repr == NULL)
2001 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002002 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002003 return repr;
2004
2005 /* Do the conversion */
2006 s = PyString_AS_STRING(repr);
2007 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2008 Py_DECREF(repr);
2009 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2010 }
2011 return repr;
2012}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002013
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002014#endif /* MS_WIN32 */
2015
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016/* --- Character Mapping Codec -------------------------------------------- */
2017
2018static
2019int charmap_decoding_error(const char **source,
2020 Py_UNICODE **dest,
2021 const char *errors,
2022 const char *details)
2023{
2024 if ((errors == NULL) ||
2025 (strcmp(errors,"strict") == 0)) {
2026 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002027 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 details);
2029 return -1;
2030 }
2031 else if (strcmp(errors,"ignore") == 0) {
2032 return 0;
2033 }
2034 else if (strcmp(errors,"replace") == 0) {
2035 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2036 (*dest)++;
2037 return 0;
2038 }
2039 else {
2040 PyErr_Format(PyExc_ValueError,
2041 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002042 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 errors);
2044 return -1;
2045 }
2046}
2047
2048PyObject *PyUnicode_DecodeCharmap(const char *s,
2049 int size,
2050 PyObject *mapping,
2051 const char *errors)
2052{
2053 PyUnicodeObject *v;
2054 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002055 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 /* Default to Latin-1 */
2058 if (mapping == NULL)
2059 return PyUnicode_DecodeLatin1(s, size, errors);
2060
2061 v = _PyUnicode_New(size);
2062 if (v == NULL)
2063 goto onError;
2064 if (size == 0)
2065 return (PyObject *)v;
2066 p = PyUnicode_AS_UNICODE(v);
2067 while (size-- > 0) {
2068 unsigned char ch = *s++;
2069 PyObject *w, *x;
2070
2071 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2072 w = PyInt_FromLong((long)ch);
2073 if (w == NULL)
2074 goto onError;
2075 x = PyObject_GetItem(mapping, w);
2076 Py_DECREF(w);
2077 if (x == NULL) {
2078 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002079 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002081 x = Py_None;
2082 Py_INCREF(x);
2083 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002084 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 }
2086
2087 /* Apply mapping */
2088 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002089 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 if (value < 0 || value > 65535) {
2091 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002092 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 Py_DECREF(x);
2094 goto onError;
2095 }
2096 *p++ = (Py_UNICODE)value;
2097 }
2098 else if (x == Py_None) {
2099 /* undefined mapping */
2100 if (charmap_decoding_error(&s, &p, errors,
2101 "character maps to <undefined>")) {
2102 Py_DECREF(x);
2103 goto onError;
2104 }
2105 }
2106 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002107 int targetsize = PyUnicode_GET_SIZE(x);
2108
2109 if (targetsize == 1)
2110 /* 1-1 mapping */
2111 *p++ = *PyUnicode_AS_UNICODE(x);
2112
2113 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002115 if (targetsize > extrachars) {
2116 /* resize first */
2117 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2118 int needed = (targetsize - extrachars) + \
2119 (targetsize << 2);
2120 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002121 if (_PyUnicode_Resize(&v,
2122 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002123 Py_DECREF(x);
2124 goto onError;
2125 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002126 p = PyUnicode_AS_UNICODE(v) + oldpos;
2127 }
2128 Py_UNICODE_COPY(p,
2129 PyUnicode_AS_UNICODE(x),
2130 targetsize);
2131 p += targetsize;
2132 extrachars -= targetsize;
2133 }
2134 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 }
2136 else {
2137 /* wrong return value */
2138 PyErr_SetString(PyExc_TypeError,
2139 "character mapping must return integer, None or unicode");
2140 Py_DECREF(x);
2141 goto onError;
2142 }
2143 Py_DECREF(x);
2144 }
2145 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002146 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 goto onError;
2148 return (PyObject *)v;
2149
2150 onError:
2151 Py_XDECREF(v);
2152 return NULL;
2153}
2154
2155static
2156int charmap_encoding_error(const Py_UNICODE **source,
2157 char **dest,
2158 const char *errors,
2159 const char *details)
2160{
2161 if ((errors == NULL) ||
2162 (strcmp(errors,"strict") == 0)) {
2163 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 details);
2166 return -1;
2167 }
2168 else if (strcmp(errors,"ignore") == 0) {
2169 return 0;
2170 }
2171 else if (strcmp(errors,"replace") == 0) {
2172 **dest = '?';
2173 (*dest)++;
2174 return 0;
2175 }
2176 else {
2177 PyErr_Format(PyExc_ValueError,
2178 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002179 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 errors);
2181 return -1;
2182 }
2183}
2184
2185PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2186 int size,
2187 PyObject *mapping,
2188 const char *errors)
2189{
2190 PyObject *v;
2191 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002192 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193
2194 /* Default to Latin-1 */
2195 if (mapping == NULL)
2196 return PyUnicode_EncodeLatin1(p, size, errors);
2197
2198 v = PyString_FromStringAndSize(NULL, size);
2199 if (v == NULL)
2200 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002201 if (size == 0)
2202 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 s = PyString_AS_STRING(v);
2204 while (size-- > 0) {
2205 Py_UNICODE ch = *p++;
2206 PyObject *w, *x;
2207
2208 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2209 w = PyInt_FromLong((long)ch);
2210 if (w == NULL)
2211 goto onError;
2212 x = PyObject_GetItem(mapping, w);
2213 Py_DECREF(w);
2214 if (x == NULL) {
2215 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002216 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002218 x = Py_None;
2219 Py_INCREF(x);
2220 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002221 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 }
2223
2224 /* Apply mapping */
2225 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002226 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 if (value < 0 || value > 255) {
2228 PyErr_SetString(PyExc_TypeError,
2229 "character mapping must be in range(256)");
2230 Py_DECREF(x);
2231 goto onError;
2232 }
2233 *s++ = (char)value;
2234 }
2235 else if (x == Py_None) {
2236 /* undefined mapping */
2237 if (charmap_encoding_error(&p, &s, errors,
2238 "character maps to <undefined>")) {
2239 Py_DECREF(x);
2240 goto onError;
2241 }
2242 }
2243 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002244 int targetsize = PyString_GET_SIZE(x);
2245
2246 if (targetsize == 1)
2247 /* 1-1 mapping */
2248 *s++ = *PyString_AS_STRING(x);
2249
2250 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002252 if (targetsize > extrachars) {
2253 /* resize first */
2254 int oldpos = (int)(s - PyString_AS_STRING(v));
2255 int needed = (targetsize - extrachars) + \
2256 (targetsize << 2);
2257 extrachars += needed;
2258 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002259 Py_DECREF(x);
2260 goto onError;
2261 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002262 s = PyString_AS_STRING(v) + oldpos;
2263 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002264 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002265 s += targetsize;
2266 extrachars -= targetsize;
2267 }
2268 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
2270 else {
2271 /* wrong return value */
2272 PyErr_SetString(PyExc_TypeError,
2273 "character mapping must return integer, None or unicode");
2274 Py_DECREF(x);
2275 goto onError;
2276 }
2277 Py_DECREF(x);
2278 }
2279 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2280 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2281 goto onError;
2282 return v;
2283
2284 onError:
2285 Py_DECREF(v);
2286 return NULL;
2287}
2288
2289PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2290 PyObject *mapping)
2291{
2292 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2293 PyErr_BadArgument();
2294 return NULL;
2295 }
2296 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2297 PyUnicode_GET_SIZE(unicode),
2298 mapping,
2299 NULL);
2300}
2301
2302static
2303int translate_error(const Py_UNICODE **source,
2304 Py_UNICODE **dest,
2305 const char *errors,
2306 const char *details)
2307{
2308 if ((errors == NULL) ||
2309 (strcmp(errors,"strict") == 0)) {
2310 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002311 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 details);
2313 return -1;
2314 }
2315 else if (strcmp(errors,"ignore") == 0) {
2316 return 0;
2317 }
2318 else if (strcmp(errors,"replace") == 0) {
2319 **dest = '?';
2320 (*dest)++;
2321 return 0;
2322 }
2323 else {
2324 PyErr_Format(PyExc_ValueError,
2325 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002326 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 errors);
2328 return -1;
2329 }
2330}
2331
2332PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2333 int size,
2334 PyObject *mapping,
2335 const char *errors)
2336{
2337 PyUnicodeObject *v;
2338 Py_UNICODE *p;
2339
2340 if (mapping == NULL) {
2341 PyErr_BadArgument();
2342 return NULL;
2343 }
2344
2345 /* Output will never be longer than input */
2346 v = _PyUnicode_New(size);
2347 if (v == NULL)
2348 goto onError;
2349 if (size == 0)
2350 goto done;
2351 p = PyUnicode_AS_UNICODE(v);
2352 while (size-- > 0) {
2353 Py_UNICODE ch = *s++;
2354 PyObject *w, *x;
2355
2356 /* Get mapping */
2357 w = PyInt_FromLong(ch);
2358 if (w == NULL)
2359 goto onError;
2360 x = PyObject_GetItem(mapping, w);
2361 Py_DECREF(w);
2362 if (x == NULL) {
2363 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2364 /* No mapping found: default to 1-1 mapping */
2365 PyErr_Clear();
2366 *p++ = ch;
2367 continue;
2368 }
2369 goto onError;
2370 }
2371
2372 /* Apply mapping */
2373 if (PyInt_Check(x))
2374 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2375 else if (x == Py_None) {
2376 /* undefined mapping */
2377 if (translate_error(&s, &p, errors,
2378 "character maps to <undefined>")) {
2379 Py_DECREF(x);
2380 goto onError;
2381 }
2382 }
2383 else if (PyUnicode_Check(x)) {
2384 if (PyUnicode_GET_SIZE(x) != 1) {
2385 /* 1-n mapping */
2386 PyErr_SetString(PyExc_NotImplementedError,
2387 "1-n mappings are currently not implemented");
2388 Py_DECREF(x);
2389 goto onError;
2390 }
2391 *p++ = *PyUnicode_AS_UNICODE(x);
2392 }
2393 else {
2394 /* wrong return value */
2395 PyErr_SetString(PyExc_TypeError,
2396 "translate mapping must return integer, None or unicode");
2397 Py_DECREF(x);
2398 goto onError;
2399 }
2400 Py_DECREF(x);
2401 }
2402 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002403 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405
2406 done:
2407 return (PyObject *)v;
2408
2409 onError:
2410 Py_XDECREF(v);
2411 return NULL;
2412}
2413
2414PyObject *PyUnicode_Translate(PyObject *str,
2415 PyObject *mapping,
2416 const char *errors)
2417{
2418 PyObject *result;
2419
2420 str = PyUnicode_FromObject(str);
2421 if (str == NULL)
2422 goto onError;
2423 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2424 PyUnicode_GET_SIZE(str),
2425 mapping,
2426 errors);
2427 Py_DECREF(str);
2428 return result;
2429
2430 onError:
2431 Py_XDECREF(str);
2432 return NULL;
2433}
2434
Guido van Rossum9e896b32000-04-05 20:11:21 +00002435/* --- Decimal Encoder ---------------------------------------------------- */
2436
2437int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2438 int length,
2439 char *output,
2440 const char *errors)
2441{
2442 Py_UNICODE *p, *end;
2443
2444 if (output == NULL) {
2445 PyErr_BadArgument();
2446 return -1;
2447 }
2448
2449 p = s;
2450 end = s + length;
2451 while (p < end) {
2452 register Py_UNICODE ch = *p++;
2453 int decimal;
2454
2455 if (Py_UNICODE_ISSPACE(ch)) {
2456 *output++ = ' ';
2457 continue;
2458 }
2459 decimal = Py_UNICODE_TODECIMAL(ch);
2460 if (decimal >= 0) {
2461 *output++ = '0' + decimal;
2462 continue;
2463 }
Guido van Rossumba477042000-04-06 18:18:10 +00002464 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002465 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002466 continue;
2467 }
2468 /* All other characters are considered invalid */
2469 if (errors == NULL || strcmp(errors, "strict") == 0) {
2470 PyErr_SetString(PyExc_ValueError,
2471 "invalid decimal Unicode string");
2472 goto onError;
2473 }
2474 else if (strcmp(errors, "ignore") == 0)
2475 continue;
2476 else if (strcmp(errors, "replace") == 0) {
2477 *output++ = '?';
2478 continue;
2479 }
2480 }
2481 /* 0-terminate the output string */
2482 *output++ = '\0';
2483 return 0;
2484
2485 onError:
2486 return -1;
2487}
2488
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489/* --- Helpers ------------------------------------------------------------ */
2490
2491static
2492int count(PyUnicodeObject *self,
2493 int start,
2494 int end,
2495 PyUnicodeObject *substring)
2496{
2497 int count = 0;
2498
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002499 if (start < 0)
2500 start += self->length;
2501 if (start < 0)
2502 start = 0;
2503 if (end > self->length)
2504 end = self->length;
2505 if (end < 0)
2506 end += self->length;
2507 if (end < 0)
2508 end = 0;
2509
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002510 if (substring->length == 0)
2511 return (end - start + 1);
2512
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 end -= substring->length;
2514
2515 while (start <= end)
2516 if (Py_UNICODE_MATCH(self, start, substring)) {
2517 count++;
2518 start += substring->length;
2519 } else
2520 start++;
2521
2522 return count;
2523}
2524
2525int PyUnicode_Count(PyObject *str,
2526 PyObject *substr,
2527 int start,
2528 int end)
2529{
2530 int result;
2531
2532 str = PyUnicode_FromObject(str);
2533 if (str == NULL)
2534 return -1;
2535 substr = PyUnicode_FromObject(substr);
2536 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002537 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538 return -1;
2539 }
2540
2541 result = count((PyUnicodeObject *)str,
2542 start, end,
2543 (PyUnicodeObject *)substr);
2544
2545 Py_DECREF(str);
2546 Py_DECREF(substr);
2547 return result;
2548}
2549
2550static
2551int findstring(PyUnicodeObject *self,
2552 PyUnicodeObject *substring,
2553 int start,
2554 int end,
2555 int direction)
2556{
2557 if (start < 0)
2558 start += self->length;
2559 if (start < 0)
2560 start = 0;
2561
2562 if (substring->length == 0)
2563 return start;
2564
2565 if (end > self->length)
2566 end = self->length;
2567 if (end < 0)
2568 end += self->length;
2569 if (end < 0)
2570 end = 0;
2571
2572 end -= substring->length;
2573
2574 if (direction < 0) {
2575 for (; end >= start; end--)
2576 if (Py_UNICODE_MATCH(self, end, substring))
2577 return end;
2578 } else {
2579 for (; start <= end; start++)
2580 if (Py_UNICODE_MATCH(self, start, substring))
2581 return start;
2582 }
2583
2584 return -1;
2585}
2586
2587int PyUnicode_Find(PyObject *str,
2588 PyObject *substr,
2589 int start,
2590 int end,
2591 int direction)
2592{
2593 int result;
2594
2595 str = PyUnicode_FromObject(str);
2596 if (str == NULL)
2597 return -1;
2598 substr = PyUnicode_FromObject(substr);
2599 if (substr == NULL) {
2600 Py_DECREF(substr);
2601 return -1;
2602 }
2603
2604 result = findstring((PyUnicodeObject *)str,
2605 (PyUnicodeObject *)substr,
2606 start, end, direction);
2607 Py_DECREF(str);
2608 Py_DECREF(substr);
2609 return result;
2610}
2611
2612static
2613int tailmatch(PyUnicodeObject *self,
2614 PyUnicodeObject *substring,
2615 int start,
2616 int end,
2617 int direction)
2618{
2619 if (start < 0)
2620 start += self->length;
2621 if (start < 0)
2622 start = 0;
2623
2624 if (substring->length == 0)
2625 return 1;
2626
2627 if (end > self->length)
2628 end = self->length;
2629 if (end < 0)
2630 end += self->length;
2631 if (end < 0)
2632 end = 0;
2633
2634 end -= substring->length;
2635 if (end < start)
2636 return 0;
2637
2638 if (direction > 0) {
2639 if (Py_UNICODE_MATCH(self, end, substring))
2640 return 1;
2641 } else {
2642 if (Py_UNICODE_MATCH(self, start, substring))
2643 return 1;
2644 }
2645
2646 return 0;
2647}
2648
2649int PyUnicode_Tailmatch(PyObject *str,
2650 PyObject *substr,
2651 int start,
2652 int end,
2653 int direction)
2654{
2655 int result;
2656
2657 str = PyUnicode_FromObject(str);
2658 if (str == NULL)
2659 return -1;
2660 substr = PyUnicode_FromObject(substr);
2661 if (substr == NULL) {
2662 Py_DECREF(substr);
2663 return -1;
2664 }
2665
2666 result = tailmatch((PyUnicodeObject *)str,
2667 (PyUnicodeObject *)substr,
2668 start, end, direction);
2669 Py_DECREF(str);
2670 Py_DECREF(substr);
2671 return result;
2672}
2673
2674static
2675const Py_UNICODE *findchar(const Py_UNICODE *s,
2676 int size,
2677 Py_UNICODE ch)
2678{
2679 /* like wcschr, but doesn't stop at NULL characters */
2680
2681 while (size-- > 0) {
2682 if (*s == ch)
2683 return s;
2684 s++;
2685 }
2686
2687 return NULL;
2688}
2689
2690/* Apply fixfct filter to the Unicode object self and return a
2691 reference to the modified object */
2692
2693static
2694PyObject *fixup(PyUnicodeObject *self,
2695 int (*fixfct)(PyUnicodeObject *s))
2696{
2697
2698 PyUnicodeObject *u;
2699
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002700 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 if (u == NULL)
2702 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002703
2704 Py_UNICODE_COPY(u->str, self->str, self->length);
2705
Tim Peters7a29bd52001-09-12 03:03:31 +00002706 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 /* fixfct should return TRUE if it modified the buffer. If
2708 FALSE, return a reference to the original buffer instead
2709 (to save space, not time) */
2710 Py_INCREF(self);
2711 Py_DECREF(u);
2712 return (PyObject*) self;
2713 }
2714 return (PyObject*) u;
2715}
2716
2717static
2718int fixupper(PyUnicodeObject *self)
2719{
2720 int len = self->length;
2721 Py_UNICODE *s = self->str;
2722 int status = 0;
2723
2724 while (len-- > 0) {
2725 register Py_UNICODE ch;
2726
2727 ch = Py_UNICODE_TOUPPER(*s);
2728 if (ch != *s) {
2729 status = 1;
2730 *s = ch;
2731 }
2732 s++;
2733 }
2734
2735 return status;
2736}
2737
2738static
2739int fixlower(PyUnicodeObject *self)
2740{
2741 int len = self->length;
2742 Py_UNICODE *s = self->str;
2743 int status = 0;
2744
2745 while (len-- > 0) {
2746 register Py_UNICODE ch;
2747
2748 ch = Py_UNICODE_TOLOWER(*s);
2749 if (ch != *s) {
2750 status = 1;
2751 *s = ch;
2752 }
2753 s++;
2754 }
2755
2756 return status;
2757}
2758
2759static
2760int fixswapcase(PyUnicodeObject *self)
2761{
2762 int len = self->length;
2763 Py_UNICODE *s = self->str;
2764 int status = 0;
2765
2766 while (len-- > 0) {
2767 if (Py_UNICODE_ISUPPER(*s)) {
2768 *s = Py_UNICODE_TOLOWER(*s);
2769 status = 1;
2770 } else if (Py_UNICODE_ISLOWER(*s)) {
2771 *s = Py_UNICODE_TOUPPER(*s);
2772 status = 1;
2773 }
2774 s++;
2775 }
2776
2777 return status;
2778}
2779
2780static
2781int fixcapitalize(PyUnicodeObject *self)
2782{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002783 int len = self->length;
2784 Py_UNICODE *s = self->str;
2785 int status = 0;
2786
2787 if (len == 0)
2788 return 0;
2789 if (Py_UNICODE_ISLOWER(*s)) {
2790 *s = Py_UNICODE_TOUPPER(*s);
2791 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002793 s++;
2794 while (--len > 0) {
2795 if (Py_UNICODE_ISUPPER(*s)) {
2796 *s = Py_UNICODE_TOLOWER(*s);
2797 status = 1;
2798 }
2799 s++;
2800 }
2801 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802}
2803
2804static
2805int fixtitle(PyUnicodeObject *self)
2806{
2807 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2808 register Py_UNICODE *e;
2809 int previous_is_cased;
2810
2811 /* Shortcut for single character strings */
2812 if (PyUnicode_GET_SIZE(self) == 1) {
2813 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2814 if (*p != ch) {
2815 *p = ch;
2816 return 1;
2817 }
2818 else
2819 return 0;
2820 }
2821
2822 e = p + PyUnicode_GET_SIZE(self);
2823 previous_is_cased = 0;
2824 for (; p < e; p++) {
2825 register const Py_UNICODE ch = *p;
2826
2827 if (previous_is_cased)
2828 *p = Py_UNICODE_TOLOWER(ch);
2829 else
2830 *p = Py_UNICODE_TOTITLE(ch);
2831
2832 if (Py_UNICODE_ISLOWER(ch) ||
2833 Py_UNICODE_ISUPPER(ch) ||
2834 Py_UNICODE_ISTITLE(ch))
2835 previous_is_cased = 1;
2836 else
2837 previous_is_cased = 0;
2838 }
2839 return 1;
2840}
2841
2842PyObject *PyUnicode_Join(PyObject *separator,
2843 PyObject *seq)
2844{
2845 Py_UNICODE *sep;
2846 int seplen;
2847 PyUnicodeObject *res = NULL;
2848 int reslen = 0;
2849 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 int sz = 100;
2851 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002852 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853
Tim Peters2cfe3682001-05-05 05:36:48 +00002854 it = PyObject_GetIter(seq);
2855 if (it == NULL)
2856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857
2858 if (separator == NULL) {
2859 Py_UNICODE blank = ' ';
2860 sep = &blank;
2861 seplen = 1;
2862 }
2863 else {
2864 separator = PyUnicode_FromObject(separator);
2865 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002866 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 sep = PyUnicode_AS_UNICODE(separator);
2868 seplen = PyUnicode_GET_SIZE(separator);
2869 }
2870
2871 res = _PyUnicode_New(sz);
2872 if (res == NULL)
2873 goto onError;
2874 p = PyUnicode_AS_UNICODE(res);
2875 reslen = 0;
2876
Tim Peters2cfe3682001-05-05 05:36:48 +00002877 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002879 PyObject *item = PyIter_Next(it);
2880 if (item == NULL) {
2881 if (PyErr_Occurred())
2882 goto onError;
2883 break;
2884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 if (!PyUnicode_Check(item)) {
2886 PyObject *v;
2887 v = PyUnicode_FromObject(item);
2888 Py_DECREF(item);
2889 item = v;
2890 if (item == NULL)
2891 goto onError;
2892 }
2893 itemlen = PyUnicode_GET_SIZE(item);
2894 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002895 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 goto onError;
2897 sz *= 2;
2898 p = PyUnicode_AS_UNICODE(res) + reslen;
2899 }
2900 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002901 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 p += seplen;
2903 reslen += seplen;
2904 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002905 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 p += itemlen;
2907 reslen += itemlen;
2908 Py_DECREF(item);
2909 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002910 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 goto onError;
2912
2913 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002914 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 return (PyObject *)res;
2916
2917 onError:
2918 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002919 Py_XDECREF(res);
2920 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 return NULL;
2922}
2923
2924static
2925PyUnicodeObject *pad(PyUnicodeObject *self,
2926 int left,
2927 int right,
2928 Py_UNICODE fill)
2929{
2930 PyUnicodeObject *u;
2931
2932 if (left < 0)
2933 left = 0;
2934 if (right < 0)
2935 right = 0;
2936
Tim Peters7a29bd52001-09-12 03:03:31 +00002937 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 Py_INCREF(self);
2939 return self;
2940 }
2941
2942 u = _PyUnicode_New(left + self->length + right);
2943 if (u) {
2944 if (left)
2945 Py_UNICODE_FILL(u->str, fill, left);
2946 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2947 if (right)
2948 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2949 }
2950
2951 return u;
2952}
2953
/* Append the slice data[left:right] as a new Unicode object to `list`.

   The expansion relies on names from the enclosing function: a
   `PyObject *str` scratch variable, the target `list`, and an `onError`
   label that is jumped to when creating or appending the slice fails.

   Wrapped in do { } while (0) so the macro behaves as one statement,
   with all arguments parenthesized. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2964
2965static
2966PyObject *split_whitespace(PyUnicodeObject *self,
2967 PyObject *list,
2968 int maxcount)
2969{
2970 register int i;
2971 register int j;
2972 int len = self->length;
2973 PyObject *str;
2974
2975 for (i = j = 0; i < len; ) {
2976 /* find a token */
2977 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2978 i++;
2979 j = i;
2980 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2981 i++;
2982 if (j < i) {
2983 if (maxcount-- <= 0)
2984 break;
2985 SPLIT_APPEND(self->str, j, i);
2986 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2987 i++;
2988 j = i;
2989 }
2990 }
2991 if (j < len) {
2992 SPLIT_APPEND(self->str, j, len);
2993 }
2994 return list;
2995
2996 onError:
2997 Py_DECREF(list);
2998 return NULL;
2999}
3000
3001PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003002 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003{
3004 register int i;
3005 register int j;
3006 int len;
3007 PyObject *list;
3008 PyObject *str;
3009 Py_UNICODE *data;
3010
3011 string = PyUnicode_FromObject(string);
3012 if (string == NULL)
3013 return NULL;
3014 data = PyUnicode_AS_UNICODE(string);
3015 len = PyUnicode_GET_SIZE(string);
3016
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 list = PyList_New(0);
3018 if (!list)
3019 goto onError;
3020
3021 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003022 int eol;
3023
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 /* Find a line and append it */
3025 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3026 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
3028 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003029 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 if (i < len) {
3031 if (data[i] == '\r' && i + 1 < len &&
3032 data[i+1] == '\n')
3033 i += 2;
3034 else
3035 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003036 if (keepends)
3037 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 }
Guido van Rossum86662912000-04-11 15:38:46 +00003039 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 j = i;
3041 }
3042 if (j < len) {
3043 SPLIT_APPEND(data, j, len);
3044 }
3045
3046 Py_DECREF(string);
3047 return list;
3048
3049 onError:
3050 Py_DECREF(list);
3051 Py_DECREF(string);
3052 return NULL;
3053}
3054
3055static
3056PyObject *split_char(PyUnicodeObject *self,
3057 PyObject *list,
3058 Py_UNICODE ch,
3059 int maxcount)
3060{
3061 register int i;
3062 register int j;
3063 int len = self->length;
3064 PyObject *str;
3065
3066 for (i = j = 0; i < len; ) {
3067 if (self->str[i] == ch) {
3068 if (maxcount-- <= 0)
3069 break;
3070 SPLIT_APPEND(self->str, j, i);
3071 i = j = i + 1;
3072 } else
3073 i++;
3074 }
3075 if (j <= len) {
3076 SPLIT_APPEND(self->str, j, len);
3077 }
3078 return list;
3079
3080 onError:
3081 Py_DECREF(list);
3082 return NULL;
3083}
3084
3085static
3086PyObject *split_substring(PyUnicodeObject *self,
3087 PyObject *list,
3088 PyUnicodeObject *substring,
3089 int maxcount)
3090{
3091 register int i;
3092 register int j;
3093 int len = self->length;
3094 int sublen = substring->length;
3095 PyObject *str;
3096
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003097 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 if (Py_UNICODE_MATCH(self, i, substring)) {
3099 if (maxcount-- <= 0)
3100 break;
3101 SPLIT_APPEND(self->str, j, i);
3102 i = j = i + sublen;
3103 } else
3104 i++;
3105 }
3106 if (j <= len) {
3107 SPLIT_APPEND(self->str, j, len);
3108 }
3109 return list;
3110
3111 onError:
3112 Py_DECREF(list);
3113 return NULL;
3114}
3115
3116#undef SPLIT_APPEND
3117
3118static
3119PyObject *split(PyUnicodeObject *self,
3120 PyUnicodeObject *substring,
3121 int maxcount)
3122{
3123 PyObject *list;
3124
3125 if (maxcount < 0)
3126 maxcount = INT_MAX;
3127
3128 list = PyList_New(0);
3129 if (!list)
3130 return NULL;
3131
3132 if (substring == NULL)
3133 return split_whitespace(self,list,maxcount);
3134
3135 else if (substring->length == 1)
3136 return split_char(self,list,substring->str[0],maxcount);
3137
3138 else if (substring->length == 0) {
3139 Py_DECREF(list);
3140 PyErr_SetString(PyExc_ValueError, "empty separator");
3141 return NULL;
3142 }
3143 else
3144 return split_substring(self,list,substring,maxcount);
3145}
3146
3147static
3148PyObject *strip(PyUnicodeObject *self,
3149 int left,
3150 int right)
3151{
3152 Py_UNICODE *p = self->str;
3153 int start = 0;
3154 int end = self->length;
3155
3156 if (left)
3157 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3158 start++;
3159
3160 if (right)
3161 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3162 end--;
3163
Tim Peters7a29bd52001-09-12 03:03:31 +00003164 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 /* couldn't strip anything off, return original string */
3166 Py_INCREF(self);
3167 return (PyObject*) self;
3168 }
3169
3170 return (PyObject*) PyUnicode_FromUnicode(
3171 self->str + start,
3172 end - start
3173 );
3174}
3175
3176static
3177PyObject *replace(PyUnicodeObject *self,
3178 PyUnicodeObject *str1,
3179 PyUnicodeObject *str2,
3180 int maxcount)
3181{
3182 PyUnicodeObject *u;
3183
3184 if (maxcount < 0)
3185 maxcount = INT_MAX;
3186
3187 if (str1->length == 1 && str2->length == 1) {
3188 int i;
3189
3190 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003191 if (!findchar(self->str, self->length, str1->str[0]) &&
3192 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 /* nothing to replace, return original string */
3194 Py_INCREF(self);
3195 u = self;
3196 } else {
3197 Py_UNICODE u1 = str1->str[0];
3198 Py_UNICODE u2 = str2->str[0];
3199
3200 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003201 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 self->length
3203 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003204 if (u != NULL) {
3205 Py_UNICODE_COPY(u->str, self->str,
3206 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 for (i = 0; i < u->length; i++)
3208 if (u->str[i] == u1) {
3209 if (--maxcount < 0)
3210 break;
3211 u->str[i] = u2;
3212 }
3213 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215
3216 } else {
3217 int n, i;
3218 Py_UNICODE *p;
3219
3220 /* replace strings */
3221 n = count(self, 0, self->length, str1);
3222 if (n > maxcount)
3223 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003224 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 /* nothing to replace, return original string */
3226 Py_INCREF(self);
3227 u = self;
3228 } else {
3229 u = _PyUnicode_New(
3230 self->length + n * (str2->length - str1->length));
3231 if (u) {
3232 i = 0;
3233 p = u->str;
3234 while (i <= self->length - str1->length)
3235 if (Py_UNICODE_MATCH(self, i, str1)) {
3236 /* replace string segment */
3237 Py_UNICODE_COPY(p, str2->str, str2->length);
3238 p += str2->length;
3239 i += str1->length;
3240 if (--n <= 0) {
3241 /* copy remaining part */
3242 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3243 break;
3244 }
3245 } else
3246 *p++ = self->str[i++];
3247 }
3248 }
3249 }
3250
3251 return (PyObject *) u;
3252}
3253
3254/* --- Unicode Object Methods --------------------------------------------- */
3255
3256static char title__doc__[] =
3257"S.title() -> unicode\n\
3258\n\
3259Return a titlecased version of S, i.e. words start with title case\n\
3260characters, all remaining cased characters have lower case.";
3261
3262static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003263unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return fixup(self, fixtitle);
3266}
3267
3268static char capitalize__doc__[] =
3269"S.capitalize() -> unicode\n\
3270\n\
3271Return a capitalized version of S, i.e. make the first character\n\
3272have upper case.";
3273
3274static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003275unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 return fixup(self, fixcapitalize);
3278}
3279
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method implementation: split self at whitespace, run each
   word through fixup() with fixcapitalize, and join the words again
   using the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3317
3318static char center__doc__[] =
3319"S.center(width) -> unicode\n\
3320\n\
3321Return S centered in a Unicode string of length width. Padding is done\n\
3322using spaces.";
3323
3324static PyObject *
3325unicode_center(PyUnicodeObject *self, PyObject *args)
3326{
3327 int marg, left;
3328 int width;
3329
3330 if (!PyArg_ParseTuple(args, "i:center", &width))
3331 return NULL;
3332
Tim Peters7a29bd52001-09-12 03:03:31 +00003333 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 Py_INCREF(self);
3335 return (PyObject*) self;
3336 }
3337
3338 marg = width - self->length;
3339 left = marg / 2 + (marg & width & 1);
3340
3341 return (PyObject*) pad(self, left, marg - left, ' ');
3342}
3343
Marc-André Lemburge5034372000-08-08 08:04:29 +00003344#if 0
3345
3346/* This code should go into some future Unicode collation support
3347 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003348 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003349
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003350/* speedy UTF-16 code point order comparison */
3351/* gleaned from: */
3352/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003354static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003355{
3356 0, 0, 0, 0, 0, 0, 0, 0,
3357 0, 0, 0, 0, 0, 0, 0, 0,
3358 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003359 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003360};
3361
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362static int
3363unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3364{
3365 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003366
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 Py_UNICODE *s1 = str1->str;
3368 Py_UNICODE *s2 = str2->str;
3369
3370 len1 = str1->length;
3371 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003374 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003375
3376 c1 = *s1++;
3377 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003378
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003379 if (c1 > (1<<11) * 26)
3380 c1 += utf16Fixup[c1>>11];
3381 if (c2 > (1<<11) * 26)
3382 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003383 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003384
3385 if (c1 != c2)
3386 return (c1 < c2) ? -1 : 1;
3387
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003388 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 }
3390
3391 return (len1 < len2) ? -1 : (len1 != len2);
3392}
3393
Marc-André Lemburge5034372000-08-08 08:04:29 +00003394#else
3395
3396static int
3397unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3398{
3399 register int len1, len2;
3400
3401 Py_UNICODE *s1 = str1->str;
3402 Py_UNICODE *s2 = str2->str;
3403
3404 len1 = str1->length;
3405 len2 = str2->length;
3406
3407 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003408 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003409
Fredrik Lundh45714e92001-06-26 16:39:36 +00003410 c1 = *s1++;
3411 c2 = *s2++;
3412
3413 if (c1 != c2)
3414 return (c1 < c2) ? -1 : 1;
3415
Marc-André Lemburge5034372000-08-08 08:04:29 +00003416 len1--; len2--;
3417 }
3418
3419 return (len1 < len2) ? -1 : (len1 != len2);
3420}
3421
3422#endif
3423
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424int PyUnicode_Compare(PyObject *left,
3425 PyObject *right)
3426{
3427 PyUnicodeObject *u = NULL, *v = NULL;
3428 int result;
3429
3430 /* Coerce the two arguments */
3431 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3432 if (u == NULL)
3433 goto onError;
3434 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3435 if (v == NULL)
3436 goto onError;
3437
Thomas Wouters7e474022000-07-16 12:04:32 +00003438 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003439 if (v == u) {
3440 Py_DECREF(u);
3441 Py_DECREF(v);
3442 return 0;
3443 }
3444
3445 result = unicode_compare(u, v);
3446
3447 Py_DECREF(u);
3448 Py_DECREF(v);
3449 return result;
3450
3451onError:
3452 Py_XDECREF(u);
3453 Py_XDECREF(v);
3454 return -1;
3455}
3456
Guido van Rossum403d68b2000-03-13 15:55:09 +00003457int PyUnicode_Contains(PyObject *container,
3458 PyObject *element)
3459{
3460 PyUnicodeObject *u = NULL, *v = NULL;
3461 int result;
3462 register const Py_UNICODE *p, *e;
3463 register Py_UNICODE ch;
3464
3465 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003466 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003467 if (v == NULL) {
3468 PyErr_SetString(PyExc_TypeError,
3469 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003470 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003471 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003472 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3473 if (u == NULL) {
3474 Py_DECREF(v);
3475 goto onError;
3476 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003477
3478 /* Check v in u */
3479 if (PyUnicode_GET_SIZE(v) != 1) {
3480 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003481 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003482 goto onError;
3483 }
3484 ch = *PyUnicode_AS_UNICODE(v);
3485 p = PyUnicode_AS_UNICODE(u);
3486 e = p + PyUnicode_GET_SIZE(u);
3487 result = 0;
3488 while (p < e) {
3489 if (*p++ == ch) {
3490 result = 1;
3491 break;
3492 }
3493 }
3494
3495 Py_DECREF(u);
3496 Py_DECREF(v);
3497 return result;
3498
3499onError:
3500 Py_XDECREF(u);
3501 Py_XDECREF(v);
3502 return -1;
3503}
3504
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505/* Concat to string or Unicode object giving a new Unicode object. */
3506
3507PyObject *PyUnicode_Concat(PyObject *left,
3508 PyObject *right)
3509{
3510 PyUnicodeObject *u = NULL, *v = NULL, *w;
3511
3512 /* Coerce the two arguments */
3513 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3514 if (u == NULL)
3515 goto onError;
3516 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3517 if (v == NULL)
3518 goto onError;
3519
3520 /* Shortcuts */
3521 if (v == unicode_empty) {
3522 Py_DECREF(v);
3523 return (PyObject *)u;
3524 }
3525 if (u == unicode_empty) {
3526 Py_DECREF(u);
3527 return (PyObject *)v;
3528 }
3529
3530 /* Concat the two Unicode strings */
3531 w = _PyUnicode_New(u->length + v->length);
3532 if (w == NULL)
3533 goto onError;
3534 Py_UNICODE_COPY(w->str, u->str, u->length);
3535 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3536
3537 Py_DECREF(u);
3538 Py_DECREF(v);
3539 return (PyObject *)w;
3540
3541onError:
3542 Py_XDECREF(u);
3543 Py_XDECREF(v);
3544 return NULL;
3545}
3546
3547static char count__doc__[] =
3548"S.count(sub[, start[, end]]) -> int\n\
3549\n\
3550Return the number of occurrences of substring sub in Unicode string\n\
3551S[start:end]. Optional arguments start and end are\n\
3552interpreted as in slice notation.";
3553
3554static PyObject *
3555unicode_count(PyUnicodeObject *self, PyObject *args)
3556{
3557 PyUnicodeObject *substring;
3558 int start = 0;
3559 int end = INT_MAX;
3560 PyObject *result;
3561
Guido van Rossumb8872e62000-05-09 14:14:27 +00003562 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3563 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 return NULL;
3565
3566 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3567 (PyObject *)substring);
3568 if (substring == NULL)
3569 return NULL;
3570
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 if (start < 0)
3572 start += self->length;
3573 if (start < 0)
3574 start = 0;
3575 if (end > self->length)
3576 end = self->length;
3577 if (end < 0)
3578 end += self->length;
3579 if (end < 0)
3580 end = 0;
3581
3582 result = PyInt_FromLong((long) count(self, start, end, substring));
3583
3584 Py_DECREF(substring);
3585 return result;
3586}
3587
3588static char encode__doc__[] =
3589"S.encode([encoding[,errors]]) -> string\n\
3590\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003591Return an encoded string version of S. Default encoding is the current\n\
3592default string encoding. errors may be given to set a different error\n\
3593handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3594a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595
3596static PyObject *
3597unicode_encode(PyUnicodeObject *self, PyObject *args)
3598{
3599 char *encoding = NULL;
3600 char *errors = NULL;
3601 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3602 return NULL;
3603 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3604}
3605
3606static char expandtabs__doc__[] =
3607"S.expandtabs([tabsize]) -> unicode\n\
3608\n\
3609Return a copy of S where all tab characters are expanded using spaces.\n\
3610If tabsize is not given, a tab size of 8 characters is assumed.";
3611
3612static PyObject*
3613unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3614{
3615 Py_UNICODE *e;
3616 Py_UNICODE *p;
3617 Py_UNICODE *q;
3618 int i, j;
3619 PyUnicodeObject *u;
3620 int tabsize = 8;
3621
3622 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3623 return NULL;
3624
Thomas Wouters7e474022000-07-16 12:04:32 +00003625 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 i = j = 0;
3627 e = self->str + self->length;
3628 for (p = self->str; p < e; p++)
3629 if (*p == '\t') {
3630 if (tabsize > 0)
3631 j += tabsize - (j % tabsize);
3632 }
3633 else {
3634 j++;
3635 if (*p == '\n' || *p == '\r') {
3636 i += j;
3637 j = 0;
3638 }
3639 }
3640
3641 /* Second pass: create output string and fill it */
3642 u = _PyUnicode_New(i + j);
3643 if (!u)
3644 return NULL;
3645
3646 j = 0;
3647 q = u->str;
3648
3649 for (p = self->str; p < e; p++)
3650 if (*p == '\t') {
3651 if (tabsize > 0) {
3652 i = tabsize - (j % tabsize);
3653 j += i;
3654 while (i--)
3655 *q++ = ' ';
3656 }
3657 }
3658 else {
3659 j++;
3660 *q++ = *p;
3661 if (*p == '\n' || *p == '\r')
3662 j = 0;
3663 }
3664
3665 return (PyObject*) u;
3666}
3667
3668static char find__doc__[] =
3669"S.find(sub [,start [,end]]) -> int\n\
3670\n\
3671Return the lowest index in S where substring sub is found,\n\
3672such that sub is contained within s[start,end]. Optional\n\
3673arguments start and end are interpreted as in slice notation.\n\
3674\n\
3675Return -1 on failure.";
3676
3677static PyObject *
3678unicode_find(PyUnicodeObject *self, PyObject *args)
3679{
3680 PyUnicodeObject *substring;
3681 int start = 0;
3682 int end = INT_MAX;
3683 PyObject *result;
3684
Guido van Rossumb8872e62000-05-09 14:14:27 +00003685 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3686 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 return NULL;
3688 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3689 (PyObject *)substring);
3690 if (substring == NULL)
3691 return NULL;
3692
3693 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3694
3695 Py_DECREF(substring);
3696 return result;
3697}
3698
3699static PyObject *
3700unicode_getitem(PyUnicodeObject *self, int index)
3701{
3702 if (index < 0 || index >= self->length) {
3703 PyErr_SetString(PyExc_IndexError, "string index out of range");
3704 return NULL;
3705 }
3706
3707 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3708}
3709
3710static long
3711unicode_hash(PyUnicodeObject *self)
3712{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003713 /* Since Unicode objects compare equal to their ASCII string
3714 counterparts, they should use the individual character values
3715 as basis for their hash value. This is needed to assure that
3716 strings and Unicode objects behave in the same way as
3717 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718
Fredrik Lundhdde61642000-07-10 18:27:47 +00003719 register int len;
3720 register Py_UNICODE *p;
3721 register long x;
3722
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 if (self->hash != -1)
3724 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003725 len = PyUnicode_GET_SIZE(self);
3726 p = PyUnicode_AS_UNICODE(self);
3727 x = *p << 7;
3728 while (--len >= 0)
3729 x = (1000003*x) ^ *p++;
3730 x ^= PyUnicode_GET_SIZE(self);
3731 if (x == -1)
3732 x = -2;
3733 self->hash = x;
3734 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735}
3736
3737static char index__doc__[] =
3738"S.index(sub [,start [,end]]) -> int\n\
3739\n\
3740Like S.find() but raise ValueError when the substring is not found.";
3741
3742static PyObject *
3743unicode_index(PyUnicodeObject *self, PyObject *args)
3744{
3745 int result;
3746 PyUnicodeObject *substring;
3747 int start = 0;
3748 int end = INT_MAX;
3749
Guido van Rossumb8872e62000-05-09 14:14:27 +00003750 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3751 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 return NULL;
3753
3754 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3755 (PyObject *)substring);
3756 if (substring == NULL)
3757 return NULL;
3758
3759 result = findstring(self, substring, start, end, 1);
3760
3761 Py_DECREF(substring);
3762 if (result < 0) {
3763 PyErr_SetString(PyExc_ValueError, "substring not found");
3764 return NULL;
3765 }
3766 return PyInt_FromLong(result);
3767}
3768
3769static char islower__doc__[] =
3770"S.islower() -> int\n\
3771\n\
3772Return 1 if all cased characters in S are lowercase and there is\n\
3773at least one cased character in S, 0 otherwise.";
3774
3775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003776unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777{
3778 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3779 register const Py_UNICODE *e;
3780 int cased;
3781
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 /* Shortcut for single character strings */
3783 if (PyUnicode_GET_SIZE(self) == 1)
3784 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3785
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003786 /* Special case for empty strings */
3787 if (PyString_GET_SIZE(self) == 0)
3788 return PyInt_FromLong(0);
3789
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 e = p + PyUnicode_GET_SIZE(self);
3791 cased = 0;
3792 for (; p < e; p++) {
3793 register const Py_UNICODE ch = *p;
3794
3795 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3796 return PyInt_FromLong(0);
3797 else if (!cased && Py_UNICODE_ISLOWER(ch))
3798 cased = 1;
3799 }
3800 return PyInt_FromLong(cased);
3801}
3802
3803static char isupper__doc__[] =
3804"S.isupper() -> int\n\
3805\n\
3806Return 1 if all cased characters in S are uppercase and there is\n\
3807at least one cased character in S, 0 otherwise.";
3808
3809static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003810unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811{
3812 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3813 register const Py_UNICODE *e;
3814 int cased;
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 /* Shortcut for single character strings */
3817 if (PyUnicode_GET_SIZE(self) == 1)
3818 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3819
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003820 /* Special case for empty strings */
3821 if (PyString_GET_SIZE(self) == 0)
3822 return PyInt_FromLong(0);
3823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 e = p + PyUnicode_GET_SIZE(self);
3825 cased = 0;
3826 for (; p < e; p++) {
3827 register const Py_UNICODE ch = *p;
3828
3829 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3830 return PyInt_FromLong(0);
3831 else if (!cased && Py_UNICODE_ISUPPER(ch))
3832 cased = 1;
3833 }
3834 return PyInt_FromLong(cased);
3835}
3836
3837static char istitle__doc__[] =
3838"S.istitle() -> int\n\
3839\n\
3840Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3841may only follow uncased characters and lowercase characters only cased\n\
3842ones. Return 0 otherwise.";
3843
3844static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003845unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846{
3847 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3848 register const Py_UNICODE *e;
3849 int cased, previous_is_cased;
3850
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851 /* Shortcut for single character strings */
3852 if (PyUnicode_GET_SIZE(self) == 1)
3853 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3854 (Py_UNICODE_ISUPPER(*p) != 0));
3855
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003856 /* Special case for empty strings */
3857 if (PyString_GET_SIZE(self) == 0)
3858 return PyInt_FromLong(0);
3859
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 e = p + PyUnicode_GET_SIZE(self);
3861 cased = 0;
3862 previous_is_cased = 0;
3863 for (; p < e; p++) {
3864 register const Py_UNICODE ch = *p;
3865
3866 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3867 if (previous_is_cased)
3868 return PyInt_FromLong(0);
3869 previous_is_cased = 1;
3870 cased = 1;
3871 }
3872 else if (Py_UNICODE_ISLOWER(ch)) {
3873 if (!previous_is_cased)
3874 return PyInt_FromLong(0);
3875 previous_is_cased = 1;
3876 cased = 1;
3877 }
3878 else
3879 previous_is_cased = 0;
3880 }
3881 return PyInt_FromLong(cased);
3882}
3883
3884static char isspace__doc__[] =
3885"S.isspace() -> int\n\
3886\n\
3887Return 1 if there are only whitespace characters in S,\n\
38880 otherwise.";
3889
3890static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003891unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892{
3893 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3894 register const Py_UNICODE *e;
3895
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 /* Shortcut for single character strings */
3897 if (PyUnicode_GET_SIZE(self) == 1 &&
3898 Py_UNICODE_ISSPACE(*p))
3899 return PyInt_FromLong(1);
3900
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003901 /* Special case for empty strings */
3902 if (PyString_GET_SIZE(self) == 0)
3903 return PyInt_FromLong(0);
3904
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 e = p + PyUnicode_GET_SIZE(self);
3906 for (; p < e; p++) {
3907 if (!Py_UNICODE_ISSPACE(*p))
3908 return PyInt_FromLong(0);
3909 }
3910 return PyInt_FromLong(1);
3911}
3912
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003913static char isalpha__doc__[] =
3914"S.isalpha() -> int\n\
3915\n\
3916Return 1 if all characters in S are alphabetic\n\
3917and there is at least one character in S, 0 otherwise.";
3918
3919static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003920unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003921{
3922 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3923 register const Py_UNICODE *e;
3924
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003925 /* Shortcut for single character strings */
3926 if (PyUnicode_GET_SIZE(self) == 1 &&
3927 Py_UNICODE_ISALPHA(*p))
3928 return PyInt_FromLong(1);
3929
3930 /* Special case for empty strings */
3931 if (PyString_GET_SIZE(self) == 0)
3932 return PyInt_FromLong(0);
3933
3934 e = p + PyUnicode_GET_SIZE(self);
3935 for (; p < e; p++) {
3936 if (!Py_UNICODE_ISALPHA(*p))
3937 return PyInt_FromLong(0);
3938 }
3939 return PyInt_FromLong(1);
3940}
3941
3942static char isalnum__doc__[] =
3943"S.isalnum() -> int\n\
3944\n\
3945Return 1 if all characters in S are alphanumeric\n\
3946and there is at least one character in S, 0 otherwise.";
3947
3948static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003949unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003950{
3951 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3952 register const Py_UNICODE *e;
3953
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003954 /* Shortcut for single character strings */
3955 if (PyUnicode_GET_SIZE(self) == 1 &&
3956 Py_UNICODE_ISALNUM(*p))
3957 return PyInt_FromLong(1);
3958
3959 /* Special case for empty strings */
3960 if (PyString_GET_SIZE(self) == 0)
3961 return PyInt_FromLong(0);
3962
3963 e = p + PyUnicode_GET_SIZE(self);
3964 for (; p < e; p++) {
3965 if (!Py_UNICODE_ISALNUM(*p))
3966 return PyInt_FromLong(0);
3967 }
3968 return PyInt_FromLong(1);
3969}
3970
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971static char isdecimal__doc__[] =
3972"S.isdecimal() -> int\n\
3973\n\
3974Return 1 if there are only decimal characters in S,\n\
39750 otherwise.";
3976
3977static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003978unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979{
3980 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3981 register const Py_UNICODE *e;
3982
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 /* Shortcut for single character strings */
3984 if (PyUnicode_GET_SIZE(self) == 1 &&
3985 Py_UNICODE_ISDECIMAL(*p))
3986 return PyInt_FromLong(1);
3987
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003988 /* Special case for empty strings */
3989 if (PyString_GET_SIZE(self) == 0)
3990 return PyInt_FromLong(0);
3991
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 e = p + PyUnicode_GET_SIZE(self);
3993 for (; p < e; p++) {
3994 if (!Py_UNICODE_ISDECIMAL(*p))
3995 return PyInt_FromLong(0);
3996 }
3997 return PyInt_FromLong(1);
3998}
3999
4000static char isdigit__doc__[] =
4001"S.isdigit() -> int\n\
4002\n\
4003Return 1 if there are only digit characters in S,\n\
40040 otherwise.";
4005
4006static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004007unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008{
4009 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4010 register const Py_UNICODE *e;
4011
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 /* Shortcut for single character strings */
4013 if (PyUnicode_GET_SIZE(self) == 1 &&
4014 Py_UNICODE_ISDIGIT(*p))
4015 return PyInt_FromLong(1);
4016
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004017 /* Special case for empty strings */
4018 if (PyString_GET_SIZE(self) == 0)
4019 return PyInt_FromLong(0);
4020
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 e = p + PyUnicode_GET_SIZE(self);
4022 for (; p < e; p++) {
4023 if (!Py_UNICODE_ISDIGIT(*p))
4024 return PyInt_FromLong(0);
4025 }
4026 return PyInt_FromLong(1);
4027}
4028
4029static char isnumeric__doc__[] =
4030"S.isnumeric() -> int\n\
4031\n\
4032Return 1 if there are only numeric characters in S,\n\
40330 otherwise.";
4034
4035static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004036unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037{
4038 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4039 register const Py_UNICODE *e;
4040
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 /* Shortcut for single character strings */
4042 if (PyUnicode_GET_SIZE(self) == 1 &&
4043 Py_UNICODE_ISNUMERIC(*p))
4044 return PyInt_FromLong(1);
4045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004046 /* Special case for empty strings */
4047 if (PyString_GET_SIZE(self) == 0)
4048 return PyInt_FromLong(0);
4049
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050 e = p + PyUnicode_GET_SIZE(self);
4051 for (; p < e; p++) {
4052 if (!Py_UNICODE_ISNUMERIC(*p))
4053 return PyInt_FromLong(0);
4054 }
4055 return PyInt_FromLong(1);
4056}
4057
4058static char join__doc__[] =
4059"S.join(sequence) -> unicode\n\
4060\n\
4061Return a string which is the concatenation of the strings in the\n\
4062sequence. The separator between elements is S.";
4063
4064static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004065unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004067 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068}
4069
4070static int
4071unicode_length(PyUnicodeObject *self)
4072{
4073 return self->length;
4074}
4075
4076static char ljust__doc__[] =
4077"S.ljust(width) -> unicode\n\
4078\n\
4079Return S left justified in a Unicode string of length width. Padding is\n\
4080done using spaces.";
4081
4082static PyObject *
4083unicode_ljust(PyUnicodeObject *self, PyObject *args)
4084{
4085 int width;
4086 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4087 return NULL;
4088
Tim Peters7a29bd52001-09-12 03:03:31 +00004089 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 Py_INCREF(self);
4091 return (PyObject*) self;
4092 }
4093
4094 return (PyObject*) pad(self, 0, width - self->length, ' ');
4095}
4096
4097static char lower__doc__[] =
4098"S.lower() -> unicode\n\
4099\n\
4100Return a copy of the string S converted to lowercase.";
4101
4102static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004103unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 return fixup(self, fixlower);
4106}
4107
4108static char lstrip__doc__[] =
4109"S.lstrip() -> unicode\n\
4110\n\
4111Return a copy of the string S with leading whitespace removed.";
4112
4113static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004114unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 return strip(self, 1, 0);
4117}
4118
4119static PyObject*
4120unicode_repeat(PyUnicodeObject *str, int len)
4121{
4122 PyUnicodeObject *u;
4123 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004124 int nchars;
4125 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126
4127 if (len < 0)
4128 len = 0;
4129
Tim Peters7a29bd52001-09-12 03:03:31 +00004130 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 /* no repeat, return original string */
4132 Py_INCREF(str);
4133 return (PyObject*) str;
4134 }
Tim Peters8f422462000-09-09 06:13:41 +00004135
4136 /* ensure # of chars needed doesn't overflow int and # of bytes
4137 * needed doesn't overflow size_t
4138 */
4139 nchars = len * str->length;
4140 if (len && nchars / len != str->length) {
4141 PyErr_SetString(PyExc_OverflowError,
4142 "repeated string is too long");
4143 return NULL;
4144 }
4145 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4146 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4147 PyErr_SetString(PyExc_OverflowError,
4148 "repeated string is too long");
4149 return NULL;
4150 }
4151 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 if (!u)
4153 return NULL;
4154
4155 p = u->str;
4156
4157 while (len-- > 0) {
4158 Py_UNICODE_COPY(p, str->str, str->length);
4159 p += str->length;
4160 }
4161
4162 return (PyObject*) u;
4163}
4164
4165PyObject *PyUnicode_Replace(PyObject *obj,
4166 PyObject *subobj,
4167 PyObject *replobj,
4168 int maxcount)
4169{
4170 PyObject *self;
4171 PyObject *str1;
4172 PyObject *str2;
4173 PyObject *result;
4174
4175 self = PyUnicode_FromObject(obj);
4176 if (self == NULL)
4177 return NULL;
4178 str1 = PyUnicode_FromObject(subobj);
4179 if (str1 == NULL) {
4180 Py_DECREF(self);
4181 return NULL;
4182 }
4183 str2 = PyUnicode_FromObject(replobj);
4184 if (str2 == NULL) {
4185 Py_DECREF(self);
4186 Py_DECREF(str1);
4187 return NULL;
4188 }
4189 result = replace((PyUnicodeObject *)self,
4190 (PyUnicodeObject *)str1,
4191 (PyUnicodeObject *)str2,
4192 maxcount);
4193 Py_DECREF(self);
4194 Py_DECREF(str1);
4195 Py_DECREF(str2);
4196 return result;
4197}
4198
4199static char replace__doc__[] =
4200"S.replace (old, new[, maxsplit]) -> unicode\n\
4201\n\
4202Return a copy of S with all occurrences of substring\n\
4203old replaced by new. If the optional argument maxsplit is\n\
4204given, only the first maxsplit occurrences are replaced.";
4205
4206static PyObject*
4207unicode_replace(PyUnicodeObject *self, PyObject *args)
4208{
4209 PyUnicodeObject *str1;
4210 PyUnicodeObject *str2;
4211 int maxcount = -1;
4212 PyObject *result;
4213
4214 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4215 return NULL;
4216 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4217 if (str1 == NULL)
4218 return NULL;
4219 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4220 if (str2 == NULL)
4221 return NULL;
4222
4223 result = replace(self, str1, str2, maxcount);
4224
4225 Py_DECREF(str1);
4226 Py_DECREF(str2);
4227 return result;
4228}
4229
4230static
4231PyObject *unicode_repr(PyObject *unicode)
4232{
4233 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4234 PyUnicode_GET_SIZE(unicode),
4235 1);
4236}
4237
4238static char rfind__doc__[] =
4239"S.rfind(sub [,start [,end]]) -> int\n\
4240\n\
4241Return the highest index in S where substring sub is found,\n\
4242such that sub is contained within s[start,end]. Optional\n\
4243arguments start and end are interpreted as in slice notation.\n\
4244\n\
4245Return -1 on failure.";
4246
4247static PyObject *
4248unicode_rfind(PyUnicodeObject *self, PyObject *args)
4249{
4250 PyUnicodeObject *substring;
4251 int start = 0;
4252 int end = INT_MAX;
4253 PyObject *result;
4254
Guido van Rossumb8872e62000-05-09 14:14:27 +00004255 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4256 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 return NULL;
4258 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4259 (PyObject *)substring);
4260 if (substring == NULL)
4261 return NULL;
4262
4263 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4264
4265 Py_DECREF(substring);
4266 return result;
4267}
4268
4269static char rindex__doc__[] =
4270"S.rindex(sub [,start [,end]]) -> int\n\
4271\n\
4272Like S.rfind() but raise ValueError when the substring is not found.";
4273
4274static PyObject *
4275unicode_rindex(PyUnicodeObject *self, PyObject *args)
4276{
4277 int result;
4278 PyUnicodeObject *substring;
4279 int start = 0;
4280 int end = INT_MAX;
4281
Guido van Rossumb8872e62000-05-09 14:14:27 +00004282 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4283 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 return NULL;
4285 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4286 (PyObject *)substring);
4287 if (substring == NULL)
4288 return NULL;
4289
4290 result = findstring(self, substring, start, end, -1);
4291
4292 Py_DECREF(substring);
4293 if (result < 0) {
4294 PyErr_SetString(PyExc_ValueError, "substring not found");
4295 return NULL;
4296 }
4297 return PyInt_FromLong(result);
4298}
4299
4300static char rjust__doc__[] =
4301"S.rjust(width) -> unicode\n\
4302\n\
4303Return S right justified in a Unicode string of length width. Padding is\n\
4304done using spaces.";
4305
4306static PyObject *
4307unicode_rjust(PyUnicodeObject *self, PyObject *args)
4308{
4309 int width;
4310 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4311 return NULL;
4312
Tim Peters7a29bd52001-09-12 03:03:31 +00004313 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 Py_INCREF(self);
4315 return (PyObject*) self;
4316 }
4317
4318 return (PyObject*) pad(self, width - self->length, 0, ' ');
4319}
4320
4321static char rstrip__doc__[] =
4322"S.rstrip() -> unicode\n\
4323\n\
4324Return a copy of the string S with trailing whitespace removed.";
4325
4326static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004327unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 return strip(self, 0, 1);
4330}
4331
4332static PyObject*
4333unicode_slice(PyUnicodeObject *self, int start, int end)
4334{
4335 /* standard clamping */
4336 if (start < 0)
4337 start = 0;
4338 if (end < 0)
4339 end = 0;
4340 if (end > self->length)
4341 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004342 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 /* full slice, return original string */
4344 Py_INCREF(self);
4345 return (PyObject*) self;
4346 }
4347 if (start > end)
4348 start = end;
4349 /* copy slice */
4350 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4351 end - start);
4352}
4353
4354PyObject *PyUnicode_Split(PyObject *s,
4355 PyObject *sep,
4356 int maxsplit)
4357{
4358 PyObject *result;
4359
4360 s = PyUnicode_FromObject(s);
4361 if (s == NULL)
4362 return NULL;
4363 if (sep != NULL) {
4364 sep = PyUnicode_FromObject(sep);
4365 if (sep == NULL) {
4366 Py_DECREF(s);
4367 return NULL;
4368 }
4369 }
4370
4371 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4372
4373 Py_DECREF(s);
4374 Py_XDECREF(sep);
4375 return result;
4376}
4377
4378static char split__doc__[] =
4379"S.split([sep [,maxsplit]]) -> list of strings\n\
4380\n\
4381Return a list of the words in S, using sep as the\n\
4382delimiter string. If maxsplit is given, at most maxsplit\n\
4383splits are done. If sep is not specified, any whitespace string\n\
4384is a separator.";
4385
4386static PyObject*
4387unicode_split(PyUnicodeObject *self, PyObject *args)
4388{
4389 PyObject *substring = Py_None;
4390 int maxcount = -1;
4391
4392 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4393 return NULL;
4394
4395 if (substring == Py_None)
4396 return split(self, NULL, maxcount);
4397 else if (PyUnicode_Check(substring))
4398 return split(self, (PyUnicodeObject *)substring, maxcount);
4399 else
4400 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4401}
4402
4403static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004404"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405\n\
4406Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004407Line breaks are not included in the resulting list unless keepends\n\
4408is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409
4410static PyObject*
4411unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4412{
Guido van Rossum86662912000-04-11 15:38:46 +00004413 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414
Guido van Rossum86662912000-04-11 15:38:46 +00004415 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 return NULL;
4417
Guido van Rossum86662912000-04-11 15:38:46 +00004418 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419}
4420
4421static
4422PyObject *unicode_str(PyUnicodeObject *self)
4423{
Fred Drakee4315f52000-05-09 19:53:39 +00004424 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425}
4426
4427static char strip__doc__[] =
4428"S.strip() -> unicode\n\
4429\n\
4430Return a copy of S with leading and trailing whitespace removed.";
4431
4432static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004433unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 return strip(self, 1, 1);
4436}
4437
4438static char swapcase__doc__[] =
4439"S.swapcase() -> unicode\n\
4440\n\
4441Return a copy of S with uppercase characters converted to lowercase\n\
4442and vice versa.";
4443
4444static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004445unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 return fixup(self, fixswapcase);
4448}
4449
4450static char translate__doc__[] =
4451"S.translate(table) -> unicode\n\
4452\n\
4453Return a copy of the string S, where all characters have been mapped\n\
4454through the given translation table, which must be a mapping of\n\
4455Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4456are left untouched. Characters mapped to None are deleted.";
4457
4458static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004459unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 return PyUnicode_TranslateCharmap(self->str,
4462 self->length,
4463 table,
4464 "ignore");
4465}
4466
4467static char upper__doc__[] =
4468"S.upper() -> unicode\n\
4469\n\
4470Return a copy of S converted to uppercase.";
4471
4472static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004473unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 return fixup(self, fixupper);
4476}
4477
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implements S.zfill(width) (currently compiled out).

   Returns self with a new reference when it is already at least width
   long.  Otherwise left-pads with '0' via pad() and, if the original
   first character was '+' or '-', moves that sign to the front of the
   padded result.  Returns NULL on argument or pad() failure. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* pad() can fail; do not touch u->str in that case. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4513
#if 0
/* Debugging aid (compiled out): returns the current value of
   unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
4521
4522static char startswith__doc__[] =
4523"S.startswith(prefix[, start[, end]]) -> int\n\
4524\n\
4525Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4526optional start, test S beginning at that position. With optional end, stop\n\
4527comparing S at that position.";
4528
4529static PyObject *
4530unicode_startswith(PyUnicodeObject *self,
4531 PyObject *args)
4532{
4533 PyUnicodeObject *substring;
4534 int start = 0;
4535 int end = INT_MAX;
4536 PyObject *result;
4537
Guido van Rossumb8872e62000-05-09 14:14:27 +00004538 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4539 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 return NULL;
4541 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4542 (PyObject *)substring);
4543 if (substring == NULL)
4544 return NULL;
4545
4546 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4547
4548 Py_DECREF(substring);
4549 return result;
4550}
4551
4552
4553static char endswith__doc__[] =
4554"S.endswith(suffix[, start[, end]]) -> int\n\
4555\n\
4556Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4557optional start, test S beginning at that position. With optional end, stop\n\
4558comparing S at that position.";
4559
4560static PyObject *
4561unicode_endswith(PyUnicodeObject *self,
4562 PyObject *args)
4563{
4564 PyUnicodeObject *substring;
4565 int start = 0;
4566 int end = INT_MAX;
4567 PyObject *result;
4568
Guido van Rossumb8872e62000-05-09 14:14:27 +00004569 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4570 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 return NULL;
4572 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4573 (PyObject *)substring);
4574 if (substring == NULL)
4575 return NULL;
4576
4577 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4578
4579 Py_DECREF(substring);
4580 return result;
4581}
4582
4583
4584static PyMethodDef unicode_methods[] = {
4585
4586 /* Order is according to common usage: often used methods should
4587 appear first, since lookup is done sequentially. */
4588
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004589 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4590 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4591 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4592 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4593 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4594 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4595 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4596 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4597 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4598 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4599 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4600 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4601 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4602 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4603/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4604 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4605 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4606 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4607 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4608 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4609 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4610 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4611 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4612 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4613 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4614 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4615 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4616 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4617 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4618 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4619 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4620 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4621 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4622 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4623 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004625 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4626 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627#endif
4628
4629#if 0
4630 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004631 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632#endif
4633
4634 {NULL, NULL}
4635};
4636
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637static PySequenceMethods unicode_as_sequence = {
4638 (inquiry) unicode_length, /* sq_length */
4639 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4640 (intargfunc) unicode_repeat, /* sq_repeat */
4641 (intargfunc) unicode_getitem, /* sq_item */
4642 (intintargfunc) unicode_slice, /* sq_slice */
4643 0, /* sq_ass_item */
4644 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004645 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646};
4647
4648static int
4649unicode_buffer_getreadbuf(PyUnicodeObject *self,
4650 int index,
4651 const void **ptr)
4652{
4653 if (index != 0) {
4654 PyErr_SetString(PyExc_SystemError,
4655 "accessing non-existent unicode segment");
4656 return -1;
4657 }
4658 *ptr = (void *) self->str;
4659 return PyUnicode_GET_DATA_SIZE(self);
4660}
4661
4662static int
4663unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4664 const void **ptr)
4665{
4666 PyErr_SetString(PyExc_TypeError,
4667 "cannot use unicode as modifyable buffer");
4668 return -1;
4669}
4670
4671static int
4672unicode_buffer_getsegcount(PyUnicodeObject *self,
4673 int *lenp)
4674{
4675 if (lenp)
4676 *lenp = PyUnicode_GET_DATA_SIZE(self);
4677 return 1;
4678}
4679
4680static int
4681unicode_buffer_getcharbuf(PyUnicodeObject *self,
4682 int index,
4683 const void **ptr)
4684{
4685 PyObject *str;
4686
4687 if (index != 0) {
4688 PyErr_SetString(PyExc_SystemError,
4689 "accessing non-existent unicode segment");
4690 return -1;
4691 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004692 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 if (str == NULL)
4694 return -1;
4695 *ptr = (void *) PyString_AS_STRING(str);
4696 return PyString_GET_SIZE(str);
4697}
4698
4699/* Helpers for PyUnicode_Format() */
4700
4701static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004702getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703{
4704 int argidx = *p_argidx;
4705 if (argidx < arglen) {
4706 (*p_argidx)++;
4707 if (arglen < 0)
4708 return args;
4709 else
4710 return PyTuple_GetItem(args, argidx);
4711 }
4712 PyErr_SetString(PyExc_TypeError,
4713 "not enough arguments for format string");
4714 return NULL;
4715}
4716
/* Bit flags passed in the `flags` argument of the format helpers.
   Each occupies its own bit so they can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4722
4723static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725{
4726 register int i;
4727 int len;
4728 va_list va;
4729 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731
4732 /* First, format the string as char array, then expand to Py_UNICODE
4733 array. */
4734 charbuffer = (char *)buffer;
4735 len = vsprintf(charbuffer, format, va);
4736 for (i = len - 1; i >= 0; i--)
4737 buffer[i] = (Py_UNICODE) charbuffer[i];
4738
4739 va_end(va);
4740 return len;
4741}
4742
4743static int
4744formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004745 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 int flags,
4747 int prec,
4748 int type,
4749 PyObject *v)
4750{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004751 /* fmt = '%#.' + `prec` + `type`
4752 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 char fmt[20];
4754 double x;
4755
4756 x = PyFloat_AsDouble(v);
4757 if (x == -1.0 && PyErr_Occurred())
4758 return -1;
4759 if (prec < 0)
4760 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4762 type = 'g';
4763 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004764 /* worst case length calc to ensure no buffer overrun:
4765 fmt = %#.<prec>g
4766 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4767 for any double rep.)
4768 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4769 If prec=0 the effective precision is 1 (the leading digit is
4770 always given), therefore increase by one to 10+prec. */
4771 if (buflen <= (size_t)10 + (size_t)prec) {
4772 PyErr_SetString(PyExc_OverflowError,
4773 "formatted float is too long (precision too long?)");
4774 return -1;
4775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 return usprintf(buf, fmt, x);
4777}
4778
Tim Peters38fd5b62000-09-21 05:43:11 +00004779static PyObject*
4780formatlong(PyObject *val, int flags, int prec, int type)
4781{
4782 char *buf;
4783 int i, len;
4784 PyObject *str; /* temporary string object. */
4785 PyUnicodeObject *result;
4786
4787 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4788 if (!str)
4789 return NULL;
4790 result = _PyUnicode_New(len);
4791 for (i = 0; i < len; i++)
4792 result->str[i] = buf[i];
4793 result->str[len] = 0;
4794 Py_DECREF(str);
4795 return (PyObject*)result;
4796}
4797
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798static int
4799formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004800 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 int flags,
4802 int prec,
4803 int type,
4804 PyObject *v)
4805{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004806 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004807 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4808 + 1 + 1 = 24*/
4809 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004811 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812
4813 x = PyInt_AsLong(v);
4814 if (x == -1 && PyErr_Occurred())
4815 return -1;
4816 if (prec < 0)
4817 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004818 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4819 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4820 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4821 PyErr_SetString(PyExc_OverflowError,
4822 "formatted integer is too long (precision too long?)");
4823 return -1;
4824 }
Tim Petersfff53252001-04-12 18:38:48 +00004825 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4826 * but we want it (for consistency with other %#x conversions, and
4827 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004828 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4829 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4830 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004831 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004832 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4833 /* Only way to know what the platform does is to try it. */
4834 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4835 if (fmt[1] != (char)type) {
4836 /* Supply our own leading 0x/0X -- needed under std C */
4837 use_native_c_format = 0;
4838 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4839 }
4840 }
4841 if (use_native_c_format)
4842 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 return usprintf(buf, fmt, x);
4844}
4845
4846static int
4847formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004848 size_t buflen,
4849 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004851 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004852 if (PyUnicode_Check(v)) {
4853 if (PyUnicode_GET_SIZE(v) != 1)
4854 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004858 else if (PyString_Check(v)) {
4859 if (PyString_GET_SIZE(v) != 1)
4860 goto onError;
4861 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863
4864 else {
4865 /* Integer input truncated to a character */
4866 long x;
4867 x = PyInt_AsLong(v);
4868 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 buf[0] = (char) x;
4871 }
4872 buf[1] = '\0';
4873 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004874
4875 onError:
4876 PyErr_SetString(PyExc_TypeError,
4877 "%c requires int or char");
4878 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879}
4880
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is parenthesized so that the macro behaves as a single
   operand in any expression (for example under sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
4890
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891PyObject *PyUnicode_Format(PyObject *format,
4892 PyObject *args)
4893{
4894 Py_UNICODE *fmt, *res;
4895 int fmtcnt, rescnt, reslen, arglen, argidx;
4896 int args_owned = 0;
4897 PyUnicodeObject *result = NULL;
4898 PyObject *dict = NULL;
4899 PyObject *uformat;
4900
4901 if (format == NULL || args == NULL) {
4902 PyErr_BadInternalCall();
4903 return NULL;
4904 }
4905 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004906 if (uformat == NULL)
4907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 fmt = PyUnicode_AS_UNICODE(uformat);
4909 fmtcnt = PyUnicode_GET_SIZE(uformat);
4910
4911 reslen = rescnt = fmtcnt + 100;
4912 result = _PyUnicode_New(reslen);
4913 if (result == NULL)
4914 goto onError;
4915 res = PyUnicode_AS_UNICODE(result);
4916
4917 if (PyTuple_Check(args)) {
4918 arglen = PyTuple_Size(args);
4919 argidx = 0;
4920 }
4921 else {
4922 arglen = -1;
4923 argidx = -2;
4924 }
4925 if (args->ob_type->tp_as_mapping)
4926 dict = args;
4927
4928 while (--fmtcnt >= 0) {
4929 if (*fmt != '%') {
4930 if (--rescnt < 0) {
4931 rescnt = fmtcnt + 100;
4932 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004933 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 return NULL;
4935 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4936 --rescnt;
4937 }
4938 *res++ = *fmt++;
4939 }
4940 else {
4941 /* Got a format specifier */
4942 int flags = 0;
4943 int width = -1;
4944 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 Py_UNICODE c = '\0';
4946 Py_UNICODE fill;
4947 PyObject *v = NULL;
4948 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004949 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 Py_UNICODE sign;
4951 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004952 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953
4954 fmt++;
4955 if (*fmt == '(') {
4956 Py_UNICODE *keystart;
4957 int keylen;
4958 PyObject *key;
4959 int pcount = 1;
4960
4961 if (dict == NULL) {
4962 PyErr_SetString(PyExc_TypeError,
4963 "format requires a mapping");
4964 goto onError;
4965 }
4966 ++fmt;
4967 --fmtcnt;
4968 keystart = fmt;
4969 /* Skip over balanced parentheses */
4970 while (pcount > 0 && --fmtcnt >= 0) {
4971 if (*fmt == ')')
4972 --pcount;
4973 else if (*fmt == '(')
4974 ++pcount;
4975 fmt++;
4976 }
4977 keylen = fmt - keystart - 1;
4978 if (fmtcnt < 0 || pcount > 0) {
4979 PyErr_SetString(PyExc_ValueError,
4980 "incomplete format key");
4981 goto onError;
4982 }
Fred Drakee4315f52000-05-09 19:53:39 +00004983 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 then looked up since Python uses strings to hold
4985 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004986 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 key = PyUnicode_EncodeUTF8(keystart,
4988 keylen,
4989 NULL);
4990 if (key == NULL)
4991 goto onError;
4992 if (args_owned) {
4993 Py_DECREF(args);
4994 args_owned = 0;
4995 }
4996 args = PyObject_GetItem(dict, key);
4997 Py_DECREF(key);
4998 if (args == NULL) {
4999 goto onError;
5000 }
5001 args_owned = 1;
5002 arglen = -1;
5003 argidx = -2;
5004 }
5005 while (--fmtcnt >= 0) {
5006 switch (c = *fmt++) {
5007 case '-': flags |= F_LJUST; continue;
5008 case '+': flags |= F_SIGN; continue;
5009 case ' ': flags |= F_BLANK; continue;
5010 case '#': flags |= F_ALT; continue;
5011 case '0': flags |= F_ZERO; continue;
5012 }
5013 break;
5014 }
5015 if (c == '*') {
5016 v = getnextarg(args, arglen, &argidx);
5017 if (v == NULL)
5018 goto onError;
5019 if (!PyInt_Check(v)) {
5020 PyErr_SetString(PyExc_TypeError,
5021 "* wants int");
5022 goto onError;
5023 }
5024 width = PyInt_AsLong(v);
5025 if (width < 0) {
5026 flags |= F_LJUST;
5027 width = -width;
5028 }
5029 if (--fmtcnt >= 0)
5030 c = *fmt++;
5031 }
5032 else if (c >= '0' && c <= '9') {
5033 width = c - '0';
5034 while (--fmtcnt >= 0) {
5035 c = *fmt++;
5036 if (c < '0' || c > '9')
5037 break;
5038 if ((width*10) / 10 != width) {
5039 PyErr_SetString(PyExc_ValueError,
5040 "width too big");
5041 goto onError;
5042 }
5043 width = width*10 + (c - '0');
5044 }
5045 }
5046 if (c == '.') {
5047 prec = 0;
5048 if (--fmtcnt >= 0)
5049 c = *fmt++;
5050 if (c == '*') {
5051 v = getnextarg(args, arglen, &argidx);
5052 if (v == NULL)
5053 goto onError;
5054 if (!PyInt_Check(v)) {
5055 PyErr_SetString(PyExc_TypeError,
5056 "* wants int");
5057 goto onError;
5058 }
5059 prec = PyInt_AsLong(v);
5060 if (prec < 0)
5061 prec = 0;
5062 if (--fmtcnt >= 0)
5063 c = *fmt++;
5064 }
5065 else if (c >= '0' && c <= '9') {
5066 prec = c - '0';
5067 while (--fmtcnt >= 0) {
5068 c = Py_CHARMASK(*fmt++);
5069 if (c < '0' || c > '9')
5070 break;
5071 if ((prec*10) / 10 != prec) {
5072 PyErr_SetString(PyExc_ValueError,
5073 "prec too big");
5074 goto onError;
5075 }
5076 prec = prec*10 + (c - '0');
5077 }
5078 }
5079 } /* prec */
5080 if (fmtcnt >= 0) {
5081 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 if (--fmtcnt >= 0)
5083 c = *fmt++;
5084 }
5085 }
5086 if (fmtcnt < 0) {
5087 PyErr_SetString(PyExc_ValueError,
5088 "incomplete format");
5089 goto onError;
5090 }
5091 if (c != '%') {
5092 v = getnextarg(args, arglen, &argidx);
5093 if (v == NULL)
5094 goto onError;
5095 }
5096 sign = 0;
5097 fill = ' ';
5098 switch (c) {
5099
5100 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005101 pbuf = formatbuf;
5102 /* presume that buffer length is at least 1 */
5103 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 len = 1;
5105 break;
5106
5107 case 's':
5108 case 'r':
5109 if (PyUnicode_Check(v) && c == 's') {
5110 temp = v;
5111 Py_INCREF(temp);
5112 }
5113 else {
5114 PyObject *unicode;
5115 if (c == 's')
5116 temp = PyObject_Str(v);
5117 else
5118 temp = PyObject_Repr(v);
5119 if (temp == NULL)
5120 goto onError;
5121 if (!PyString_Check(temp)) {
5122 /* XXX Note: this should never happen, since
5123 PyObject_Repr() and PyObject_Str() assure
5124 this */
5125 Py_DECREF(temp);
5126 PyErr_SetString(PyExc_TypeError,
5127 "%s argument has non-string str()");
5128 goto onError;
5129 }
Fred Drakee4315f52000-05-09 19:53:39 +00005130 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005132 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 "strict");
5134 Py_DECREF(temp);
5135 temp = unicode;
5136 if (temp == NULL)
5137 goto onError;
5138 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005139 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 len = PyUnicode_GET_SIZE(temp);
5141 if (prec >= 0 && len > prec)
5142 len = prec;
5143 break;
5144
5145 case 'i':
5146 case 'd':
5147 case 'u':
5148 case 'o':
5149 case 'x':
5150 case 'X':
5151 if (c == 'i')
5152 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005153 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005154 temp = formatlong(v, flags, prec, c);
5155 if (!temp)
5156 goto onError;
5157 pbuf = PyUnicode_AS_UNICODE(temp);
5158 len = PyUnicode_GET_SIZE(temp);
5159 /* unbounded ints can always produce
5160 a sign character! */
5161 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005163 else {
5164 pbuf = formatbuf;
5165 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5166 flags, prec, c, v);
5167 if (len < 0)
5168 goto onError;
5169 /* only d conversion is signed */
5170 sign = c == 'd';
5171 }
5172 if (flags & F_ZERO)
5173 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 break;
5175
5176 case 'e':
5177 case 'E':
5178 case 'f':
5179 case 'g':
5180 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005181 pbuf = formatbuf;
5182 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5183 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 if (len < 0)
5185 goto onError;
5186 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005187 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 fill = '0';
5189 break;
5190
5191 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005192 pbuf = formatbuf;
5193 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 if (len < 0)
5195 goto onError;
5196 break;
5197
5198 default:
5199 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005200 "unsupported format character '%c' (0x%x) "
5201 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005202 (31<=c && c<=126) ? c : '?',
5203 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 goto onError;
5205 }
5206 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005207 if (*pbuf == '-' || *pbuf == '+') {
5208 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 len--;
5210 }
5211 else if (flags & F_SIGN)
5212 sign = '+';
5213 else if (flags & F_BLANK)
5214 sign = ' ';
5215 else
5216 sign = 0;
5217 }
5218 if (width < len)
5219 width = len;
5220 if (rescnt < width + (sign != 0)) {
5221 reslen -= rescnt;
5222 rescnt = width + fmtcnt + 100;
5223 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005224 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 return NULL;
5226 res = PyUnicode_AS_UNICODE(result)
5227 + reslen - rescnt;
5228 }
5229 if (sign) {
5230 if (fill != ' ')
5231 *res++ = sign;
5232 rescnt--;
5233 if (width > len)
5234 width--;
5235 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005236 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5237 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005238 assert(pbuf[1] == c);
5239 if (fill != ' ') {
5240 *res++ = *pbuf++;
5241 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005242 }
Tim Petersfff53252001-04-12 18:38:48 +00005243 rescnt -= 2;
5244 width -= 2;
5245 if (width < 0)
5246 width = 0;
5247 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 if (width > len && !(flags & F_LJUST)) {
5250 do {
5251 --rescnt;
5252 *res++ = fill;
5253 } while (--width > len);
5254 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005255 if (fill == ' ') {
5256 if (sign)
5257 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005258 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005259 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005260 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005261 *res++ = *pbuf++;
5262 *res++ = *pbuf++;
5263 }
5264 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005265 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 res += len;
5267 rescnt -= len;
5268 while (--width >= len) {
5269 --rescnt;
5270 *res++ = ' ';
5271 }
5272 if (dict && (argidx < arglen) && c != '%') {
5273 PyErr_SetString(PyExc_TypeError,
5274 "not all arguments converted");
5275 goto onError;
5276 }
5277 Py_XDECREF(temp);
5278 } /* '%' */
5279 } /* until end */
5280 if (argidx < arglen && !dict) {
5281 PyErr_SetString(PyExc_TypeError,
5282 "not all arguments converted");
5283 goto onError;
5284 }
5285
5286 if (args_owned) {
5287 Py_DECREF(args);
5288 }
5289 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005290 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005291 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 return (PyObject *)result;
5293
5294 onError:
5295 Py_XDECREF(result);
5296 Py_DECREF(uformat);
5297 if (args_owned) {
5298 Py_DECREF(args);
5299 }
5300 return NULL;
5301}
5302
5303static PyBufferProcs unicode_as_buffer = {
5304 (getreadbufferproc) unicode_buffer_getreadbuf,
5305 (getwritebufferproc) unicode_buffer_getwritebuf,
5306 (getsegcountproc) unicode_buffer_getsegcount,
5307 (getcharbufferproc) unicode_buffer_getcharbuf,
5308};
5309
Guido van Rossume023fe02001-08-30 03:12:59 +00005310staticforward PyObject *
5311unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5312
Tim Peters6d6c1a32001-08-02 04:15:00 +00005313static PyObject *
5314unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5315{
5316 PyObject *x = NULL;
5317 static char *kwlist[] = {"string", "encoding", "errors", 0};
5318 char *encoding = NULL;
5319 char *errors = NULL;
5320
Guido van Rossume023fe02001-08-30 03:12:59 +00005321 if (type != &PyUnicode_Type)
5322 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005323 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5324 kwlist, &x, &encoding, &errors))
5325 return NULL;
5326 if (x == NULL)
5327 return (PyObject *)_PyUnicode_New(0);
5328 return PyUnicode_FromEncodedObject(x, encoding, errors);
5329}
5330
Guido van Rossume023fe02001-08-30 03:12:59 +00005331static PyObject *
5332unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5333{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005334 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005335 int n;
5336
5337 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5338 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5339 if (tmp == NULL)
5340 return NULL;
5341 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005342 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5343 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005344 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005345 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5346 if (pnew->str == NULL) {
5347 _Py_ForgetReference((PyObject *)pnew);
5348 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005349 return NULL;
5350 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005351 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5352 pnew->length = n;
5353 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005354 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005355 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005356}
5357
/* Documentation string of the unicode type (used as tp_doc below). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365PyTypeObject PyUnicode_Type = {
5366 PyObject_HEAD_INIT(&PyType_Type)
5367 0, /* ob_size */
5368 "unicode", /* tp_name */
5369 sizeof(PyUnicodeObject), /* tp_size */
5370 0, /* tp_itemsize */
5371 /* Slots */
5372 (destructor)_PyUnicode_Free, /* tp_dealloc */
5373 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005374 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 0, /* tp_setattr */
5376 (cmpfunc) unicode_compare, /* tp_compare */
5377 (reprfunc) unicode_repr, /* tp_repr */
5378 0, /* tp_as_number */
5379 &unicode_as_sequence, /* tp_as_sequence */
5380 0, /* tp_as_mapping */
5381 (hashfunc) unicode_hash, /* tp_hash*/
5382 0, /* tp_call*/
5383 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005384 PyObject_GenericGetAttr, /* tp_getattro */
5385 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005387 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005388 unicode_doc, /* tp_doc */
5389 0, /* tp_traverse */
5390 0, /* tp_clear */
5391 0, /* tp_richcompare */
5392 0, /* tp_weaklistoffset */
5393 0, /* tp_iter */
5394 0, /* tp_iternext */
5395 unicode_methods, /* tp_methods */
5396 0, /* tp_members */
5397 0, /* tp_getset */
5398 0, /* tp_base */
5399 0, /* tp_dict */
5400 0, /* tp_descr_get */
5401 0, /* tp_descr_set */
5402 0, /* tp_dictoffset */
5403 0, /* tp_init */
5404 0, /* tp_alloc */
5405 unicode_new, /* tp_new */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406};
5407
5408/* Initialize the Unicode implementation */
5409
Thomas Wouters78890102000-07-22 19:25:51 +00005410void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005412 int i;
5413
Fred Drakee4315f52000-05-09 19:53:39 +00005414 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005415 unicode_freelist = NULL;
5416 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005418 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005419 for (i = 0; i < 256; i++)
5420 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421}
5422
5423/* Finalize the Unicode implementation */
5424
5425void
Thomas Wouters78890102000-07-22 19:25:51 +00005426_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005428 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005429 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005431 Py_XDECREF(unicode_empty);
5432 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005433
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005434 for (i = 0; i < 256; i++) {
5435 if (unicode_latin1[i]) {
5436 Py_DECREF(unicode_latin1[i]);
5437 unicode_latin1[i] = NULL;
5438 }
5439 }
5440
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005441 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 PyUnicodeObject *v = u;
5443 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005444 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005445 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005446 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005447 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005449 unicode_freelist = NULL;
5450 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451}