blob: 668d6e4d6886c6e282941bacbb6f83a5e5e8b908 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Anthony Baxterac6bd462006-04-13 02:06:09 +000086
87#ifdef __cplusplus
88extern "C" {
89#endif
90
Guido van Rossumd57fd912000-03-10 22:53:23 +000091/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000092static PyUnicodeObject *unicode_freelist;
93static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000094
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000095/* The empty Unicode object is shared to improve performance. */
96static PyUnicodeObject *unicode_empty;
97
98/* Single character Unicode strings in the Latin-1 range are being
99 shared as well. */
100static PyUnicodeObject *unicode_latin1[256];
101
Fred Drakee4315f52000-05-09 19:53:39 +0000102/* Default encoding to use and assume when NULL is passed as encoding
103 parameter; it is initialized by _PyUnicode_Init().
104
105 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000106 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000107
108*/
Fred Drakee4315f52000-05-09 19:53:39 +0000109static char unicode_default_encoding[100];
110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Guido van Rossumd57fd912000-03-10 22:53:23 +0000123/* --- Unicode Object ----------------------------------------------------- */
124
125static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000127 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000128{
129 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000130
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000131 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000132 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000133 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000134
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000135 /* Resizing shared object (unicode_empty or single character
136 objects) in-place is not allowed. Use PyUnicode_Resize()
137 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000138 if (unicode == unicode_empty ||
139 (unicode->length == 1 &&
140 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000152 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000157 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000179PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000228 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Martin v. Löwis18e16552006-02-15 17:27:45 +0000266int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject * variable to PyObject **.  Both arguments are
   parenthesized so that any expression can be passed safely. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000306 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000350 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000369 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000371 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
Martin v. Löwis18e16552006-02-15 17:27:45 +0000379Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 wchar_t *w,
381 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000387
388 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000390 size = PyUnicode_GET_SIZE(unicode) + 1;
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392#ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w, unicode->str, size * sizeof(wchar_t));
394#else
395 {
396 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000397 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000398 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000399 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 *w++ = *u++;
401 }
402#endif
403
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000404 if (size > PyUnicode_GET_SIZE(unicode))
405 return PyUnicode_GET_SIZE(unicode);
406 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000407 return size;
408}
409
410#endif
411
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000412PyObject *PyUnicode_FromOrdinal(int ordinal)
413{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000414 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000415
416#ifdef Py_UNICODE_WIDE
417 if (ordinal < 0 || ordinal > 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
421 return NULL;
422 }
423#else
424 if (ordinal < 0 || ordinal > 0xffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
428 return NULL;
429 }
430#endif
431
Hye-Shik Chang40574832004-04-06 07:24:51 +0000432 s[0] = (Py_UNICODE)ordinal;
433 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000434}
435
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436PyObject *PyUnicode_FromObject(register PyObject *obj)
437{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj)) {
441 Py_INCREF(obj);
442 return obj;
443 }
444 if (PyUnicode_Check(obj)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
448 PyUnicode_GET_SIZE(obj));
449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
451}
452
453PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
454 const char *encoding,
455 const char *errors)
456{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000457 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000458 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 if (obj == NULL) {
462 PyErr_BadInternalCall();
463 return NULL;
464 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466#if 0
467 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000468 that no encodings is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
470 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000471
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000472 NOTE: This API should really only be used for object which
473 represent *encoded* Unicode !
474
475 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000476 if (PyUnicode_Check(obj)) {
477 if (encoding) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#else
485 if (PyUnicode_Check(obj)) {
486 PyErr_SetString(PyExc_TypeError,
487 "decoding Unicode is not supported");
488 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000489 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000490#endif
491
492 /* Coerce object */
493 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 s = PyString_AS_STRING(obj);
495 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000497 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000502 "coercing to Unicode: need string or buffer, "
503 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000504 obj->ob_type->tp_name);
505 goto onError;
506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 if (len == 0) {
510 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000511 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000515
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520}
521
522PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000523 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000524 const char *encoding,
525 const char *errors)
526{
527 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528
529 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000530 encoding = PyUnicode_GetDefaultEncoding();
531
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000537#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s, size, errors);
540#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000541 else if (strcmp(encoding, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
544 /* Decode via the codec registry */
545 buffer = PyBuffer_FromMemory((void *)s, size);
546 if (buffer == NULL)
547 goto onError;
548 unicode = PyCodec_Decode(buffer, encoding, errors);
549 if (unicode == NULL)
550 goto onError;
551 if (!PyUnicode_Check(unicode)) {
552 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000553 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 unicode->ob_type->tp_name);
555 Py_DECREF(unicode);
556 goto onError;
557 }
558 Py_DECREF(buffer);
559 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000560
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 onError:
562 Py_XDECREF(buffer);
563 return NULL;
564}
565
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000566PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v;
571
572 if (!PyUnicode_Check(unicode)) {
573 PyErr_BadArgument();
574 goto onError;
575 }
576
577 if (encoding == NULL)
578 encoding = PyUnicode_GetDefaultEncoding();
579
580 /* Decode via the codec registry */
581 v = PyCodec_Decode(unicode, encoding, errors);
582 if (v == NULL)
583 goto onError;
584 return v;
585
586 onError:
587 return NULL;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000591 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592 const char *encoding,
593 const char *errors)
594{
595 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 unicode = PyUnicode_FromUnicode(s, size);
598 if (unicode == NULL)
599 return NULL;
600 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
601 Py_DECREF(unicode);
602 return v;
603}
604
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000605PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
606 const char *encoding,
607 const char *errors)
608{
609 PyObject *v;
610
611 if (!PyUnicode_Check(unicode)) {
612 PyErr_BadArgument();
613 goto onError;
614 }
615
616 if (encoding == NULL)
617 encoding = PyUnicode_GetDefaultEncoding();
618
619 /* Encode via the codec registry */
620 v = PyCodec_Encode(unicode, encoding, errors);
621 if (v == NULL)
622 goto onError;
623 return v;
624
625 onError:
626 return NULL;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
630 const char *encoding,
631 const char *errors)
632{
633 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 if (!PyUnicode_Check(unicode)) {
636 PyErr_BadArgument();
637 goto onError;
638 }
Fred Drakee4315f52000-05-09 19:53:39 +0000639
Tim Petersced69f82003-09-16 20:30:58 +0000640 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000641 encoding = PyUnicode_GetDefaultEncoding();
642
643 /* Shortcuts for common default encodings */
644 if (errors == NULL) {
645 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000646 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000649#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode);
652#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000653 else if (strcmp(encoding, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode);
655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656
657 /* Encode via the codec registry */
658 v = PyCodec_Encode(unicode, encoding, errors);
659 if (v == NULL)
660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 if (!PyString_Check(v)) {
662 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 v->ob_type->tp_name);
665 Py_DECREF(v);
666 goto onError;
667 }
668 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000669
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 onError:
671 return NULL;
672}
673
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000674PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
675 const char *errors)
676{
677 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
678
679 if (v)
680 return v;
681 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
682 if (v && errors == NULL)
683 ((PyUnicodeObject *)unicode)->defenc = v;
684 return v;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
688{
689 if (!PyUnicode_Check(unicode)) {
690 PyErr_BadArgument();
691 goto onError;
692 }
693 return PyUnicode_AS_UNICODE(unicode);
694
695 onError:
696 return NULL;
697}
698
Martin v. Löwis18e16552006-02-15 17:27:45 +0000699Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700{
701 if (!PyUnicode_Check(unicode)) {
702 PyErr_BadArgument();
703 goto onError;
704 }
705 return PyUnicode_GET_SIZE(unicode);
706
707 onError:
708 return -1;
709}
710
Thomas Wouters78890102000-07-22 19:25:51 +0000711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000712{
713 return unicode_default_encoding;
714}
715
716int PyUnicode_SetDefaultEncoding(const char *encoding)
717{
718 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000719
Fred Drakee4315f52000-05-09 19:53:39 +0000720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v = _PyCodec_Lookup(encoding);
723 if (v == NULL)
724 goto onError;
725 Py_DECREF(v);
726 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000727 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000728 sizeof(unicode_default_encoding));
729 return 0;
730
731 onError:
732 return -1;
733}
734
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000735/* error handling callback helper:
736 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000737 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000738 and adjust various state variables.
739 return 0 on success, -1 on error
740*/
741
742static
743int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
744 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000745 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
746 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000747{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000748 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000749
750 PyObject *restuple = NULL;
751 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
753 Py_ssize_t requiredsize;
754 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000755 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000756 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757 int res = -1;
758
759 if (*errorHandler == NULL) {
760 *errorHandler = PyCodec_LookupError(errors);
761 if (*errorHandler == NULL)
762 goto onError;
763 }
764
765 if (*exceptionObject == NULL) {
766 *exceptionObject = PyUnicodeDecodeError_Create(
767 encoding, input, insize, *startinpos, *endinpos, reason);
768 if (*exceptionObject == NULL)
769 goto onError;
770 }
771 else {
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
773 goto onError;
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
777 goto onError;
778 }
779
780 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
781 if (restuple == NULL)
782 goto onError;
783 if (!PyTuple_Check(restuple)) {
784 PyErr_Format(PyExc_TypeError, &argparse[4]);
785 goto onError;
786 }
787 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
788 goto onError;
789 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000790 newpos = insize+newpos;
791 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000792 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000793 goto onError;
794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr = PyUnicode_AS_UNICODE(repunicode);
801 repsize = PyUnicode_GET_SIZE(repunicode);
802 requiredsize = *outpos + repsize + insize-newpos;
803 if (requiredsize > outsize) {
804 if (requiredsize<2*outsize)
805 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000806 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807 goto onError;
808 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
809 }
810 *endinpos = newpos;
811 *inptr = input + newpos;
812 Py_UNICODE_COPY(*outptr, repptr, repsize);
813 *outptr += repsize;
814 *outpos += repsize;
815 /* we made it! */
816 res = 0;
817
818 onError:
819 Py_XDECREF(restuple);
820 return res;
821}
822
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000823/* --- UTF-7 Codec -------------------------------------------------------- */
824
825/* see RFC2152 for details */
826
/* Per-character classification for the UTF-7 codec, indexed by the
   7-bit character value.  It indicates whether a character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
845
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: outside 1..127, or classified in
   utf7_special as 1, as 2 with encodeWS set, or as 3 with encodeO set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a member of the base64 alphabet.  Spelled out as explicit
   ASCII ranges instead of isalnum(): isalnum() is locale-dependent and
   undefined for arguments that are not representable as unsigned char,
   and this macro is applied to arbitrary Py_UNICODE values. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(); only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for as long as at least 6 bits are
   pending in ch; bits is decremented accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Extract 16-bit units from ch for as long as at least 16 bits are
   pending.  Expects `errmsg` and a label `utf7Error` at the point of
   expansion. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000888
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000890 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000891 const char *errors)
892{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000893 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000894 Py_ssize_t startinpos;
895 Py_ssize_t endinpos;
896 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000897 const char *e;
898 PyUnicodeObject *unicode;
899 Py_UNICODE *p;
900 const char *errmsg = "";
901 int inShift = 0;
902 unsigned int bitsleft = 0;
903 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 int surrogate = 0;
905 PyObject *errorHandler = NULL;
906 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 unicode = _PyUnicode_New(size);
909 if (!unicode)
910 return NULL;
911 if (size == 0)
912 return (PyObject *)unicode;
913
914 p = unicode->str;
915 e = s + size;
916
917 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000918 Py_UNICODE ch;
919 restart:
920 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921
922 if (inShift) {
923 if ((ch == '-') || !B64CHAR(ch)) {
924 inShift = 0;
925 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000926
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000927 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
928 if (bitsleft >= 6) {
929 /* The shift sequence has a partial character in it. If
930 bitsleft < 6 then we could just classify it as padding
931 but that is not the case here */
932
933 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000934 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 }
936 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000937 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000938 here so indicate the potential of a misencoded character. */
939
940 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
941 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
942 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000943 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 }
945
946 if (ch == '-') {
947 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000948 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 inShift = 1;
950 }
951 } else if (SPECIAL(ch,0,0)) {
952 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000953 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 } else {
955 *p++ = ch;
956 }
957 } else {
958 charsleft = (charsleft << 6) | UB64(ch);
959 bitsleft += 6;
960 s++;
961 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
962 }
963 }
964 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000965 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 s++;
967 if (s < e && *s == '-') {
968 s++;
969 *p++ = '+';
970 } else
971 {
972 inShift = 1;
973 bitsleft = 0;
974 }
975 }
976 else if (SPECIAL(ch,0,0)) {
977 errmsg = "unexpected special character";
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 }
981 else {
982 *p++ = ch;
983 s++;
984 }
985 continue;
986 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000987 outpos = p-PyUnicode_AS_UNICODE(unicode);
988 endinpos = s-starts;
989 if (unicode_decode_call_errorhandler(
990 errors, &errorHandler,
991 "utf7", errmsg,
992 starts, size, &startinpos, &endinpos, &exc, &s,
993 (PyObject **)&unicode, &outpos, &p))
994 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000995 }
996
997 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000998 outpos = p-PyUnicode_AS_UNICODE(unicode);
999 endinpos = size;
1000 if (unicode_decode_call_errorhandler(
1001 errors, &errorHandler,
1002 "utf7", "unterminated shift sequence",
1003 starts, size, &startinpos, &endinpos, &exc, &s,
1004 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001006 if (s < e)
1007 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 }
1009
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001010 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 goto onError;
1012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001013 Py_XDECREF(errorHandler);
1014 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 return (PyObject *)unicode;
1016
1017onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 Py_XDECREF(errorHandler);
1019 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024
1025PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001026 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001027 int encodeSetO,
1028 int encodeWhiteSpace,
1029 const char *errors)
1030{
1031 PyObject *v;
1032 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001033 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001034 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001035 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001036 unsigned int bitsleft = 0;
1037 unsigned long charsleft = 0;
1038 char * out;
1039 char * start;
1040
1041 if (size == 0)
1042 return PyString_FromStringAndSize(NULL, 0);
1043
1044 v = PyString_FromStringAndSize(NULL, cbAllocated);
1045 if (v == NULL)
1046 return NULL;
1047
1048 start = out = PyString_AS_STRING(v);
1049 for (;i < size; ++i) {
1050 Py_UNICODE ch = s[i];
1051
1052 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001053 if (ch == '+') {
1054 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001055 *out++ = '-';
1056 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1057 charsleft = ch;
1058 bitsleft = 16;
1059 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001060 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001062 } else {
1063 *out++ = (char) ch;
1064 }
1065 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1067 *out++ = B64(charsleft << (6-bitsleft));
1068 charsleft = 0;
1069 bitsleft = 0;
1070 /* Characters not in the BASE64 set implicitly unshift the sequence
1071 so no '-' is required, except if the character is itself a '-' */
1072 if (B64CHAR(ch) || ch == '-') {
1073 *out++ = '-';
1074 }
1075 inShift = 0;
1076 *out++ = (char) ch;
1077 } else {
1078 bitsleft += 16;
1079 charsleft = (charsleft << 16) | ch;
1080 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1081
1082 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001083 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001084 or '-' then the shift sequence will be terminated implicitly and we
1085 don't have to insert a '-'. */
1086
1087 if (bitsleft == 0) {
1088 if (i + 1 < size) {
1089 Py_UNICODE ch2 = s[i+1];
1090
1091 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001092
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 } else if (B64CHAR(ch2) || ch2 == '-') {
1094 *out++ = '-';
1095 inShift = 0;
1096 } else {
1097 inShift = 0;
1098 }
1099
1100 }
1101 else {
1102 *out++ = '-';
1103 inShift = 0;
1104 }
1105 }
Tim Petersced69f82003-09-16 20:30:58 +00001106 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001107 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 if (bitsleft) {
1110 *out++= B64(charsleft << (6-bitsleft) );
1111 *out++ = '-';
1112 }
1113
Tim Peters5de98422002-04-27 18:44:32 +00001114 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001115 return v;
1116}
1117
/* The UTF-7 helper macros are not needed past this point. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1124
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125/* --- UTF-8 Codec -------------------------------------------------------- */
1126
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7f: length 1 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: length 2 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: length 3 */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: lengths 4, 5, 6; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1148
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001150 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 const char *errors)
1152{
Walter Dörwald69652032004-09-07 20:24:22 +00001153 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1154}
1155
1156PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001157 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001158 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001159 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001161 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001163 Py_ssize_t startinpos;
1164 Py_ssize_t endinpos;
1165 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 const char *e;
1167 PyUnicodeObject *unicode;
1168 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001169 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001170 PyObject *errorHandler = NULL;
1171 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172
1173 /* Note: size will always be longer than the resulting Unicode
1174 character count */
1175 unicode = _PyUnicode_New(size);
1176 if (!unicode)
1177 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001178 if (size == 0) {
1179 if (consumed)
1180 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183
1184 /* Unpack UTF-8 encoded data */
1185 p = unicode->str;
1186 e = s + size;
1187
1188 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001192 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 s++;
1194 continue;
1195 }
1196
1197 n = utf8_code_length[ch];
1198
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001200 if (consumed)
1201 break;
1202 else {
1203 errmsg = "unexpected end of data";
1204 startinpos = s-starts;
1205 endinpos = size;
1206 goto utf8Error;
1207 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 switch (n) {
1211
1212 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001213 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 startinpos = s-starts;
1215 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001216 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220 startinpos = s-starts;
1221 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223
1224 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001225 if ((s[1] & 0xc0) != 0x80) {
1226 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 startinpos = s-starts;
1228 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001229 goto utf8Error;
1230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001232 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001233 startinpos = s-starts;
1234 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 errmsg = "illegal encoding";
1236 goto utf8Error;
1237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001239 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 break;
1241
1242 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001243 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001244 (s[2] & 0xc0) != 0x80) {
1245 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001246 startinpos = s-starts;
1247 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001248 goto utf8Error;
1249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001251 if (ch < 0x0800) {
1252 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001253 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254
1255 XXX For wide builds (UCS-4) we should probably try
1256 to recombine the surrogates into a single code
1257 unit.
1258 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260 startinpos = s-starts;
1261 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001262 goto utf8Error;
1263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001265 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 break;
1267
1268 case 4:
1269 if ((s[1] & 0xc0) != 0x80 ||
1270 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 (s[3] & 0xc0) != 0x80) {
1272 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
1276 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001277 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1278 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1279 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001280 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001281 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001282 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001283 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001284 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
1289 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001290#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001291 *p++ = (Py_UNICODE)ch;
1292#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001294
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001295 /* translate from 10000..10FFFF to 0..FFFF */
1296 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001297
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001298 /* high surrogate = top 10 bits added to D800 */
1299 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001300
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001301 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001302 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001303#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 break;
1305
1306 default:
1307 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309 startinpos = s-starts;
1310 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001311 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 }
1313 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 outpos = p-PyUnicode_AS_UNICODE(unicode);
1318 if (unicode_decode_call_errorhandler(
1319 errors, &errorHandler,
1320 "utf8", errmsg,
1321 starts, size, &startinpos, &endinpos, &exc, &s,
1322 (PyObject **)&unicode, &outpos, &p))
1323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 }
Walter Dörwald69652032004-09-07 20:24:22 +00001325 if (consumed)
1326 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327
1328 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001329 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 goto onError;
1331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 Py_XDECREF(errorHandler);
1333 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 return (PyObject *)unicode;
1335
1336onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 Py_XDECREF(errorHandler);
1338 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339 Py_DECREF(unicode);
1340 return NULL;
1341}
1342
Tim Peters602f7402002-04-27 18:03:26 +00001343/* Allocation strategy: if the string is short, convert into a stack buffer
1344 and allocate exactly as much space needed at the end. Else allocate the
1345 maximum possible needed (4 result bytes per Unicode character), and return
1346 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001347*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001348PyObject *
1349PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001350 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352{
Tim Peters602f7402002-04-27 18:03:26 +00001353#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001354
Martin v. Löwis18e16552006-02-15 17:27:45 +00001355 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001356 PyObject *v; /* result string object */
1357 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001358 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001359 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001360 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001361
Tim Peters602f7402002-04-27 18:03:26 +00001362 assert(s != NULL);
1363 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364
Tim Peters602f7402002-04-27 18:03:26 +00001365 if (size <= MAX_SHORT_UNICHARS) {
1366 /* Write into the stack buffer; nallocated can't overflow.
1367 * At the end, we'll allocate exactly as much heap space as it
1368 * turns out we need.
1369 */
1370 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1371 v = NULL; /* will allocate after we're done */
1372 p = stackbuf;
1373 }
1374 else {
1375 /* Overallocate on the heap, and give the excess back at the end. */
1376 nallocated = size * 4;
1377 if (nallocated / 4 != size) /* overflow! */
1378 return PyErr_NoMemory();
1379 v = PyString_FromStringAndSize(NULL, nallocated);
1380 if (v == NULL)
1381 return NULL;
1382 p = PyString_AS_STRING(v);
1383 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001384
Tim Peters602f7402002-04-27 18:03:26 +00001385 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001386 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001387
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001388 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001389 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001391
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001393 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001394 *p++ = (char)(0xc0 | (ch >> 6));
1395 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001396 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001397 else {
Tim Peters602f7402002-04-27 18:03:26 +00001398 /* Encode UCS2 Unicode ordinals */
1399 if (ch < 0x10000) {
1400 /* Special case: check for high surrogate */
1401 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1402 Py_UCS4 ch2 = s[i];
1403 /* Check for low surrogate and combine the two to
1404 form a UCS4 value */
1405 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001406 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001407 i++;
1408 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001409 }
Tim Peters602f7402002-04-27 18:03:26 +00001410 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414 *p++ = (char)(0x80 | (ch & 0x3f));
1415 continue;
1416 }
1417encodeUCS4:
1418 /* Encode UCS4 Unicode ordinals */
1419 *p++ = (char)(0xf0 | (ch >> 18));
1420 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1421 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1422 *p++ = (char)(0x80 | (ch & 0x3f));
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001425
Tim Peters602f7402002-04-27 18:03:26 +00001426 if (v == NULL) {
1427 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001428 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001429 assert(nneeded <= nallocated);
1430 v = PyString_FromStringAndSize(stackbuf, nneeded);
1431 }
1432 else {
1433 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001434 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001435 assert(nneeded <= nallocated);
1436 _PyString_Resize(&v, nneeded);
1437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001439
Tim Peters602f7402002-04-27 18:03:26 +00001440#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441}
1442
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1444{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 if (!PyUnicode_Check(unicode)) {
1446 PyErr_BadArgument();
1447 return NULL;
1448 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001449 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1450 PyUnicode_GET_SIZE(unicode),
1451 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
1454/* --- UTF-16 Codec ------------------------------------------------------- */
1455
Tim Peters772747b2001-08-09 22:21:55 +00001456PyObject *
1457PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001458 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001459 const char *errors,
1460 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461{
Walter Dörwald69652032004-09-07 20:24:22 +00001462 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1463}
1464
1465PyObject *
1466PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001467 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001468 const char *errors,
1469 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001470 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001471{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001473 Py_ssize_t startinpos;
1474 Py_ssize_t endinpos;
1475 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476 PyUnicodeObject *unicode;
1477 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001478 const unsigned char *q, *e;
1479 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001481 /* Offsets from q for retrieving byte pairs in the right order. */
1482#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1483 int ihi = 1, ilo = 0;
1484#else
1485 int ihi = 0, ilo = 1;
1486#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001487 PyObject *errorHandler = NULL;
1488 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489
1490 /* Note: size will always be longer than the resulting Unicode
1491 character count */
1492 unicode = _PyUnicode_New(size);
1493 if (!unicode)
1494 return NULL;
1495 if (size == 0)
1496 return (PyObject *)unicode;
1497
1498 /* Unpack UTF-16 encoded data */
1499 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001500 q = (unsigned char *)s;
1501 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502
1503 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001504 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001506 /* Check for BOM marks (U+FEFF) in the input and adjust current
1507 byte order setting accordingly. In native mode, the leading BOM
1508 mark is skipped, in all other modes, it is copied to the output
1509 stream as-is (giving a ZWNBSP character). */
1510 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001511 if (size >= 2) {
1512 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001513#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001514 if (bom == 0xFEFF) {
1515 q += 2;
1516 bo = -1;
1517 }
1518 else if (bom == 0xFFFE) {
1519 q += 2;
1520 bo = 1;
1521 }
Tim Petersced69f82003-09-16 20:30:58 +00001522#else
Walter Dörwald69652032004-09-07 20:24:22 +00001523 if (bom == 0xFEFF) {
1524 q += 2;
1525 bo = 1;
1526 }
1527 else if (bom == 0xFFFE) {
1528 q += 2;
1529 bo = -1;
1530 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001531#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001532 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534
Tim Peters772747b2001-08-09 22:21:55 +00001535 if (bo == -1) {
1536 /* force LE */
1537 ihi = 1;
1538 ilo = 0;
1539 }
1540 else if (bo == 1) {
1541 /* force BE */
1542 ihi = 0;
1543 ilo = 1;
1544 }
1545
1546 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001548 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001550 if (consumed)
1551 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 errmsg = "truncated data";
1553 startinpos = ((const char *)q)-starts;
1554 endinpos = ((const char *)e)-starts;
1555 goto utf16Error;
1556 /* The remaining input chars are ignored if the callback
1557 chooses to skip the input */
1558 }
1559 ch = (q[ihi] << 8) | q[ilo];
1560
Tim Peters772747b2001-08-09 22:21:55 +00001561 q += 2;
1562
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (ch < 0xD800 || ch > 0xDFFF) {
1564 *p++ = ch;
1565 continue;
1566 }
1567
1568 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001569 if (q >= e) {
1570 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001571 startinpos = (((const char *)q)-2)-starts;
1572 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001573 goto utf16Error;
1574 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001575 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001576 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1577 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001578 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001579#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001580 *p++ = ch;
1581 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001582#else
1583 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001584#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001585 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001586 }
1587 else {
1588 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 startinpos = (((const char *)q)-4)-starts;
1590 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001591 goto utf16Error;
1592 }
1593
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001595 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 startinpos = (((const char *)q)-2)-starts;
1597 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001598 /* Fall through to report the error */
1599
1600 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001601 outpos = p-PyUnicode_AS_UNICODE(unicode);
1602 if (unicode_decode_call_errorhandler(
1603 errors, &errorHandler,
1604 "utf16", errmsg,
1605 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1606 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001607 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 }
1609
1610 if (byteorder)
1611 *byteorder = bo;
1612
Walter Dörwald69652032004-09-07 20:24:22 +00001613 if (consumed)
1614 *consumed = (const char *)q-starts;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001617 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 goto onError;
1619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_XDECREF(errorHandler);
1621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 return (PyObject *)unicode;
1623
1624onError:
1625 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 return NULL;
1629}
1630
Tim Peters772747b2001-08-09 22:21:55 +00001631PyObject *
1632PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001633 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001634 const char *errors,
1635 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636{
1637 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001638 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001639#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001640 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001641#else
1642 const int pairs = 0;
1643#endif
Tim Peters772747b2001-08-09 22:21:55 +00001644 /* Offsets from p for storing byte pairs in the right order. */
1645#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1646 int ihi = 1, ilo = 0;
1647#else
1648 int ihi = 0, ilo = 1;
1649#endif
1650
1651#define STORECHAR(CH) \
1652 do { \
1653 p[ihi] = ((CH) >> 8) & 0xff; \
1654 p[ilo] = (CH) & 0xff; \
1655 p += 2; \
1656 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001658#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001659 for (i = pairs = 0; i < size; i++)
1660 if (s[i] >= 0x10000)
1661 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001662#endif
Tim Petersced69f82003-09-16 20:30:58 +00001663 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001664 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 if (v == NULL)
1666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667
Tim Peters772747b2001-08-09 22:21:55 +00001668 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001670 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001671 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001672 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001673
1674 if (byteorder == -1) {
1675 /* force LE */
1676 ihi = 1;
1677 ilo = 0;
1678 }
1679 else if (byteorder == 1) {
1680 /* force BE */
1681 ihi = 0;
1682 ilo = 1;
1683 }
1684
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001685 while (size-- > 0) {
1686 Py_UNICODE ch = *s++;
1687 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001688#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001689 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001690 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1691 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001693#endif
Tim Peters772747b2001-08-09 22:21:55 +00001694 STORECHAR(ch);
1695 if (ch2)
1696 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001699#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700}
1701
1702PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1703{
1704 if (!PyUnicode_Check(unicode)) {
1705 PyErr_BadArgument();
1706 return NULL;
1707 }
1708 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1709 PyUnicode_GET_SIZE(unicode),
1710 NULL,
1711 0);
1712}
1713
1714/* --- Unicode Escape Codec ----------------------------------------------- */
1715
Fredrik Lundh06d12682001-01-24 07:59:11 +00001716static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001717
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001719 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 const char *errors)
1721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001723 Py_ssize_t startinpos;
1724 Py_ssize_t endinpos;
1725 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001726 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001730 char* message;
1731 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001732 PyObject *errorHandler = NULL;
1733 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001734
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 /* Escaped strings will always be longer than the resulting
1736 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 length after conversion to the true value.
1738 (but if the error callback returns a long replacement string
1739 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 v = _PyUnicode_New(size);
1741 if (v == NULL)
1742 goto onError;
1743 if (size == 0)
1744 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001748
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 while (s < end) {
1750 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001751 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 /* Non-escape characters are interpreted as Unicode ordinals */
1755 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001756 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 continue;
1758 }
1759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 /* \ - Escapes */
1762 s++;
1763 switch (*s++) {
1764
1765 /* \x escapes */
1766 case '\n': break;
1767 case '\\': *p++ = '\\'; break;
1768 case '\'': *p++ = '\''; break;
1769 case '\"': *p++ = '\"'; break;
1770 case 'b': *p++ = '\b'; break;
1771 case 'f': *p++ = '\014'; break; /* FF */
1772 case 't': *p++ = '\t'; break;
1773 case 'n': *p++ = '\n'; break;
1774 case 'r': *p++ = '\r'; break;
1775 case 'v': *p++ = '\013'; break; /* VT */
1776 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1777
1778 /* \OOO (octal) escapes */
1779 case '0': case '1': case '2': case '3':
1780 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001781 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001783 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001785 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001787 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 break;
1789
Fredrik Lundhccc74732001-02-18 22:13:49 +00001790 /* hex escapes */
1791 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001793 digits = 2;
1794 message = "truncated \\xXX escape";
1795 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799 digits = 4;
1800 message = "truncated \\uXXXX escape";
1801 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001804 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805 digits = 8;
1806 message = "truncated \\UXXXXXXXX escape";
1807 hexescape:
1808 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 outpos = p-PyUnicode_AS_UNICODE(v);
1810 if (s+digits>end) {
1811 endinpos = size;
1812 if (unicode_decode_call_errorhandler(
1813 errors, &errorHandler,
1814 "unicodeescape", "end of string in escape sequence",
1815 starts, size, &startinpos, &endinpos, &exc, &s,
1816 (PyObject **)&v, &outpos, &p))
1817 goto onError;
1818 goto nextByte;
1819 }
1820 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001821 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001822 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 endinpos = (s+i+1)-starts;
1824 if (unicode_decode_call_errorhandler(
1825 errors, &errorHandler,
1826 "unicodeescape", message,
1827 starts, size, &startinpos, &endinpos, &exc, &s,
1828 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001829 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001831 }
1832 chr = (chr<<4) & ~0xF;
1833 if (c >= '0' && c <= '9')
1834 chr += c - '0';
1835 else if (c >= 'a' && c <= 'f')
1836 chr += 10 + c - 'a';
1837 else
1838 chr += 10 + c - 'A';
1839 }
1840 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001841 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 /* _decoding_error will have already written into the
1843 target buffer. */
1844 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001845 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001846 /* when we get here, chr is a 32-bit unicode character */
1847 if (chr <= 0xffff)
1848 /* UCS-2 character */
1849 *p++ = (Py_UNICODE) chr;
1850 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001851 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001852 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001853#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001854 *p++ = chr;
1855#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001856 chr -= 0x10000L;
1857 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001858 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001859#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001860 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001861 endinpos = s-starts;
1862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (unicode_decode_call_errorhandler(
1864 errors, &errorHandler,
1865 "unicodeescape", "illegal Unicode character",
1866 starts, size, &startinpos, &endinpos, &exc, &s,
1867 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001868 goto onError;
1869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001870 break;
1871
1872 /* \N{name} */
1873 case 'N':
1874 message = "malformed \\N character escape";
1875 if (ucnhash_CAPI == NULL) {
1876 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001877 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001878 m = PyImport_ImportModule("unicodedata");
1879 if (m == NULL)
1880 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001881 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001882 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001883 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001884 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001885 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001886 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001887 if (ucnhash_CAPI == NULL)
1888 goto ucnhashError;
1889 }
1890 if (*s == '{') {
1891 const char *start = s+1;
1892 /* look for the closing brace */
1893 while (*s != '}' && s < end)
1894 s++;
1895 if (s > start && s < end && *s == '}') {
1896 /* found a name. look it up in the unicode database */
1897 message = "unknown Unicode character name";
1898 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001899 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001900 goto store;
1901 }
1902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001903 endinpos = s-starts;
1904 outpos = p-PyUnicode_AS_UNICODE(v);
1905 if (unicode_decode_call_errorhandler(
1906 errors, &errorHandler,
1907 "unicodeescape", message,
1908 starts, size, &startinpos, &endinpos, &exc, &s,
1909 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 break;
1912
1913 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001914 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 message = "\\ at end of string";
1916 s--;
1917 endinpos = s-starts;
1918 outpos = p-PyUnicode_AS_UNICODE(v);
1919 if (unicode_decode_call_errorhandler(
1920 errors, &errorHandler,
1921 "unicodeescape", message,
1922 starts, size, &startinpos, &endinpos, &exc, &s,
1923 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001924 goto onError;
1925 }
1926 else {
1927 *p++ = '\\';
1928 *p++ = (unsigned char)s[-1];
1929 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001930 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 nextByte:
1933 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001935 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001936 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001937 Py_XDECREF(errorHandler);
1938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001940
Fredrik Lundhccc74732001-02-18 22:13:49 +00001941ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001942 PyErr_SetString(
1943 PyExc_UnicodeError,
1944 "\\N escapes not supported (can't load unicodedata module)"
1945 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001946 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 Py_XDECREF(errorHandler);
1948 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001949 return NULL;
1950
Fredrik Lundhccc74732001-02-18 22:13:49 +00001951onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001953 Py_XDECREF(errorHandler);
1954 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 return NULL;
1956}
1957
1958/* Return a Unicode-Escape string version of the Unicode object.
1959
1960 If quotes is true, the string is enclosed in u"" or u'' quotes as
1961 appropriate.
1962
1963*/
1964
Barry Warsaw51ac5802000-03-20 16:36:48 +00001965static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001966 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00001967 Py_UNICODE ch);
1968
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969static
1970PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001971 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 int quotes)
1973{
1974 PyObject *repr;
1975 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001977 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978
1979 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1980 if (repr == NULL)
1981 return NULL;
1982
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001983 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984
1985 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001987 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 !findchar(s, size, '"')) ? '"' : '\'';
1989 }
1990 while (size-- > 0) {
1991 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001992
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001993 /* Escape quotes and backslashes */
1994 if ((quotes &&
1995 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996 *p++ = '\\';
1997 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001998 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001999 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002000
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002001#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002002 /* Map 21-bit characters to '\U00xxxxxx' */
2003 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002004 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002005
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002006 /* Resize the string if necessary */
2007 if (offset + 12 > PyString_GET_SIZE(repr)) {
2008 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002009 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002010 p = PyString_AS_STRING(repr) + offset;
2011 }
2012
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002013 *p++ = '\\';
2014 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002015 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2016 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2021 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002022 *p++ = hexdigit[ch & 0x0000000F];
2023 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002024 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002025#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002026 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2027 else if (ch >= 0xD800 && ch < 0xDC00) {
2028 Py_UNICODE ch2;
2029 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002030
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002031 ch2 = *s++;
2032 size--;
2033 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2034 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2035 *p++ = '\\';
2036 *p++ = 'U';
2037 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2038 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2043 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2044 *p++ = hexdigit[ucs & 0x0000000F];
2045 continue;
2046 }
2047 /* Fall through: isolated surrogates are copied as-is */
2048 s--;
2049 size++;
2050 }
2051
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002053 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 *p++ = '\\';
2055 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002056 *p++ = hexdigit[(ch >> 12) & 0x000F];
2057 *p++ = hexdigit[(ch >> 8) & 0x000F];
2058 *p++ = hexdigit[(ch >> 4) & 0x000F];
2059 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002061
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002062 /* Map special whitespace to '\t', \n', '\r' */
2063 else if (ch == '\t') {
2064 *p++ = '\\';
2065 *p++ = 't';
2066 }
2067 else if (ch == '\n') {
2068 *p++ = '\\';
2069 *p++ = 'n';
2070 }
2071 else if (ch == '\r') {
2072 *p++ = '\\';
2073 *p++ = 'r';
2074 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002075
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002076 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002077 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002079 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002080 *p++ = hexdigit[(ch >> 4) & 0x000F];
2081 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002082 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002083
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 /* Copy everything else as-is */
2085 else
2086 *p++ = (char) ch;
2087 }
2088 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002089 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090
2091 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002092 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 return repr;
2094}
2095
2096PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002097 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098{
2099 return unicodeescape_string(s, size, 0);
2100}
2101
2102PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2103{
2104 if (!PyUnicode_Check(unicode)) {
2105 PyErr_BadArgument();
2106 return NULL;
2107 }
2108 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2109 PyUnicode_GET_SIZE(unicode));
2110}
2111
2112/* --- Raw Unicode Escape Codec ------------------------------------------- */
2113
2114PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002115 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 const char *errors)
2117{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002118 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002119 Py_ssize_t startinpos;
2120 Py_ssize_t endinpos;
2121 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002123 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 const char *end;
2125 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 PyObject *errorHandler = NULL;
2127 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002128
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 /* Escaped strings will always be longer than the resulting
2130 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 length after conversion to the true value. (But decoding error
2132 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 v = _PyUnicode_New(size);
2134 if (v == NULL)
2135 goto onError;
2136 if (size == 0)
2137 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 end = s + size;
2140 while (s < end) {
2141 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002142 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002144 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145
2146 /* Non-escape characters are interpreted as Unicode ordinals */
2147 if (*s != '\\') {
2148 *p++ = (unsigned char)*s++;
2149 continue;
2150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152
2153 /* \u-escapes are only interpreted iff the number of leading
2154 backslashes if odd */
2155 bs = s;
2156 for (;s < end;) {
2157 if (*s != '\\')
2158 break;
2159 *p++ = (unsigned char)*s++;
2160 }
2161 if (((s - bs) & 1) == 0 ||
2162 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002163 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 continue;
2165 }
2166 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002167 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 s++;
2169
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002170 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002172 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002173 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002175 endinpos = s-starts;
2176 if (unicode_decode_call_errorhandler(
2177 errors, &errorHandler,
2178 "rawunicodeescape", "truncated \\uXXXX",
2179 starts, size, &startinpos, &endinpos, &exc, &s,
2180 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002182 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 }
2184 x = (x<<4) & ~0xF;
2185 if (c >= '0' && c <= '9')
2186 x += c - '0';
2187 else if (c >= 'a' && c <= 'f')
2188 x += 10 + c - 'a';
2189 else
2190 x += 10 + c - 'A';
2191 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002192#ifndef Py_UNICODE_WIDE
2193 if (x > 0x10000) {
2194 if (unicode_decode_call_errorhandler(
2195 errors, &errorHandler,
2196 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2197 starts, size, &startinpos, &endinpos, &exc, &s,
2198 (PyObject **)&v, &outpos, &p))
2199 goto onError;
2200 }
2201#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 *p++ = x;
2203 nextByte:
2204 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002206 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002207 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002208 Py_XDECREF(errorHandler);
2209 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002211
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 onError:
2213 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002214 Py_XDECREF(errorHandler);
2215 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 return NULL;
2217}
2218
2219PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002220 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221{
2222 PyObject *repr;
2223 char *p;
2224 char *q;
2225
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002226 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002228#ifdef Py_UNICODE_WIDE
2229 repr = PyString_FromStringAndSize(NULL, 10 * size);
2230#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002232#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 if (repr == NULL)
2234 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002235 if (size == 0)
2236 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237
2238 p = q = PyString_AS_STRING(repr);
2239 while (size-- > 0) {
2240 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002241#ifdef Py_UNICODE_WIDE
2242 /* Map 32-bit characters to '\Uxxxxxxxx' */
2243 if (ch >= 0x10000) {
2244 *p++ = '\\';
2245 *p++ = 'U';
2246 *p++ = hexdigit[(ch >> 28) & 0xf];
2247 *p++ = hexdigit[(ch >> 24) & 0xf];
2248 *p++ = hexdigit[(ch >> 20) & 0xf];
2249 *p++ = hexdigit[(ch >> 16) & 0xf];
2250 *p++ = hexdigit[(ch >> 12) & 0xf];
2251 *p++ = hexdigit[(ch >> 8) & 0xf];
2252 *p++ = hexdigit[(ch >> 4) & 0xf];
2253 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002254 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002255 else
2256#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 /* Map 16-bit characters to '\uxxxx' */
2258 if (ch >= 256) {
2259 *p++ = '\\';
2260 *p++ = 'u';
2261 *p++ = hexdigit[(ch >> 12) & 0xf];
2262 *p++ = hexdigit[(ch >> 8) & 0xf];
2263 *p++ = hexdigit[(ch >> 4) & 0xf];
2264 *p++ = hexdigit[ch & 15];
2265 }
2266 /* Copy everything else as-is */
2267 else
2268 *p++ = (char) ch;
2269 }
2270 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002271 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 return repr;
2273}
2274
2275PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2276{
2277 if (!PyUnicode_Check(unicode)) {
2278 PyErr_BadArgument();
2279 return NULL;
2280 }
2281 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2282 PyUnicode_GET_SIZE(unicode));
2283}
2284
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002285/* --- Unicode Internal Codec ------------------------------------------- */
2286
2287PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002288 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002289 const char *errors)
2290{
2291 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002292 Py_ssize_t startinpos;
2293 Py_ssize_t endinpos;
2294 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002295 PyUnicodeObject *v;
2296 Py_UNICODE *p;
2297 const char *end;
2298 const char *reason;
2299 PyObject *errorHandler = NULL;
2300 PyObject *exc = NULL;
2301
Neal Norwitzd43069c2006-01-08 01:12:10 +00002302#ifdef Py_UNICODE_WIDE
2303 Py_UNICODE unimax = PyUnicode_GetMax();
2304#endif
2305
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002306 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2307 if (v == NULL)
2308 goto onError;
2309 if (PyUnicode_GetSize((PyObject *)v) == 0)
2310 return (PyObject *)v;
2311 p = PyUnicode_AS_UNICODE(v);
2312 end = s + size;
2313
2314 while (s < end) {
2315 *p = *(Py_UNICODE *)s;
2316 /* We have to sanity check the raw data, otherwise doom looms for
2317 some malformed UCS-4 data. */
2318 if (
2319 #ifdef Py_UNICODE_WIDE
2320 *p > unimax || *p < 0 ||
2321 #endif
2322 end-s < Py_UNICODE_SIZE
2323 )
2324 {
2325 startinpos = s - starts;
2326 if (end-s < Py_UNICODE_SIZE) {
2327 endinpos = end-starts;
2328 reason = "truncated input";
2329 }
2330 else {
2331 endinpos = s - starts + Py_UNICODE_SIZE;
2332 reason = "illegal code point (> 0x10FFFF)";
2333 }
2334 outpos = p - PyUnicode_AS_UNICODE(v);
2335 if (unicode_decode_call_errorhandler(
2336 errors, &errorHandler,
2337 "unicode_internal", reason,
2338 starts, size, &startinpos, &endinpos, &exc, &s,
2339 (PyObject **)&v, &outpos, &p)) {
2340 goto onError;
2341 }
2342 }
2343 else {
2344 p++;
2345 s += Py_UNICODE_SIZE;
2346 }
2347 }
2348
Martin v. Löwis412fb672006-04-13 06:34:32 +00002349 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002350 goto onError;
2351 Py_XDECREF(errorHandler);
2352 Py_XDECREF(exc);
2353 return (PyObject *)v;
2354
2355 onError:
2356 Py_XDECREF(v);
2357 Py_XDECREF(errorHandler);
2358 Py_XDECREF(exc);
2359 return NULL;
2360}
2361
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362/* --- Latin-1 Codec ------------------------------------------------------ */
2363
2364PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002365 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 const char *errors)
2367{
2368 PyUnicodeObject *v;
2369 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002370
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002372 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002373 Py_UNICODE r = *(unsigned char*)s;
2374 return PyUnicode_FromUnicode(&r, 1);
2375 }
2376
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 v = _PyUnicode_New(size);
2378 if (v == NULL)
2379 goto onError;
2380 if (size == 0)
2381 return (PyObject *)v;
2382 p = PyUnicode_AS_UNICODE(v);
2383 while (size-- > 0)
2384 *p++ = (unsigned char)*s++;
2385 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002386
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 onError:
2388 Py_XDECREF(v);
2389 return NULL;
2390}
2391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002392/* create or adjust a UnicodeEncodeError */
2393static void make_encode_exception(PyObject **exceptionObject,
2394 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002395 const Py_UNICODE *unicode, Py_ssize_t size,
2396 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002397 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002399 if (*exceptionObject == NULL) {
2400 *exceptionObject = PyUnicodeEncodeError_Create(
2401 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 }
2403 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002404 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2405 goto onError;
2406 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2407 goto onError;
2408 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2409 goto onError;
2410 return;
2411 onError:
2412 Py_DECREF(*exceptionObject);
2413 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414 }
2415}
2416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002417/* raises a UnicodeEncodeError */
2418static void raise_encode_exception(PyObject **exceptionObject,
2419 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002420 const Py_UNICODE *unicode, Py_ssize_t size,
2421 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002422 const char *reason)
2423{
2424 make_encode_exception(exceptionObject,
2425 encoding, unicode, size, startpos, endpos, reason);
2426 if (*exceptionObject != NULL)
2427 PyCodec_StrictErrors(*exceptionObject);
2428}
2429
2430/* error handling callback helper:
2431 build arguments, call the callback and check the arguments,
2432 put the result into newpos and return the replacement string, which
2433 has to be freed by the caller */
2434static PyObject *unicode_encode_call_errorhandler(const char *errors,
2435 PyObject **errorHandler,
2436 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002437 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2438 Py_ssize_t startpos, Py_ssize_t endpos,
2439 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002441 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002442
2443 PyObject *restuple;
2444 PyObject *resunicode;
2445
2446 if (*errorHandler == NULL) {
2447 *errorHandler = PyCodec_LookupError(errors);
2448 if (*errorHandler == NULL)
2449 return NULL;
2450 }
2451
2452 make_encode_exception(exceptionObject,
2453 encoding, unicode, size, startpos, endpos, reason);
2454 if (*exceptionObject == NULL)
2455 return NULL;
2456
2457 restuple = PyObject_CallFunctionObjArgs(
2458 *errorHandler, *exceptionObject, NULL);
2459 if (restuple == NULL)
2460 return NULL;
2461 if (!PyTuple_Check(restuple)) {
2462 PyErr_Format(PyExc_TypeError, &argparse[4]);
2463 Py_DECREF(restuple);
2464 return NULL;
2465 }
2466 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2467 &resunicode, newpos)) {
2468 Py_DECREF(restuple);
2469 return NULL;
2470 }
2471 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002472 *newpos = size+*newpos;
2473 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002474 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002475 Py_DECREF(restuple);
2476 return NULL;
2477 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 Py_INCREF(resunicode);
2479 Py_DECREF(restuple);
2480 return resunicode;
2481}
2482
2483static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002484 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 const char *errors,
2486 int limit)
2487{
2488 /* output object */
2489 PyObject *res;
2490 /* pointers to the beginning and end+1 of input */
2491 const Py_UNICODE *startp = p;
2492 const Py_UNICODE *endp = p + size;
2493 /* pointer to the beginning of the unencodable characters */
2494 /* const Py_UNICODE *badp = NULL; */
2495 /* pointer into the output */
2496 char *str;
2497 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002498 Py_ssize_t respos = 0;
2499 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002500 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2501 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 PyObject *errorHandler = NULL;
2503 PyObject *exc = NULL;
2504 /* the following variable is used for caching string comparisons
2505 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2506 int known_errorHandler = -1;
2507
2508 /* allocate enough for a simple encoding without
2509 replacements, if we need more, we'll resize */
2510 res = PyString_FromStringAndSize(NULL, size);
2511 if (res == NULL)
2512 goto onError;
2513 if (size == 0)
2514 return res;
2515 str = PyString_AS_STRING(res);
2516 ressize = size;
2517
2518 while (p<endp) {
2519 Py_UNICODE c = *p;
2520
2521 /* can we encode this? */
2522 if (c<limit) {
2523 /* no overflow check, because we know that the space is enough */
2524 *str++ = (char)c;
2525 ++p;
2526 }
2527 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002528 Py_ssize_t unicodepos = p-startp;
2529 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002531 Py_ssize_t repsize;
2532 Py_ssize_t newpos;
2533 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 Py_UNICODE *uni2;
2535 /* startpos for collecting unencodable chars */
2536 const Py_UNICODE *collstart = p;
2537 const Py_UNICODE *collend = p;
2538 /* find all unecodable characters */
2539 while ((collend < endp) && ((*collend)>=limit))
2540 ++collend;
2541 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2542 if (known_errorHandler==-1) {
2543 if ((errors==NULL) || (!strcmp(errors, "strict")))
2544 known_errorHandler = 1;
2545 else if (!strcmp(errors, "replace"))
2546 known_errorHandler = 2;
2547 else if (!strcmp(errors, "ignore"))
2548 known_errorHandler = 3;
2549 else if (!strcmp(errors, "xmlcharrefreplace"))
2550 known_errorHandler = 4;
2551 else
2552 known_errorHandler = 0;
2553 }
2554 switch (known_errorHandler) {
2555 case 1: /* strict */
2556 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2557 goto onError;
2558 case 2: /* replace */
2559 while (collstart++<collend)
2560 *str++ = '?'; /* fall through */
2561 case 3: /* ignore */
2562 p = collend;
2563 break;
2564 case 4: /* xmlcharrefreplace */
2565 respos = str-PyString_AS_STRING(res);
2566 /* determine replacement size (temporarily (mis)uses p) */
2567 for (p = collstart, repsize = 0; p < collend; ++p) {
2568 if (*p<10)
2569 repsize += 2+1+1;
2570 else if (*p<100)
2571 repsize += 2+2+1;
2572 else if (*p<1000)
2573 repsize += 2+3+1;
2574 else if (*p<10000)
2575 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002576#ifndef Py_UNICODE_WIDE
2577 else
2578 repsize += 2+5+1;
2579#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580 else if (*p<100000)
2581 repsize += 2+5+1;
2582 else if (*p<1000000)
2583 repsize += 2+6+1;
2584 else
2585 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002586#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587 }
2588 requiredsize = respos+repsize+(endp-collend);
2589 if (requiredsize > ressize) {
2590 if (requiredsize<2*ressize)
2591 requiredsize = 2*ressize;
2592 if (_PyString_Resize(&res, requiredsize))
2593 goto onError;
2594 str = PyString_AS_STRING(res) + respos;
2595 ressize = requiredsize;
2596 }
2597 /* generate replacement (temporarily (mis)uses p) */
2598 for (p = collstart; p < collend; ++p) {
2599 str += sprintf(str, "&#%d;", (int)*p);
2600 }
2601 p = collend;
2602 break;
2603 default:
2604 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2605 encoding, reason, startp, size, &exc,
2606 collstart-startp, collend-startp, &newpos);
2607 if (repunicode == NULL)
2608 goto onError;
2609 /* need more space? (at least enough for what we
2610 have+the replacement+the rest of the string, so
2611 we won't have to check space for encodable characters) */
2612 respos = str-PyString_AS_STRING(res);
2613 repsize = PyUnicode_GET_SIZE(repunicode);
2614 requiredsize = respos+repsize+(endp-collend);
2615 if (requiredsize > ressize) {
2616 if (requiredsize<2*ressize)
2617 requiredsize = 2*ressize;
2618 if (_PyString_Resize(&res, requiredsize)) {
2619 Py_DECREF(repunicode);
2620 goto onError;
2621 }
2622 str = PyString_AS_STRING(res) + respos;
2623 ressize = requiredsize;
2624 }
2625 /* check if there is anything unencodable in the replacement
2626 and copy it to the output */
2627 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2628 c = *uni2;
2629 if (c >= limit) {
2630 raise_encode_exception(&exc, encoding, startp, size,
2631 unicodepos, unicodepos+1, reason);
2632 Py_DECREF(repunicode);
2633 goto onError;
2634 }
2635 *str = (char)c;
2636 }
2637 p = startp + newpos;
2638 Py_DECREF(repunicode);
2639 }
2640 }
2641 }
2642 /* Resize if we allocated to much */
2643 respos = str-PyString_AS_STRING(res);
2644 if (respos<ressize)
2645 /* If this falls res will be NULL */
2646 _PyString_Resize(&res, respos);
2647 Py_XDECREF(errorHandler);
2648 Py_XDECREF(exc);
2649 return res;
2650
2651 onError:
2652 Py_XDECREF(res);
2653 Py_XDECREF(errorHandler);
2654 Py_XDECREF(exc);
2655 return NULL;
2656}
2657
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002659 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 const char *errors)
2661{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002662 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663}
2664
2665PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2666{
2667 if (!PyUnicode_Check(unicode)) {
2668 PyErr_BadArgument();
2669 return NULL;
2670 }
2671 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2672 PyUnicode_GET_SIZE(unicode),
2673 NULL);
2674}
2675
2676/* --- 7-bit ASCII Codec -------------------------------------------------- */
2677
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002679 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 const char *errors)
2681{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 PyUnicodeObject *v;
2684 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002685 Py_ssize_t startinpos;
2686 Py_ssize_t endinpos;
2687 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002688 const char *e;
2689 PyObject *errorHandler = NULL;
2690 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002691
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002693 if (size == 1 && *(unsigned char*)s < 128) {
2694 Py_UNICODE r = *(unsigned char*)s;
2695 return PyUnicode_FromUnicode(&r, 1);
2696 }
Tim Petersced69f82003-09-16 20:30:58 +00002697
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 v = _PyUnicode_New(size);
2699 if (v == NULL)
2700 goto onError;
2701 if (size == 0)
2702 return (PyObject *)v;
2703 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 e = s + size;
2705 while (s < e) {
2706 register unsigned char c = (unsigned char)*s;
2707 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 ++s;
2710 }
2711 else {
2712 startinpos = s-starts;
2713 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002714 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 if (unicode_decode_call_errorhandler(
2716 errors, &errorHandler,
2717 "ascii", "ordinal not in range(128)",
2718 starts, size, &startinpos, &endinpos, &exc, &s,
2719 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002723 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002724 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002725 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 Py_XDECREF(errorHandler);
2727 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002729
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 onError:
2731 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 Py_XDECREF(errorHandler);
2733 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 return NULL;
2735}
2736
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002738 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 const char *errors)
2740{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742}
2743
2744PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2745{
2746 if (!PyUnicode_Check(unicode)) {
2747 PyErr_BadArgument();
2748 return NULL;
2749 }
2750 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2751 PyUnicode_GET_SIZE(unicode),
2752 NULL);
2753}
2754
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002755#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002756
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002757/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002758
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002759PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002760 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002761 const char *errors)
2762{
2763 PyUnicodeObject *v;
2764 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002765 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002766
2767 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002768 assert(size < INT_MAX);
2769 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002770 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002771 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2772
2773 v = _PyUnicode_New(usize);
2774 if (v == NULL)
2775 return NULL;
2776 if (usize == 0)
2777 return (PyObject *)v;
2778 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002779 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002780 Py_DECREF(v);
2781 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2782 }
2783
2784 return (PyObject *)v;
2785}
2786
2787PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002788 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002789 const char *errors)
2790{
2791 PyObject *repr;
2792 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002793 DWORD mbcssize;
2794
2795 /* If there are no characters, bail now! */
2796 if (size==0)
2797 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002798
2799 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002800 assert(size<INT_MAX);
2801 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002802 if (mbcssize==0)
2803 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2804
2805 repr = PyString_FromStringAndSize(NULL, mbcssize);
2806 if (repr == NULL)
2807 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002808 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002809 return repr;
2810
2811 /* Do the conversion */
2812 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002813 assert(size < INT_MAX);
2814 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002815 Py_DECREF(repr);
2816 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2817 }
2818 return repr;
2819}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002820
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002821PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2822{
2823 if (!PyUnicode_Check(unicode)) {
2824 PyErr_BadArgument();
2825 return NULL;
2826 }
2827 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2828 PyUnicode_GET_SIZE(unicode),
2829 NULL);
2830}
2831
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002832#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002833
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834/* --- Character Mapping Codec -------------------------------------------- */
2835
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002837 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 PyObject *mapping,
2839 const char *errors)
2840{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002841 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002842 Py_ssize_t startinpos;
2843 Py_ssize_t endinpos;
2844 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 PyUnicodeObject *v;
2847 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002848 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 PyObject *errorHandler = NULL;
2850 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002851 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002853
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 /* Default to Latin-1 */
2855 if (mapping == NULL)
2856 return PyUnicode_DecodeLatin1(s, size, errors);
2857
2858 v = _PyUnicode_New(size);
2859 if (v == NULL)
2860 goto onError;
2861 if (size == 0)
2862 return (PyObject *)v;
2863 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002865 if (PyUnicode_CheckExact(mapping)) {
2866 mapstring = PyUnicode_AS_UNICODE(mapping);
2867 maplen = PyUnicode_GET_SIZE(mapping);
2868 while (s < e) {
2869 unsigned char ch = *s;
2870 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002872 if (ch < maplen)
2873 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002875 if (x == 0xfffe) {
2876 /* undefined mapping */
2877 outpos = p-PyUnicode_AS_UNICODE(v);
2878 startinpos = s-starts;
2879 endinpos = startinpos+1;
2880 if (unicode_decode_call_errorhandler(
2881 errors, &errorHandler,
2882 "charmap", "character maps to <undefined>",
2883 starts, size, &startinpos, &endinpos, &exc, &s,
2884 (PyObject **)&v, &outpos, &p)) {
2885 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002886 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002887 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002888 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002889 *p++ = x;
2890 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002892 }
2893 else {
2894 while (s < e) {
2895 unsigned char ch = *s;
2896 PyObject *w, *x;
2897
2898 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2899 w = PyInt_FromLong((long)ch);
2900 if (w == NULL)
2901 goto onError;
2902 x = PyObject_GetItem(mapping, w);
2903 Py_DECREF(w);
2904 if (x == NULL) {
2905 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2906 /* No mapping found means: mapping is undefined. */
2907 PyErr_Clear();
2908 x = Py_None;
2909 Py_INCREF(x);
2910 } else
2911 goto onError;
2912 }
2913
2914 /* Apply mapping */
2915 if (PyInt_Check(x)) {
2916 long value = PyInt_AS_LONG(x);
2917 if (value < 0 || value > 65535) {
2918 PyErr_SetString(PyExc_TypeError,
2919 "character mapping must be in range(65536)");
2920 Py_DECREF(x);
2921 goto onError;
2922 }
2923 *p++ = (Py_UNICODE)value;
2924 }
2925 else if (x == Py_None) {
2926 /* undefined mapping */
2927 outpos = p-PyUnicode_AS_UNICODE(v);
2928 startinpos = s-starts;
2929 endinpos = startinpos+1;
2930 if (unicode_decode_call_errorhandler(
2931 errors, &errorHandler,
2932 "charmap", "character maps to <undefined>",
2933 starts, size, &startinpos, &endinpos, &exc, &s,
2934 (PyObject **)&v, &outpos, &p)) {
2935 Py_DECREF(x);
2936 goto onError;
2937 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002938 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002939 continue;
2940 }
2941 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002942 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002943
2944 if (targetsize == 1)
2945 /* 1-1 mapping */
2946 *p++ = *PyUnicode_AS_UNICODE(x);
2947
2948 else if (targetsize > 1) {
2949 /* 1-n mapping */
2950 if (targetsize > extrachars) {
2951 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002952 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
2953 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002954 (targetsize << 2);
2955 extrachars += needed;
2956 if (_PyUnicode_Resize(&v,
2957 PyUnicode_GET_SIZE(v) + needed) < 0) {
2958 Py_DECREF(x);
2959 goto onError;
2960 }
2961 p = PyUnicode_AS_UNICODE(v) + oldpos;
2962 }
2963 Py_UNICODE_COPY(p,
2964 PyUnicode_AS_UNICODE(x),
2965 targetsize);
2966 p += targetsize;
2967 extrachars -= targetsize;
2968 }
2969 /* 1-0 mapping: skip the character */
2970 }
2971 else {
2972 /* wrong return value */
2973 PyErr_SetString(PyExc_TypeError,
2974 "character mapping must return integer, None or unicode");
2975 Py_DECREF(x);
2976 goto onError;
2977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002979 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
2982 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002983 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 Py_XDECREF(errorHandler);
2986 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002988
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990 Py_XDECREF(errorHandler);
2991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 Py_XDECREF(v);
2993 return NULL;
2994}
2995
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996/* Lookup the character ch in the mapping. If the character
2997 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00002998 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003001 PyObject *w = PyInt_FromLong((long)c);
3002 PyObject *x;
3003
3004 if (w == NULL)
3005 return NULL;
3006 x = PyObject_GetItem(mapping, w);
3007 Py_DECREF(w);
3008 if (x == NULL) {
3009 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3010 /* No mapping found means: mapping is undefined. */
3011 PyErr_Clear();
3012 x = Py_None;
3013 Py_INCREF(x);
3014 return x;
3015 } else
3016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003018 else if (x == Py_None)
3019 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 else if (PyInt_Check(x)) {
3021 long value = PyInt_AS_LONG(x);
3022 if (value < 0 || value > 255) {
3023 PyErr_SetString(PyExc_TypeError,
3024 "character mapping must be in range(256)");
3025 Py_DECREF(x);
3026 return NULL;
3027 }
3028 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003030 else if (PyString_Check(x))
3031 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 /* wrong return value */
3034 PyErr_SetString(PyExc_TypeError,
3035 "character mapping must return integer, None or str");
3036 Py_DECREF(x);
3037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 }
3039}
3040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041/* lookup the character, put the result in the output string and adjust
3042 various state variables. Reallocate the output string if not enough
3043 space is available. Return a new reference to the object that
3044 was put in the output buffer, or Py_None, if the mapping was undefined
3045 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003046 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047static
3048PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003049 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050{
3051 PyObject *rep = charmapencode_lookup(c, mapping);
3052
3053 if (rep==NULL)
3054 return NULL;
3055 else if (rep==Py_None)
3056 return rep;
3057 else {
3058 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003059 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003061 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 if (outsize<requiredsize) {
3063 /* exponentially overallocate to minimize reallocations */
3064 if (requiredsize < 2*outsize)
3065 requiredsize = 2*outsize;
3066 if (_PyString_Resize(outobj, requiredsize)) {
3067 Py_DECREF(rep);
3068 return NULL;
3069 }
3070 outstart = PyString_AS_STRING(*outobj);
3071 }
3072 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3073 }
3074 else {
3075 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003076 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3077 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 if (outsize<requiredsize) {
3079 /* exponentially overallocate to minimize reallocations */
3080 if (requiredsize < 2*outsize)
3081 requiredsize = 2*outsize;
3082 if (_PyString_Resize(outobj, requiredsize)) {
3083 Py_DECREF(rep);
3084 return NULL;
3085 }
3086 outstart = PyString_AS_STRING(*outobj);
3087 }
3088 memcpy(outstart + *outpos, repchars, repsize);
3089 *outpos += repsize;
3090 }
3091 }
3092 return rep;
3093}
3094
3095/* handle an error in PyUnicode_EncodeCharmap
3096 Return 0 on success, -1 on error */
3097static
3098int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003099 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003100 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003101 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003102 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103{
3104 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003105 Py_ssize_t repsize;
3106 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 Py_UNICODE *uni2;
3108 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003109 Py_ssize_t collstartpos = *inpos;
3110 Py_ssize_t collendpos = *inpos+1;
3111 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112 char *encoding = "charmap";
3113 char *reason = "character maps to <undefined>";
3114
3115 PyObject *x;
3116 /* find all unencodable characters */
3117 while (collendpos < size) {
3118 x = charmapencode_lookup(p[collendpos], mapping);
3119 if (x==NULL)
3120 return -1;
3121 else if (x!=Py_None) {
3122 Py_DECREF(x);
3123 break;
3124 }
3125 Py_DECREF(x);
3126 ++collendpos;
3127 }
3128 /* cache callback name lookup
3129 * (if not done yet, i.e. it's the first error) */
3130 if (*known_errorHandler==-1) {
3131 if ((errors==NULL) || (!strcmp(errors, "strict")))
3132 *known_errorHandler = 1;
3133 else if (!strcmp(errors, "replace"))
3134 *known_errorHandler = 2;
3135 else if (!strcmp(errors, "ignore"))
3136 *known_errorHandler = 3;
3137 else if (!strcmp(errors, "xmlcharrefreplace"))
3138 *known_errorHandler = 4;
3139 else
3140 *known_errorHandler = 0;
3141 }
3142 switch (*known_errorHandler) {
3143 case 1: /* strict */
3144 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3145 return -1;
3146 case 2: /* replace */
3147 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3148 x = charmapencode_output('?', mapping, res, respos);
3149 if (x==NULL) {
3150 return -1;
3151 }
3152 else if (x==Py_None) {
3153 Py_DECREF(x);
3154 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3155 return -1;
3156 }
3157 Py_DECREF(x);
3158 }
3159 /* fall through */
3160 case 3: /* ignore */
3161 *inpos = collendpos;
3162 break;
3163 case 4: /* xmlcharrefreplace */
3164 /* generate replacement (temporarily (mis)uses p) */
3165 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3166 char buffer[2+29+1+1];
3167 char *cp;
3168 sprintf(buffer, "&#%d;", (int)p[collpos]);
3169 for (cp = buffer; *cp; ++cp) {
3170 x = charmapencode_output(*cp, mapping, res, respos);
3171 if (x==NULL)
3172 return -1;
3173 else if (x==Py_None) {
3174 Py_DECREF(x);
3175 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3176 return -1;
3177 }
3178 Py_DECREF(x);
3179 }
3180 }
3181 *inpos = collendpos;
3182 break;
3183 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003184 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 encoding, reason, p, size, exceptionObject,
3186 collstartpos, collendpos, &newpos);
3187 if (repunicode == NULL)
3188 return -1;
3189 /* generate replacement */
3190 repsize = PyUnicode_GET_SIZE(repunicode);
3191 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3192 x = charmapencode_output(*uni2, mapping, res, respos);
3193 if (x==NULL) {
3194 Py_DECREF(repunicode);
3195 return -1;
3196 }
3197 else if (x==Py_None) {
3198 Py_DECREF(repunicode);
3199 Py_DECREF(x);
3200 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3201 return -1;
3202 }
3203 Py_DECREF(x);
3204 }
3205 *inpos = newpos;
3206 Py_DECREF(repunicode);
3207 }
3208 return 0;
3209}
3210
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003212 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 PyObject *mapping,
3214 const char *errors)
3215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 /* output object */
3217 PyObject *res = NULL;
3218 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003219 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003220 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003221 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003222 PyObject *errorHandler = NULL;
3223 PyObject *exc = NULL;
3224 /* the following variable is used for caching string comparisons
3225 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3226 * 3=ignore, 4=xmlcharrefreplace */
3227 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228
3229 /* Default to Latin-1 */
3230 if (mapping == NULL)
3231 return PyUnicode_EncodeLatin1(p, size, errors);
3232
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233 /* allocate enough for a simple encoding without
3234 replacements, if we need more, we'll resize */
3235 res = PyString_FromStringAndSize(NULL, size);
3236 if (res == NULL)
3237 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003238 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 while (inpos<size) {
3242 /* try to encode it */
3243 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3244 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 if (x==Py_None) { /* unencodable character */
3247 if (charmap_encoding_error(p, size, &inpos, mapping,
3248 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003249 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003250 &res, &respos)) {
3251 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003252 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 else
3256 /* done with this character => adjust input position */
3257 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 Py_DECREF(x);
3259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 /* Resize if we allocated to much */
3262 if (respos<PyString_GET_SIZE(res)) {
3263 if (_PyString_Resize(&res, respos))
3264 goto onError;
3265 }
3266 Py_XDECREF(exc);
3267 Py_XDECREF(errorHandler);
3268 return res;
3269
3270 onError:
3271 Py_XDECREF(res);
3272 Py_XDECREF(exc);
3273 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 return NULL;
3275}
3276
3277PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3278 PyObject *mapping)
3279{
3280 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3281 PyErr_BadArgument();
3282 return NULL;
3283 }
3284 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3285 PyUnicode_GET_SIZE(unicode),
3286 mapping,
3287 NULL);
3288}
3289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290/* create or adjust a UnicodeTranslateError */
3291static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003292 const Py_UNICODE *unicode, Py_ssize_t size,
3293 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003294 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 if (*exceptionObject == NULL) {
3297 *exceptionObject = PyUnicodeTranslateError_Create(
3298 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 }
3300 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003301 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3302 goto onError;
3303 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3304 goto onError;
3305 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3306 goto onError;
3307 return;
3308 onError:
3309 Py_DECREF(*exceptionObject);
3310 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 }
3312}
3313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314/* raises a UnicodeTranslateError */
3315static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003316 const Py_UNICODE *unicode, Py_ssize_t size,
3317 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 const char *reason)
3319{
3320 make_translate_exception(exceptionObject,
3321 unicode, size, startpos, endpos, reason);
3322 if (*exceptionObject != NULL)
3323 PyCodec_StrictErrors(*exceptionObject);
3324}
3325
3326/* error handling callback helper:
3327 build arguments, call the callback and check the arguments,
3328 put the result into newpos and return the replacement string, which
3329 has to be freed by the caller */
3330static PyObject *unicode_translate_call_errorhandler(const char *errors,
3331 PyObject **errorHandler,
3332 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3334 Py_ssize_t startpos, Py_ssize_t endpos,
3335 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003337 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338
Martin v. Löwis412fb672006-04-13 06:34:32 +00003339 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340 PyObject *restuple;
3341 PyObject *resunicode;
3342
3343 if (*errorHandler == NULL) {
3344 *errorHandler = PyCodec_LookupError(errors);
3345 if (*errorHandler == NULL)
3346 return NULL;
3347 }
3348
3349 make_translate_exception(exceptionObject,
3350 unicode, size, startpos, endpos, reason);
3351 if (*exceptionObject == NULL)
3352 return NULL;
3353
3354 restuple = PyObject_CallFunctionObjArgs(
3355 *errorHandler, *exceptionObject, NULL);
3356 if (restuple == NULL)
3357 return NULL;
3358 if (!PyTuple_Check(restuple)) {
3359 PyErr_Format(PyExc_TypeError, &argparse[4]);
3360 Py_DECREF(restuple);
3361 return NULL;
3362 }
3363 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003364 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 Py_DECREF(restuple);
3366 return NULL;
3367 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003368 if (i_newpos<0)
3369 *newpos = size+i_newpos;
3370 else
3371 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003372 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003373 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003374 Py_DECREF(restuple);
3375 return NULL;
3376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377 Py_INCREF(resunicode);
3378 Py_DECREF(restuple);
3379 return resunicode;
3380}
3381
3382/* Lookup the character ch in the mapping and put the result in result,
3383 which must be decrefed by the caller.
3384 Return 0 on success, -1 on error */
3385static
3386int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3387{
3388 PyObject *w = PyInt_FromLong((long)c);
3389 PyObject *x;
3390
3391 if (w == NULL)
3392 return -1;
3393 x = PyObject_GetItem(mapping, w);
3394 Py_DECREF(w);
3395 if (x == NULL) {
3396 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3397 /* No mapping found means: use 1:1 mapping. */
3398 PyErr_Clear();
3399 *result = NULL;
3400 return 0;
3401 } else
3402 return -1;
3403 }
3404 else if (x == Py_None) {
3405 *result = x;
3406 return 0;
3407 }
3408 else if (PyInt_Check(x)) {
3409 long value = PyInt_AS_LONG(x);
3410 long max = PyUnicode_GetMax();
3411 if (value < 0 || value > max) {
3412 PyErr_Format(PyExc_TypeError,
3413 "character mapping must be in range(0x%lx)", max+1);
3414 Py_DECREF(x);
3415 return -1;
3416 }
3417 *result = x;
3418 return 0;
3419 }
3420 else if (PyUnicode_Check(x)) {
3421 *result = x;
3422 return 0;
3423 }
3424 else {
3425 /* wrong return value */
3426 PyErr_SetString(PyExc_TypeError,
3427 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003428 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 return -1;
3430 }
3431}
3432/* ensure that *outobj is at least requiredsize characters long,
3433if not reallocate and adjust various state variables.
3434Return 0 on success, -1 on error */
3435static
Walter Dörwald4894c302003-10-24 14:25:28 +00003436int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003437 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003439 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003440 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003441 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003442 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003444 if (requiredsize < 2 * oldsize)
3445 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003446 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 return -1;
3448 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 }
3450 return 0;
3451}
3452/* lookup the character, put the result in the output string and adjust
3453 various state variables. Return a new reference to the object that
3454 was put in the output buffer in *result, or Py_None, if the mapping was
3455 undefined (in which case no character was written).
3456 The called must decref result.
3457 Return 0 on success, -1 on error. */
3458static
Walter Dörwald4894c302003-10-24 14:25:28 +00003459int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003460 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003461 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462{
Walter Dörwald4894c302003-10-24 14:25:28 +00003463 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 return -1;
3465 if (*res==NULL) {
3466 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003467 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 }
3469 else if (*res==Py_None)
3470 ;
3471 else if (PyInt_Check(*res)) {
3472 /* no overflow check, because we know that the space is enough */
3473 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3474 }
3475 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003476 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 if (repsize==1) {
3478 /* no overflow check, because we know that the space is enough */
3479 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3480 }
3481 else if (repsize!=0) {
3482 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003483 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003484 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003485 repsize - 1;
3486 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 return -1;
3488 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3489 *outp += repsize;
3490 }
3491 }
3492 else
3493 return -1;
3494 return 0;
3495}
3496
3497PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003498 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 PyObject *mapping,
3500 const char *errors)
3501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 /* output object */
3503 PyObject *res = NULL;
3504 /* pointers to the beginning and end+1 of input */
3505 const Py_UNICODE *startp = p;
3506 const Py_UNICODE *endp = p + size;
3507 /* pointer into the output */
3508 Py_UNICODE *str;
3509 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003510 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 char *reason = "character maps to <undefined>";
3512 PyObject *errorHandler = NULL;
3513 PyObject *exc = NULL;
3514 /* the following variable is used for caching string comparisons
3515 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3516 * 3=ignore, 4=xmlcharrefreplace */
3517 int known_errorHandler = -1;
3518
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 if (mapping == NULL) {
3520 PyErr_BadArgument();
3521 return NULL;
3522 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523
3524 /* allocate enough for a simple 1:1 translation without
3525 replacements, if we need more, we'll resize */
3526 res = PyUnicode_FromUnicode(NULL, size);
3527 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003528 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 return res;
3531 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 while (p<endp) {
3534 /* try to encode it */
3535 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003536 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 goto onError;
3539 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003540 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (x!=Py_None) /* it worked => adjust input pointer */
3542 ++p;
3543 else { /* untranslatable character */
3544 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003545 Py_ssize_t repsize;
3546 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 Py_UNICODE *uni2;
3548 /* startpos for collecting untranslatable chars */
3549 const Py_UNICODE *collstart = p;
3550 const Py_UNICODE *collend = p+1;
3551 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 /* find all untranslatable characters */
3554 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003555 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 goto onError;
3557 Py_XDECREF(x);
3558 if (x!=Py_None)
3559 break;
3560 ++collend;
3561 }
3562 /* cache callback name lookup
3563 * (if not done yet, i.e. it's the first error) */
3564 if (known_errorHandler==-1) {
3565 if ((errors==NULL) || (!strcmp(errors, "strict")))
3566 known_errorHandler = 1;
3567 else if (!strcmp(errors, "replace"))
3568 known_errorHandler = 2;
3569 else if (!strcmp(errors, "ignore"))
3570 known_errorHandler = 3;
3571 else if (!strcmp(errors, "xmlcharrefreplace"))
3572 known_errorHandler = 4;
3573 else
3574 known_errorHandler = 0;
3575 }
3576 switch (known_errorHandler) {
3577 case 1: /* strict */
3578 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3579 goto onError;
3580 case 2: /* replace */
3581 /* No need to check for space, this is a 1:1 replacement */
3582 for (coll = collstart; coll<collend; ++coll)
3583 *str++ = '?';
3584 /* fall through */
3585 case 3: /* ignore */
3586 p = collend;
3587 break;
3588 case 4: /* xmlcharrefreplace */
3589 /* generate replacement (temporarily (mis)uses p) */
3590 for (p = collstart; p < collend; ++p) {
3591 char buffer[2+29+1+1];
3592 char *cp;
3593 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003594 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3596 goto onError;
3597 for (cp = buffer; *cp; ++cp)
3598 *str++ = *cp;
3599 }
3600 p = collend;
3601 break;
3602 default:
3603 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3604 reason, startp, size, &exc,
3605 collstart-startp, collend-startp, &newpos);
3606 if (repunicode == NULL)
3607 goto onError;
3608 /* generate replacement */
3609 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003610 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3612 Py_DECREF(repunicode);
3613 goto onError;
3614 }
3615 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3616 *str++ = *uni2;
3617 p = startp + newpos;
3618 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 }
3620 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 /* Resize if we allocated to much */
3623 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003624 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003625 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003626 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 }
3628 Py_XDECREF(exc);
3629 Py_XDECREF(errorHandler);
3630 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 onError:
3633 Py_XDECREF(res);
3634 Py_XDECREF(exc);
3635 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 return NULL;
3637}
3638
3639PyObject *PyUnicode_Translate(PyObject *str,
3640 PyObject *mapping,
3641 const char *errors)
3642{
3643 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003644
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 str = PyUnicode_FromObject(str);
3646 if (str == NULL)
3647 goto onError;
3648 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3649 PyUnicode_GET_SIZE(str),
3650 mapping,
3651 errors);
3652 Py_DECREF(str);
3653 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003654
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 onError:
3656 Py_XDECREF(str);
3657 return NULL;
3658}
Tim Petersced69f82003-09-16 20:30:58 +00003659
Guido van Rossum9e896b32000-04-05 20:11:21 +00003660/* --- Decimal Encoder ---------------------------------------------------- */
3661
3662int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003663 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003664 char *output,
3665 const char *errors)
3666{
3667 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 PyObject *errorHandler = NULL;
3669 PyObject *exc = NULL;
3670 const char *encoding = "decimal";
3671 const char *reason = "invalid decimal Unicode string";
3672 /* the following variable is used for caching string comparisons
3673 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3674 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003675
3676 if (output == NULL) {
3677 PyErr_BadArgument();
3678 return -1;
3679 }
3680
3681 p = s;
3682 end = s + length;
3683 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003685 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003687 Py_ssize_t repsize;
3688 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 Py_UNICODE *uni2;
3690 Py_UNICODE *collstart;
3691 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003692
Guido van Rossum9e896b32000-04-05 20:11:21 +00003693 if (Py_UNICODE_ISSPACE(ch)) {
3694 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003696 continue;
3697 }
3698 decimal = Py_UNICODE_TODECIMAL(ch);
3699 if (decimal >= 0) {
3700 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003702 continue;
3703 }
Guido van Rossumba477042000-04-06 18:18:10 +00003704 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003705 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003707 continue;
3708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003709 /* All other characters are considered unencodable */
3710 collstart = p;
3711 collend = p+1;
3712 while (collend < end) {
3713 if ((0 < *collend && *collend < 256) ||
3714 !Py_UNICODE_ISSPACE(*collend) ||
3715 Py_UNICODE_TODECIMAL(*collend))
3716 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003717 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 /* cache callback name lookup
3719 * (if not done yet, i.e. it's the first error) */
3720 if (known_errorHandler==-1) {
3721 if ((errors==NULL) || (!strcmp(errors, "strict")))
3722 known_errorHandler = 1;
3723 else if (!strcmp(errors, "replace"))
3724 known_errorHandler = 2;
3725 else if (!strcmp(errors, "ignore"))
3726 known_errorHandler = 3;
3727 else if (!strcmp(errors, "xmlcharrefreplace"))
3728 known_errorHandler = 4;
3729 else
3730 known_errorHandler = 0;
3731 }
3732 switch (known_errorHandler) {
3733 case 1: /* strict */
3734 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3735 goto onError;
3736 case 2: /* replace */
3737 for (p = collstart; p < collend; ++p)
3738 *output++ = '?';
3739 /* fall through */
3740 case 3: /* ignore */
3741 p = collend;
3742 break;
3743 case 4: /* xmlcharrefreplace */
3744 /* generate replacement (temporarily (mis)uses p) */
3745 for (p = collstart; p < collend; ++p)
3746 output += sprintf(output, "&#%d;", (int)*p);
3747 p = collend;
3748 break;
3749 default:
3750 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3751 encoding, reason, s, length, &exc,
3752 collstart-s, collend-s, &newpos);
3753 if (repunicode == NULL)
3754 goto onError;
3755 /* generate replacement */
3756 repsize = PyUnicode_GET_SIZE(repunicode);
3757 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3758 Py_UNICODE ch = *uni2;
3759 if (Py_UNICODE_ISSPACE(ch))
3760 *output++ = ' ';
3761 else {
3762 decimal = Py_UNICODE_TODECIMAL(ch);
3763 if (decimal >= 0)
3764 *output++ = '0' + decimal;
3765 else if (0 < ch && ch < 256)
3766 *output++ = (char)ch;
3767 else {
3768 Py_DECREF(repunicode);
3769 raise_encode_exception(&exc, encoding,
3770 s, length, collstart-s, collend-s, reason);
3771 goto onError;
3772 }
3773 }
3774 }
3775 p = s + newpos;
3776 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003777 }
3778 }
3779 /* 0-terminate the output string */
3780 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 Py_XDECREF(exc);
3782 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003783 return 0;
3784
3785 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 Py_XDECREF(exc);
3787 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003788 return -1;
3789}
3790
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791/* --- Helpers ------------------------------------------------------------ */
3792
Tim Petersced69f82003-09-16 20:30:58 +00003793static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003794Py_ssize_t count(PyUnicodeObject *self,
3795 Py_ssize_t start,
3796 Py_ssize_t end,
3797 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003799 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003801 if (start < 0)
3802 start += self->length;
3803 if (start < 0)
3804 start = 0;
3805 if (end > self->length)
3806 end = self->length;
3807 if (end < 0)
3808 end += self->length;
3809 if (end < 0)
3810 end = 0;
3811
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003812 if (substring->length == 0)
3813 return (end - start + 1);
3814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 end -= substring->length;
3816
3817 while (start <= end)
3818 if (Py_UNICODE_MATCH(self, start, substring)) {
3819 count++;
3820 start += substring->length;
3821 } else
3822 start++;
3823
3824 return count;
3825}
3826
Martin v. Löwis18e16552006-02-15 17:27:45 +00003827Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003829 Py_ssize_t start,
3830 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003832 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 str = PyUnicode_FromObject(str);
3835 if (str == NULL)
3836 return -1;
3837 substr = PyUnicode_FromObject(substr);
3838 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003839 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 return -1;
3841 }
Tim Petersced69f82003-09-16 20:30:58 +00003842
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 result = count((PyUnicodeObject *)str,
3844 start, end,
3845 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003846
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 Py_DECREF(str);
3848 Py_DECREF(substr);
3849 return result;
3850}
3851
Tim Petersced69f82003-09-16 20:30:58 +00003852static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003853Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003855 Py_ssize_t start,
3856 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 int direction)
3858{
3859 if (start < 0)
3860 start += self->length;
3861 if (start < 0)
3862 start = 0;
3863
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 if (end > self->length)
3865 end = self->length;
3866 if (end < 0)
3867 end += self->length;
3868 if (end < 0)
3869 end = 0;
3870
Guido van Rossum76afbd92002-08-20 17:29:29 +00003871 if (substring->length == 0)
3872 return (direction > 0) ? start : end;
3873
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874 end -= substring->length;
3875
3876 if (direction < 0) {
3877 for (; end >= start; end--)
3878 if (Py_UNICODE_MATCH(self, end, substring))
3879 return end;
3880 } else {
3881 for (; start <= end; start++)
3882 if (Py_UNICODE_MATCH(self, start, substring))
3883 return start;
3884 }
3885
3886 return -1;
3887}
3888
Martin v. Löwis18e16552006-02-15 17:27:45 +00003889Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003891 Py_ssize_t start,
3892 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893 int direction)
3894{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003895 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003896
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 str = PyUnicode_FromObject(str);
3898 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003899 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 substr = PyUnicode_FromObject(substr);
3901 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003902 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003903 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 }
Tim Petersced69f82003-09-16 20:30:58 +00003905
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 result = findstring((PyUnicodeObject *)str,
3907 (PyUnicodeObject *)substr,
3908 start, end, direction);
3909 Py_DECREF(str);
3910 Py_DECREF(substr);
3911 return result;
3912}
3913
Tim Petersced69f82003-09-16 20:30:58 +00003914static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915int tailmatch(PyUnicodeObject *self,
3916 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003917 Py_ssize_t start,
3918 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 int direction)
3920{
3921 if (start < 0)
3922 start += self->length;
3923 if (start < 0)
3924 start = 0;
3925
3926 if (substring->length == 0)
3927 return 1;
3928
3929 if (end > self->length)
3930 end = self->length;
3931 if (end < 0)
3932 end += self->length;
3933 if (end < 0)
3934 end = 0;
3935
3936 end -= substring->length;
3937 if (end < start)
3938 return 0;
3939
3940 if (direction > 0) {
3941 if (Py_UNICODE_MATCH(self, end, substring))
3942 return 1;
3943 } else {
3944 if (Py_UNICODE_MATCH(self, start, substring))
3945 return 1;
3946 }
3947
3948 return 0;
3949}
3950
Martin v. Löwis18e16552006-02-15 17:27:45 +00003951Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003953 Py_ssize_t start,
3954 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 int direction)
3956{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003957 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003958
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 str = PyUnicode_FromObject(str);
3960 if (str == NULL)
3961 return -1;
3962 substr = PyUnicode_FromObject(substr);
3963 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003964 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 return -1;
3966 }
Tim Petersced69f82003-09-16 20:30:58 +00003967
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 result = tailmatch((PyUnicodeObject *)str,
3969 (PyUnicodeObject *)substr,
3970 start, end, direction);
3971 Py_DECREF(str);
3972 Py_DECREF(substr);
3973 return result;
3974}
3975
Tim Petersced69f82003-09-16 20:30:58 +00003976static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003978 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 Py_UNICODE ch)
3980{
3981 /* like wcschr, but doesn't stop at NULL characters */
3982
3983 while (size-- > 0) {
3984 if (*s == ch)
3985 return s;
3986 s++;
3987 }
3988
3989 return NULL;
3990}
3991
3992/* Apply fixfct filter to the Unicode object self and return a
3993 reference to the modified object */
3994
Tim Petersced69f82003-09-16 20:30:58 +00003995static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996PyObject *fixup(PyUnicodeObject *self,
3997 int (*fixfct)(PyUnicodeObject *s))
3998{
3999
4000 PyUnicodeObject *u;
4001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004002 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 if (u == NULL)
4004 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004005
4006 Py_UNICODE_COPY(u->str, self->str, self->length);
4007
Tim Peters7a29bd52001-09-12 03:03:31 +00004008 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 /* fixfct should return TRUE if it modified the buffer. If
4010 FALSE, return a reference to the original buffer instead
4011 (to save space, not time) */
4012 Py_INCREF(self);
4013 Py_DECREF(u);
4014 return (PyObject*) self;
4015 }
4016 return (PyObject*) u;
4017}
4018
Tim Petersced69f82003-09-16 20:30:58 +00004019static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020int fixupper(PyUnicodeObject *self)
4021{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004022 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 Py_UNICODE *s = self->str;
4024 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004025
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 while (len-- > 0) {
4027 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004028
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 ch = Py_UNICODE_TOUPPER(*s);
4030 if (ch != *s) {
4031 status = 1;
4032 *s = ch;
4033 }
4034 s++;
4035 }
4036
4037 return status;
4038}
4039
Tim Petersced69f82003-09-16 20:30:58 +00004040static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041int fixlower(PyUnicodeObject *self)
4042{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004043 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 Py_UNICODE *s = self->str;
4045 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004046
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047 while (len-- > 0) {
4048 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004049
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050 ch = Py_UNICODE_TOLOWER(*s);
4051 if (ch != *s) {
4052 status = 1;
4053 *s = ch;
4054 }
4055 s++;
4056 }
4057
4058 return status;
4059}
4060
Tim Petersced69f82003-09-16 20:30:58 +00004061static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062int fixswapcase(PyUnicodeObject *self)
4063{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004064 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 Py_UNICODE *s = self->str;
4066 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004067
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 while (len-- > 0) {
4069 if (Py_UNICODE_ISUPPER(*s)) {
4070 *s = Py_UNICODE_TOLOWER(*s);
4071 status = 1;
4072 } else if (Py_UNICODE_ISLOWER(*s)) {
4073 *s = Py_UNICODE_TOUPPER(*s);
4074 status = 1;
4075 }
4076 s++;
4077 }
4078
4079 return status;
4080}
4081
Tim Petersced69f82003-09-16 20:30:58 +00004082static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083int fixcapitalize(PyUnicodeObject *self)
4084{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004085 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004086 Py_UNICODE *s = self->str;
4087 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004088
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004089 if (len == 0)
4090 return 0;
4091 if (Py_UNICODE_ISLOWER(*s)) {
4092 *s = Py_UNICODE_TOUPPER(*s);
4093 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004095 s++;
4096 while (--len > 0) {
4097 if (Py_UNICODE_ISUPPER(*s)) {
4098 *s = Py_UNICODE_TOLOWER(*s);
4099 status = 1;
4100 }
4101 s++;
4102 }
4103 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104}
4105
4106static
4107int fixtitle(PyUnicodeObject *self)
4108{
4109 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4110 register Py_UNICODE *e;
4111 int previous_is_cased;
4112
4113 /* Shortcut for single character strings */
4114 if (PyUnicode_GET_SIZE(self) == 1) {
4115 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4116 if (*p != ch) {
4117 *p = ch;
4118 return 1;
4119 }
4120 else
4121 return 0;
4122 }
Tim Petersced69f82003-09-16 20:30:58 +00004123
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 e = p + PyUnicode_GET_SIZE(self);
4125 previous_is_cased = 0;
4126 for (; p < e; p++) {
4127 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004128
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 if (previous_is_cased)
4130 *p = Py_UNICODE_TOLOWER(ch);
4131 else
4132 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004133
4134 if (Py_UNICODE_ISLOWER(ch) ||
4135 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 Py_UNICODE_ISTITLE(ch))
4137 previous_is_cased = 1;
4138 else
4139 previous_is_cased = 0;
4140 }
4141 return 1;
4142}
4143
Tim Peters8ce9f162004-08-27 01:49:32 +00004144PyObject *
4145PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146{
Tim Peters8ce9f162004-08-27 01:49:32 +00004147 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004148 const Py_UNICODE blank = ' ';
4149 const Py_UNICODE *sep = &blank;
4150 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004151 PyUnicodeObject *res = NULL; /* the result */
4152 size_t res_alloc = 100; /* # allocated bytes for string in res */
4153 size_t res_used; /* # used bytes */
4154 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4155 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004156 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004157 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004158 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159
Tim Peters05eba1f2004-08-27 21:32:02 +00004160 fseq = PySequence_Fast(seq, "");
4161 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004162 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004163 }
4164
Tim Peters91879ab2004-08-27 22:35:44 +00004165 /* Grrrr. A codec may be invoked to convert str objects to
4166 * Unicode, and so it's possible to call back into Python code
4167 * during PyUnicode_FromObject(), and so it's possible for a sick
4168 * codec to change the size of fseq (if seq is a list). Therefore
4169 * we have to keep refetching the size -- can't assume seqlen
4170 * is invariant.
4171 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004172 seqlen = PySequence_Fast_GET_SIZE(fseq);
4173 /* If empty sequence, return u"". */
4174 if (seqlen == 0) {
4175 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4176 goto Done;
4177 }
4178 /* If singleton sequence with an exact Unicode, return that. */
4179 if (seqlen == 1) {
4180 item = PySequence_Fast_GET_ITEM(fseq, 0);
4181 if (PyUnicode_CheckExact(item)) {
4182 Py_INCREF(item);
4183 res = (PyUnicodeObject *)item;
4184 goto Done;
4185 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004186 }
4187
Tim Peters05eba1f2004-08-27 21:32:02 +00004188 /* At least two items to join, or one that isn't exact Unicode. */
4189 if (seqlen > 1) {
4190 /* Set up sep and seplen -- they're needed. */
4191 if (separator == NULL) {
4192 sep = &blank;
4193 seplen = 1;
4194 }
4195 else {
4196 internal_separator = PyUnicode_FromObject(separator);
4197 if (internal_separator == NULL)
4198 goto onError;
4199 sep = PyUnicode_AS_UNICODE(internal_separator);
4200 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004201 /* In case PyUnicode_FromObject() mutated seq. */
4202 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004203 }
4204 }
4205
4206 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004207 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004208 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004209 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004210 res_p = PyUnicode_AS_UNICODE(res);
4211 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004212
Tim Peters05eba1f2004-08-27 21:32:02 +00004213 for (i = 0; i < seqlen; ++i) {
4214 size_t itemlen;
4215 size_t new_res_used;
4216
4217 item = PySequence_Fast_GET_ITEM(fseq, i);
4218 /* Convert item to Unicode. */
4219 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4220 PyErr_Format(PyExc_TypeError,
4221 "sequence item %i: expected string or Unicode,"
4222 " %.80s found",
4223 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004224 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004225 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004226 item = PyUnicode_FromObject(item);
4227 if (item == NULL)
4228 goto onError;
4229 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004230
Tim Peters91879ab2004-08-27 22:35:44 +00004231 /* In case PyUnicode_FromObject() mutated seq. */
4232 seqlen = PySequence_Fast_GET_SIZE(fseq);
4233
Tim Peters8ce9f162004-08-27 01:49:32 +00004234 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004236 new_res_used = res_used + itemlen;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004237 if (new_res_used < res_used || new_res_used > PY_SSIZE_T_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004238 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004239 if (i < seqlen - 1) {
4240 new_res_used += seplen;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004241 if (new_res_used < res_used || new_res_used > PY_SSIZE_T_MAX)
Tim Peters05eba1f2004-08-27 21:32:02 +00004242 goto Overflow;
4243 }
4244 if (new_res_used > res_alloc) {
4245 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004246 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004247 size_t oldsize = res_alloc;
4248 res_alloc += res_alloc;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004249 if (res_alloc < oldsize || res_alloc > PY_SSIZE_T_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004250 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004251 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004252 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004253 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004255 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004256 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004258
4259 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004260 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004261 res_p += itemlen;
4262 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004263 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004264 res_p += seplen;
4265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004267 res_used = new_res_used;
4268 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004269
Tim Peters05eba1f2004-08-27 21:32:02 +00004270 /* Shrink res to match the used area; this probably can't fail,
4271 * but it's cheap to check.
4272 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004273 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004274 goto onError;
4275
4276 Done:
4277 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004278 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 return (PyObject *)res;
4280
Tim Peters8ce9f162004-08-27 01:49:32 +00004281 Overflow:
4282 PyErr_SetString(PyExc_OverflowError,
4283 "join() is too long for a Python string");
4284 Py_DECREF(item);
4285 /* fall through */
4286
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004288 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004289 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004290 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 return NULL;
4292}
4293
Tim Petersced69f82003-09-16 20:30:58 +00004294static
4295PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004296 Py_ssize_t left,
4297 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 Py_UNICODE fill)
4299{
4300 PyUnicodeObject *u;
4301
4302 if (left < 0)
4303 left = 0;
4304 if (right < 0)
4305 right = 0;
4306
Tim Peters7a29bd52001-09-12 03:03:31 +00004307 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308 Py_INCREF(self);
4309 return self;
4310 }
4311
4312 u = _PyUnicode_New(left + self->length + right);
4313 if (u) {
4314 if (left)
4315 Py_UNICODE_FILL(u->str, fill, left);
4316 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4317 if (right)
4318 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4319 }
4320
4321 return u;
4322}
4323
/* Helpers for the split functions below.  Both build a new Unicode
   object from data[left:right] and add it to the list: SPLIT_APPEND
   at the end, SPLIT_INSERT at the front.  They expect the enclosing
   function to declare `PyObject *str`, to have a `list` variable and
   to provide an `onError` label, which is jumped to on failure.

   Each macro is wrapped in do { ... } while (0) so that it expands to
   a single statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)

#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)

4345
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346static
4347PyObject *split_whitespace(PyUnicodeObject *self,
4348 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004349 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351 register Py_ssize_t i;
4352 register Py_ssize_t j;
4353 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 PyObject *str;
4355
4356 for (i = j = 0; i < len; ) {
4357 /* find a token */
4358 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4359 i++;
4360 j = i;
4361 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4362 i++;
4363 if (j < i) {
4364 if (maxcount-- <= 0)
4365 break;
4366 SPLIT_APPEND(self->str, j, i);
4367 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4368 i++;
4369 j = i;
4370 }
4371 }
4372 if (j < len) {
4373 SPLIT_APPEND(self->str, j, len);
4374 }
4375 return list;
4376
4377 onError:
4378 Py_DECREF(list);
4379 return NULL;
4380}
4381
4382PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004383 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 register Py_ssize_t i;
4386 register Py_ssize_t j;
4387 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 PyObject *list;
4389 PyObject *str;
4390 Py_UNICODE *data;
4391
4392 string = PyUnicode_FromObject(string);
4393 if (string == NULL)
4394 return NULL;
4395 data = PyUnicode_AS_UNICODE(string);
4396 len = PyUnicode_GET_SIZE(string);
4397
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 list = PyList_New(0);
4399 if (!list)
4400 goto onError;
4401
4402 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004403 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004404
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 /* Find a line and append it */
4406 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4407 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408
4409 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004410 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 if (i < len) {
4412 if (data[i] == '\r' && i + 1 < len &&
4413 data[i+1] == '\n')
4414 i += 2;
4415 else
4416 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004417 if (keepends)
4418 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 }
Guido van Rossum86662912000-04-11 15:38:46 +00004420 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 j = i;
4422 }
4423 if (j < len) {
4424 SPLIT_APPEND(data, j, len);
4425 }
4426
4427 Py_DECREF(string);
4428 return list;
4429
4430 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004431 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 Py_DECREF(string);
4433 return NULL;
4434}
4435
Tim Petersced69f82003-09-16 20:30:58 +00004436static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437PyObject *split_char(PyUnicodeObject *self,
4438 PyObject *list,
4439 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 register Py_ssize_t i;
4443 register Py_ssize_t j;
4444 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 PyObject *str;
4446
4447 for (i = j = 0; i < len; ) {
4448 if (self->str[i] == ch) {
4449 if (maxcount-- <= 0)
4450 break;
4451 SPLIT_APPEND(self->str, j, i);
4452 i = j = i + 1;
4453 } else
4454 i++;
4455 }
4456 if (j <= len) {
4457 SPLIT_APPEND(self->str, j, len);
4458 }
4459 return list;
4460
4461 onError:
4462 Py_DECREF(list);
4463 return NULL;
4464}
4465
Tim Petersced69f82003-09-16 20:30:58 +00004466static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467PyObject *split_substring(PyUnicodeObject *self,
4468 PyObject *list,
4469 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004470 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004472 register Py_ssize_t i;
4473 register Py_ssize_t j;
4474 Py_ssize_t len = self->length;
4475 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 PyObject *str;
4477
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004478 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 if (Py_UNICODE_MATCH(self, i, substring)) {
4480 if (maxcount-- <= 0)
4481 break;
4482 SPLIT_APPEND(self->str, j, i);
4483 i = j = i + sublen;
4484 } else
4485 i++;
4486 }
4487 if (j <= len) {
4488 SPLIT_APPEND(self->str, j, len);
4489 }
4490 return list;
4491
4492 onError:
4493 Py_DECREF(list);
4494 return NULL;
4495}
4496
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004497static
4498PyObject *rsplit_whitespace(PyUnicodeObject *self,
4499 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 register Py_ssize_t i;
4503 register Py_ssize_t j;
4504 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004505 PyObject *str;
4506
4507 for (i = j = len - 1; i >= 0; ) {
4508 /* find a token */
4509 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4510 i--;
4511 j = i;
4512 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4513 i--;
4514 if (j > i) {
4515 if (maxcount-- <= 0)
4516 break;
4517 SPLIT_INSERT(self->str, i + 1, j + 1);
4518 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4519 i--;
4520 j = i;
4521 }
4522 }
4523 if (j >= 0) {
4524 SPLIT_INSERT(self->str, 0, j + 1);
4525 }
4526 return list;
4527
4528 onError:
4529 Py_DECREF(list);
4530 return NULL;
4531}
4532
4533static
4534PyObject *rsplit_char(PyUnicodeObject *self,
4535 PyObject *list,
4536 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004537 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004538{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004539 register Py_ssize_t i;
4540 register Py_ssize_t j;
4541 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004542 PyObject *str;
4543
4544 for (i = j = len - 1; i >= 0; ) {
4545 if (self->str[i] == ch) {
4546 if (maxcount-- <= 0)
4547 break;
4548 SPLIT_INSERT(self->str, i + 1, j + 1);
4549 j = i = i - 1;
4550 } else
4551 i--;
4552 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004553 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004554 SPLIT_INSERT(self->str, 0, j + 1);
4555 }
4556 return list;
4557
4558 onError:
4559 Py_DECREF(list);
4560 return NULL;
4561}
4562
4563static
4564PyObject *rsplit_substring(PyUnicodeObject *self,
4565 PyObject *list,
4566 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004568{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 register Py_ssize_t i;
4570 register Py_ssize_t j;
4571 Py_ssize_t len = self->length;
4572 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004573 PyObject *str;
4574
4575 for (i = len - sublen, j = len; i >= 0; ) {
4576 if (Py_UNICODE_MATCH(self, i, substring)) {
4577 if (maxcount-- <= 0)
4578 break;
4579 SPLIT_INSERT(self->str, i + sublen, j);
4580 j = i;
4581 i -= sublen;
4582 } else
4583 i--;
4584 }
4585 if (j >= 0) {
4586 SPLIT_INSERT(self->str, 0, j);
4587 }
4588 return list;
4589
4590 onError:
4591 Py_DECREF(list);
4592 return NULL;
4593}
4594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004596#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597
4598static
4599PyObject *split(PyUnicodeObject *self,
4600 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004601 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602{
4603 PyObject *list;
4604
4605 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004606 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607
4608 list = PyList_New(0);
4609 if (!list)
4610 return NULL;
4611
4612 if (substring == NULL)
4613 return split_whitespace(self,list,maxcount);
4614
4615 else if (substring->length == 1)
4616 return split_char(self,list,substring->str[0],maxcount);
4617
4618 else if (substring->length == 0) {
4619 Py_DECREF(list);
4620 PyErr_SetString(PyExc_ValueError, "empty separator");
4621 return NULL;
4622 }
4623 else
4624 return split_substring(self,list,substring,maxcount);
4625}
4626
Tim Petersced69f82003-09-16 20:30:58 +00004627static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004628PyObject *rsplit(PyUnicodeObject *self,
4629 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004630 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004631{
4632 PyObject *list;
4633
4634 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004635 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004636
4637 list = PyList_New(0);
4638 if (!list)
4639 return NULL;
4640
4641 if (substring == NULL)
4642 return rsplit_whitespace(self,list,maxcount);
4643
4644 else if (substring->length == 1)
4645 return rsplit_char(self,list,substring->str[0],maxcount);
4646
4647 else if (substring->length == 0) {
4648 Py_DECREF(list);
4649 PyErr_SetString(PyExc_ValueError, "empty separator");
4650 return NULL;
4651 }
4652 else
4653 return rsplit_substring(self,list,substring,maxcount);
4654}
4655
4656static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657PyObject *replace(PyUnicodeObject *self,
4658 PyUnicodeObject *str1,
4659 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661{
4662 PyUnicodeObject *u;
4663
4664 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004665 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666
4667 if (str1->length == 1 && str2->length == 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004668 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669
4670 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004671 if (!findchar(self->str, self->length, str1->str[0]) &&
4672 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 /* nothing to replace, return original string */
4674 Py_INCREF(self);
4675 u = self;
4676 } else {
4677 Py_UNICODE u1 = str1->str[0];
4678 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004679
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004681 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 self->length
4683 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004684 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004685 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004686 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 for (i = 0; i < u->length; i++)
4688 if (u->str[i] == u1) {
4689 if (--maxcount < 0)
4690 break;
4691 u->str[i] = u2;
4692 }
4693 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695
4696 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 Py_UNICODE *p;
4699
4700 /* replace strings */
4701 n = count(self, 0, self->length, str1);
4702 if (n > maxcount)
4703 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004704 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004706 if (PyUnicode_CheckExact(self)) {
4707 Py_INCREF(self);
4708 u = self;
4709 }
4710 else {
4711 u = (PyUnicodeObject *)
4712 PyUnicode_FromUnicode(self->str, self->length);
4713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 } else {
4715 u = _PyUnicode_New(
4716 self->length + n * (str2->length - str1->length));
4717 if (u) {
4718 i = 0;
4719 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004720 if (str1->length > 0) {
4721 while (i <= self->length - str1->length)
4722 if (Py_UNICODE_MATCH(self, i, str1)) {
4723 /* replace string segment */
4724 Py_UNICODE_COPY(p, str2->str, str2->length);
4725 p += str2->length;
4726 i += str1->length;
4727 if (--n <= 0) {
4728 /* copy remaining part */
4729 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4730 break;
4731 }
4732 } else
4733 *p++ = self->str[i++];
4734 } else {
4735 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 Py_UNICODE_COPY(p, str2->str, str2->length);
4737 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004738 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004741 }
4742 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 }
4745 }
4746 }
Tim Petersced69f82003-09-16 20:30:58 +00004747
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 return (PyObject *) u;
4749}
4750
4751/* --- Unicode Object Methods --------------------------------------------- */
4752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004753PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754"S.title() -> unicode\n\
4755\n\
4756Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004757characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
4759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004760unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 return fixup(self, fixtitle);
4763}
4764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004765PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766"S.capitalize() -> unicode\n\
4767\n\
4768Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004769have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770
4771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004772unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 return fixup(self, fixcapitalize);
4775}
4776
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the surrounding #if 0.

   Splits self with split(self, NULL, -1), replaces every list item by
   the result of fixup(item, fixcapitalize), then joins the list with
   PyUnicode_Join(NULL, list).  Returns NULL if the split or any fixup()
   call fails; otherwise returns the join result. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *previous = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)previous,
                                 fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the list's reference to the old word before overwriting */
        Py_DECREF(previous);
        PyList_SET_ITEM(words, idx, capped);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return joined;
}
#endif
4814
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004815/* Argument converter. Coerces to a single unicode character */
4816
4817static int
4818convert_uc(PyObject *obj, void *addr)
4819{
4820 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4821 PyObject *uniobj;
4822 Py_UNICODE *unistr;
4823
4824 uniobj = PyUnicode_FromObject(obj);
4825 if (uniobj == NULL) {
4826 PyErr_SetString(PyExc_TypeError,
4827 "The fill character cannot be converted to Unicode");
4828 return 0;
4829 }
4830 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4831 PyErr_SetString(PyExc_TypeError,
4832 "The fill character must be exactly one character long");
4833 Py_DECREF(uniobj);
4834 return 0;
4835 }
4836 unistr = PyUnicode_AS_UNICODE(uniobj);
4837 *fillcharloc = unistr[0];
4838 Py_DECREF(uniobj);
4839 return 1;
4840}
4841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004842PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004843"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004845Return S centered in a Unicode string of length width. Padding is\n\
4846done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847
4848static PyObject *
4849unicode_center(PyUnicodeObject *self, PyObject *args)
4850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 Py_ssize_t marg, left;
4852 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004853 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
Thomas Woutersde017742006-02-16 19:34:37 +00004855 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 return NULL;
4857
Tim Peters7a29bd52001-09-12 03:03:31 +00004858 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 Py_INCREF(self);
4860 return (PyObject*) self;
4861 }
4862
4863 marg = width - self->length;
4864 left = marg / 2 + (marg & width & 1);
4865
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004866 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867}
4868
Marc-André Lemburge5034372000-08-08 08:04:29 +00004869#if 0
4870
4871/* This code should go into some future Unicode collation support
4872 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004873 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004874
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004875/* speedy UTF-16 code point order comparison */
4876/* gleaned from: */
4877/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4878
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004879static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004880{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004881 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004882 0, 0, 0, 0, 0, 0, 0, 0,
4883 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004884 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004885};
4886
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887static int
4888unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4889{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004890 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004891
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 Py_UNICODE *s1 = str1->str;
4893 Py_UNICODE *s2 = str2->str;
4894
4895 len1 = str1->length;
4896 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004897
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004899 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004900
4901 c1 = *s1++;
4902 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004903
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004904 if (c1 > (1<<11) * 26)
4905 c1 += utf16Fixup[c1>>11];
4906 if (c2 > (1<<11) * 26)
4907 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004908 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004909
4910 if (c1 != c2)
4911 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004912
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004913 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 }
4915
4916 return (len1 < len2) ? -1 : (len1 != len2);
4917}
4918
Marc-André Lemburge5034372000-08-08 08:04:29 +00004919#else
4920
4921static int
4922unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4923{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004924 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004925
4926 Py_UNICODE *s1 = str1->str;
4927 Py_UNICODE *s2 = str2->str;
4928
4929 len1 = str1->length;
4930 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004931
Marc-André Lemburge5034372000-08-08 08:04:29 +00004932 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004933 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004934
Fredrik Lundh45714e92001-06-26 16:39:36 +00004935 c1 = *s1++;
4936 c2 = *s2++;
4937
4938 if (c1 != c2)
4939 return (c1 < c2) ? -1 : 1;
4940
Marc-André Lemburge5034372000-08-08 08:04:29 +00004941 len1--; len2--;
4942 }
4943
4944 return (len1 < len2) ? -1 : (len1 != len2);
4945}
4946
4947#endif
4948
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949int PyUnicode_Compare(PyObject *left,
4950 PyObject *right)
4951{
4952 PyUnicodeObject *u = NULL, *v = NULL;
4953 int result;
4954
4955 /* Coerce the two arguments */
4956 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4957 if (u == NULL)
4958 goto onError;
4959 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4960 if (v == NULL)
4961 goto onError;
4962
Thomas Wouters7e474022000-07-16 12:04:32 +00004963 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 if (v == u) {
4965 Py_DECREF(u);
4966 Py_DECREF(v);
4967 return 0;
4968 }
4969
4970 result = unicode_compare(u, v);
4971
4972 Py_DECREF(u);
4973 Py_DECREF(v);
4974 return result;
4975
4976onError:
4977 Py_XDECREF(u);
4978 Py_XDECREF(v);
4979 return -1;
4980}
4981
Guido van Rossum403d68b2000-03-13 15:55:09 +00004982int PyUnicode_Contains(PyObject *container,
4983 PyObject *element)
4984{
4985 PyUnicodeObject *u = NULL, *v = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004986 int result;
4987 Py_ssize_t size;
Barry Warsaw817918c2002-08-06 16:58:21 +00004988 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004989
4990 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004991 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004992 if (v == NULL) {
4993 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004994 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004995 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004996 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004997 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004998 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004999 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005000
Barry Warsaw817918c2002-08-06 16:58:21 +00005001 size = PyUnicode_GET_SIZE(v);
5002 rhs = PyUnicode_AS_UNICODE(v);
5003 lhs = PyUnicode_AS_UNICODE(u);
5004
Guido van Rossum403d68b2000-03-13 15:55:09 +00005005 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00005006 if (size == 1) {
5007 end = lhs + PyUnicode_GET_SIZE(u);
5008 while (lhs < end) {
5009 if (*lhs++ == *rhs) {
5010 result = 1;
5011 break;
5012 }
5013 }
5014 }
5015 else {
5016 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5017 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005018 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005019 result = 1;
5020 break;
5021 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005022 }
5023 }
5024
5025 Py_DECREF(u);
5026 Py_DECREF(v);
5027 return result;
5028
5029onError:
5030 Py_XDECREF(u);
5031 Py_XDECREF(v);
5032 return -1;
5033}
5034
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035/* Concat to string or Unicode object giving a new Unicode object. */
5036
5037PyObject *PyUnicode_Concat(PyObject *left,
5038 PyObject *right)
5039{
5040 PyUnicodeObject *u = NULL, *v = NULL, *w;
5041
5042 /* Coerce the two arguments */
5043 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5044 if (u == NULL)
5045 goto onError;
5046 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5047 if (v == NULL)
5048 goto onError;
5049
5050 /* Shortcuts */
5051 if (v == unicode_empty) {
5052 Py_DECREF(v);
5053 return (PyObject *)u;
5054 }
5055 if (u == unicode_empty) {
5056 Py_DECREF(u);
5057 return (PyObject *)v;
5058 }
5059
5060 /* Concat the two Unicode strings */
5061 w = _PyUnicode_New(u->length + v->length);
5062 if (w == NULL)
5063 goto onError;
5064 Py_UNICODE_COPY(w->str, u->str, u->length);
5065 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5066
5067 Py_DECREF(u);
5068 Py_DECREF(v);
5069 return (PyObject *)w;
5070
5071onError:
5072 Py_XDECREF(u);
5073 Py_XDECREF(v);
5074 return NULL;
5075}
5076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005077PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078"S.count(sub[, start[, end]]) -> int\n\
5079\n\
5080Return the number of occurrences of substring sub in Unicode string\n\
5081S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005082interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
5084static PyObject *
5085unicode_count(PyUnicodeObject *self, PyObject *args)
5086{
5087 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005088 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005089 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 PyObject *result;
5091
Guido van Rossumb8872e62000-05-09 14:14:27 +00005092 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5093 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 return NULL;
5095
5096 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5097 (PyObject *)substring);
5098 if (substring == NULL)
5099 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005100
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 if (start < 0)
5102 start += self->length;
5103 if (start < 0)
5104 start = 0;
5105 if (end > self->length)
5106 end = self->length;
5107 if (end < 0)
5108 end += self->length;
5109 if (end < 0)
5110 end = 0;
5111
5112 result = PyInt_FromLong((long) count(self, start, end, substring));
5113
5114 Py_DECREF(substring);
5115 return result;
5116}
5117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005118PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005119"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005121Encodes S using the codec registered for encoding. encoding defaults\n\
5122to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005123handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5125'xmlcharrefreplace' as well as any other name registered with\n\
5126codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127
5128static PyObject *
5129unicode_encode(PyUnicodeObject *self, PyObject *args)
5130{
5131 char *encoding = NULL;
5132 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005133 PyObject *v;
5134
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5136 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005137 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005138 if (v == NULL)
5139 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005140 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5141 PyErr_Format(PyExc_TypeError,
5142 "encoder did not return a string/unicode object "
5143 "(type=%.400s)",
5144 v->ob_type->tp_name);
5145 Py_DECREF(v);
5146 return NULL;
5147 }
5148 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005149
5150 onError:
5151 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005152}
5153
5154PyDoc_STRVAR(decode__doc__,
5155"S.decode([encoding[,errors]]) -> string or unicode\n\
5156\n\
5157Decodes S using the codec registered for encoding. encoding defaults\n\
5158to the default encoding. errors may be given to set a different error\n\
5159handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5160a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5161as well as any other name registerd with codecs.register_error that is\n\
5162able to handle UnicodeDecodeErrors.");
5163
5164static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005165unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005166{
5167 char *encoding = NULL;
5168 char *errors = NULL;
5169 PyObject *v;
5170
5171 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5172 return NULL;
5173 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005174 if (v == NULL)
5175 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005176 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5177 PyErr_Format(PyExc_TypeError,
5178 "decoder did not return a string/unicode object "
5179 "(type=%.400s)",
5180 v->ob_type->tp_name);
5181 Py_DECREF(v);
5182 return NULL;
5183 }
5184 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005185
5186 onError:
5187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188}
5189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005190PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191"S.expandtabs([tabsize]) -> unicode\n\
5192\n\
5193Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005194If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
5196static PyObject*
5197unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5198{
5199 Py_UNICODE *e;
5200 Py_UNICODE *p;
5201 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 PyUnicodeObject *u;
5204 int tabsize = 8;
5205
5206 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5207 return NULL;
5208
Thomas Wouters7e474022000-07-16 12:04:32 +00005209 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 i = j = 0;
5211 e = self->str + self->length;
5212 for (p = self->str; p < e; p++)
5213 if (*p == '\t') {
5214 if (tabsize > 0)
5215 j += tabsize - (j % tabsize);
5216 }
5217 else {
5218 j++;
5219 if (*p == '\n' || *p == '\r') {
5220 i += j;
5221 j = 0;
5222 }
5223 }
5224
5225 /* Second pass: create output string and fill it */
5226 u = _PyUnicode_New(i + j);
5227 if (!u)
5228 return NULL;
5229
5230 j = 0;
5231 q = u->str;
5232
5233 for (p = self->str; p < e; p++)
5234 if (*p == '\t') {
5235 if (tabsize > 0) {
5236 i = tabsize - (j % tabsize);
5237 j += i;
5238 while (i--)
5239 *q++ = ' ';
5240 }
5241 }
5242 else {
5243 j++;
5244 *q++ = *p;
5245 if (*p == '\n' || *p == '\r')
5246 j = 0;
5247 }
5248
5249 return (PyObject*) u;
5250}
5251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005252PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253"S.find(sub [,start [,end]]) -> int\n\
5254\n\
5255Return the lowest index in S where substring sub is found,\n\
5256such that sub is contained within s[start,end]. Optional\n\
5257arguments start and end are interpreted as in slice notation.\n\
5258\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005259Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
5261static PyObject *
5262unicode_find(PyUnicodeObject *self, PyObject *args)
5263{
5264 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005266 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 PyObject *result;
5268
Guido van Rossumb8872e62000-05-09 14:14:27 +00005269 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5270 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 return NULL;
5272 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5273 (PyObject *)substring);
5274 if (substring == NULL)
5275 return NULL;
5276
Martin v. Löwis18e16552006-02-15 17:27:45 +00005277 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278
5279 Py_DECREF(substring);
5280 return result;
5281}
5282
5283static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005284unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
5286 if (index < 0 || index >= self->length) {
5287 PyErr_SetString(PyExc_IndexError, "string index out of range");
5288 return NULL;
5289 }
5290
5291 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5292}
5293
5294static long
5295unicode_hash(PyUnicodeObject *self)
5296{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005297 /* Since Unicode objects compare equal to their ASCII string
5298 counterparts, they should use the individual character values
5299 as basis for their hash value. This is needed to assure that
5300 strings and Unicode objects behave in the same way as
5301 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302
Martin v. Löwis18e16552006-02-15 17:27:45 +00005303 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005304 register Py_UNICODE *p;
5305 register long x;
5306
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 if (self->hash != -1)
5308 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005309 len = PyUnicode_GET_SIZE(self);
5310 p = PyUnicode_AS_UNICODE(self);
5311 x = *p << 7;
5312 while (--len >= 0)
5313 x = (1000003*x) ^ *p++;
5314 x ^= PyUnicode_GET_SIZE(self);
5315 if (x == -1)
5316 x = -2;
5317 self->hash = x;
5318 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319}
5320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005321PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322"S.index(sub [,start [,end]]) -> int\n\
5323\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005324Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325
5326static PyObject *
5327unicode_index(PyUnicodeObject *self, PyObject *args)
5328{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005329 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005332 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333
Guido van Rossumb8872e62000-05-09 14:14:27 +00005334 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5335 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005337
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5339 (PyObject *)substring);
5340 if (substring == NULL)
5341 return NULL;
5342
5343 result = findstring(self, substring, start, end, 1);
5344
5345 Py_DECREF(substring);
5346 if (result < 0) {
5347 PyErr_SetString(PyExc_ValueError, "substring not found");
5348 return NULL;
5349 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005350 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351}
5352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005353PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005354"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005356Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005357at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
5359static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005360unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361{
5362 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5363 register const Py_UNICODE *e;
5364 int cased;
5365
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 /* Shortcut for single character strings */
5367 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005368 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005370 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005371 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005372 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 e = p + PyUnicode_GET_SIZE(self);
5375 cased = 0;
5376 for (; p < e; p++) {
5377 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005378
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005380 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 else if (!cased && Py_UNICODE_ISLOWER(ch))
5382 cased = 1;
5383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005384 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385}
5386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005387PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005388"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005390Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005391at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
5393static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005394unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
5396 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5397 register const Py_UNICODE *e;
5398 int cased;
5399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 /* Shortcut for single character strings */
5401 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005402 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005404 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005405 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005406 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 e = p + PyUnicode_GET_SIZE(self);
5409 cased = 0;
5410 for (; p < e; p++) {
5411 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005414 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 else if (!cased && Py_UNICODE_ISUPPER(ch))
5416 cased = 1;
5417 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005418 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419}
5420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005421PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005422"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005424Return True if S is a titlecased string and there is at least one\n\
5425character in S, i.e. upper- and titlecase characters may only\n\
5426follow uncased characters and lowercase characters only cased ones.\n\
5427Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
5429static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005430unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431{
5432 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5433 register const Py_UNICODE *e;
5434 int cased, previous_is_cased;
5435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 /* Shortcut for single character strings */
5437 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005438 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5439 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005441 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005442 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 e = p + PyUnicode_GET_SIZE(self);
5446 cased = 0;
5447 previous_is_cased = 0;
5448 for (; p < e; p++) {
5449 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5452 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005453 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 previous_is_cased = 1;
5455 cased = 1;
5456 }
5457 else if (Py_UNICODE_ISLOWER(ch)) {
5458 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005459 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 previous_is_cased = 1;
5461 cased = 1;
5462 }
5463 else
5464 previous_is_cased = 0;
5465 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005466 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467}
5468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005469PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005470"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005472Return True if all characters in S are whitespace\n\
5473and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
5475static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005476unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477{
5478 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5479 register const Py_UNICODE *e;
5480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 /* Shortcut for single character strings */
5482 if (PyUnicode_GET_SIZE(self) == 1 &&
5483 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005484 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005486 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005487 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005488 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005489
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 e = p + PyUnicode_GET_SIZE(self);
5491 for (; p < e; p++) {
5492 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005493 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005495 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496}
5497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005498PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005499"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005500\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005501Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005502and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005503
5504static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005505unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005506{
5507 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5508 register const Py_UNICODE *e;
5509
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005510 /* Shortcut for single character strings */
5511 if (PyUnicode_GET_SIZE(self) == 1 &&
5512 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005513 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005514
5515 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005516 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005517 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005518
5519 e = p + PyUnicode_GET_SIZE(self);
5520 for (; p < e; p++) {
5521 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005522 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005523 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005524 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005525}
5526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005527PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005528"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005529\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005530Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005531and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005532
5533static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005534unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005535{
5536 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5537 register const Py_UNICODE *e;
5538
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005539 /* Shortcut for single character strings */
5540 if (PyUnicode_GET_SIZE(self) == 1 &&
5541 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005542 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005543
5544 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005545 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005546 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005547
5548 e = p + PyUnicode_GET_SIZE(self);
5549 for (; p < e; p++) {
5550 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005551 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005552 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005553 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005554}
5555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005556PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005557"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005559Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005560False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
5562static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005563unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564{
5565 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5566 register const Py_UNICODE *e;
5567
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 /* Shortcut for single character strings */
5569 if (PyUnicode_GET_SIZE(self) == 1 &&
5570 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005571 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005573 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005574 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005575 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005576
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 e = p + PyUnicode_GET_SIZE(self);
5578 for (; p < e; p++) {
5579 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005580 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005582 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583}
5584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005585PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005586"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005588Return True if all characters in S are digits\n\
5589and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590
5591static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005592unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593{
5594 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5595 register const Py_UNICODE *e;
5596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 /* Shortcut for single character strings */
5598 if (PyUnicode_GET_SIZE(self) == 1 &&
5599 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005600 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005602 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005603 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005604 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005605
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 e = p + PyUnicode_GET_SIZE(self);
5607 for (; p < e; p++) {
5608 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005609 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005611 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612}
5613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005614PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005615"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005617Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005618False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619
5620static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005621unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622{
5623 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5624 register const Py_UNICODE *e;
5625
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 /* Shortcut for single character strings */
5627 if (PyUnicode_GET_SIZE(self) == 1 &&
5628 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005629 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005631 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005632 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005633 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005634
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 e = p + PyUnicode_GET_SIZE(self);
5636 for (; p < e; p++) {
5637 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005638 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005640 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641}
5642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005643PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644"S.join(sequence) -> unicode\n\
5645\n\
5646Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005647sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648
5649static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005650unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005652 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653}
5654
Martin v. Löwis18e16552006-02-15 17:27:45 +00005655static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656unicode_length(PyUnicodeObject *self)
5657{
5658 return self->length;
5659}
5660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005661PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005662"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663\n\
5664Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005665done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
5667static PyObject *
5668unicode_ljust(PyUnicodeObject *self, PyObject *args)
5669{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005670 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005671 Py_UNICODE fillchar = ' ';
5672
Martin v. Löwis412fb672006-04-13 06:34:32 +00005673 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 return NULL;
5675
Tim Peters7a29bd52001-09-12 03:03:31 +00005676 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 Py_INCREF(self);
5678 return (PyObject*) self;
5679 }
5680
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005681 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682}
5683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005684PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685"S.lower() -> unicode\n\
5686\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005687Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
5689static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005690unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 return fixup(self, fixlower);
5693}
5694
/* Selector values for the strip helpers; they double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for error messages: skips the 3-character "|O:"
   prefix of the format string. */
#define STRIPNAME(i) (stripformat[i] + 3)
5703
5704static const Py_UNICODE *
5705unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5706{
Tim Peters030a5ce2002-04-22 19:00:10 +00005707 size_t i;
5708 for (i = 0; i < n; ++i)
5709 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005710 return s+i;
5711 return NULL;
5712}
5713
5714/* externally visible for str.strip(unicode) */
5715PyObject *
5716_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5717{
5718 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005719 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005720 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5722 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005723
5724 i = 0;
5725 if (striptype != RIGHTSTRIP) {
5726 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5727 i++;
5728 }
5729 }
5730
5731 j = len;
5732 if (striptype != LEFTSTRIP) {
5733 do {
5734 j--;
5735 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5736 j++;
5737 }
5738
5739 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5740 Py_INCREF(self);
5741 return (PyObject*)self;
5742 }
5743 else
5744 return PyUnicode_FromUnicode(s+i, j-i);
5745}
5746
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
5748static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005749do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005751 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005753
5754 i = 0;
5755 if (striptype != RIGHTSTRIP) {
5756 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5757 i++;
5758 }
5759 }
5760
5761 j = len;
5762 if (striptype != LEFTSTRIP) {
5763 do {
5764 j--;
5765 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5766 j++;
5767 }
5768
5769 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5770 Py_INCREF(self);
5771 return (PyObject*)self;
5772 }
5773 else
5774 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775}
5776
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005777
5778static PyObject *
5779do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5780{
5781 PyObject *sep = NULL;
5782
5783 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5784 return NULL;
5785
5786 if (sep != NULL && sep != Py_None) {
5787 if (PyUnicode_Check(sep))
5788 return _PyUnicode_XStrip(self, striptype, sep);
5789 else if (PyString_Check(sep)) {
5790 PyObject *res;
5791 sep = PyUnicode_FromObject(sep);
5792 if (sep==NULL)
5793 return NULL;
5794 res = _PyUnicode_XStrip(self, striptype, sep);
5795 Py_DECREF(sep);
5796 return res;
5797 }
5798 else {
5799 PyErr_Format(PyExc_TypeError,
5800 "%s arg must be None, unicode or str",
5801 STRIPNAME(striptype));
5802 return NULL;
5803 }
5804 }
5805
5806 return do_strip(self, striptype);
5807}
5808
5809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005810PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005811"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005812\n\
5813Return a copy of the string S with leading and trailing\n\
5814whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005815If chars is given and not None, remove characters in chars instead.\n\
5816If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005817
5818static PyObject *
5819unicode_strip(PyUnicodeObject *self, PyObject *args)
5820{
5821 if (PyTuple_GET_SIZE(args) == 0)
5822 return do_strip(self, BOTHSTRIP); /* Common case */
5823 else
5824 return do_argstrip(self, BOTHSTRIP, args);
5825}
5826
5827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005828PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005829"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005830\n\
5831Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005832If chars is given and not None, remove characters in chars instead.\n\
5833If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005834
5835static PyObject *
5836unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5837{
5838 if (PyTuple_GET_SIZE(args) == 0)
5839 return do_strip(self, LEFTSTRIP); /* Common case */
5840 else
5841 return do_argstrip(self, LEFTSTRIP, args);
5842}
5843
5844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005845PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005846"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005847\n\
5848Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005849If chars is given and not None, remove characters in chars instead.\n\
5850If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005851
5852static PyObject *
5853unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5854{
5855 if (PyTuple_GET_SIZE(args) == 0)
5856 return do_strip(self, RIGHTSTRIP); /* Common case */
5857 else
5858 return do_argstrip(self, RIGHTSTRIP, args);
5859}
5860
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864{
5865 PyUnicodeObject *u;
5866 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005868 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
5870 if (len < 0)
5871 len = 0;
5872
Tim Peters7a29bd52001-09-12 03:03:31 +00005873 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 /* no repeat, return original string */
5875 Py_INCREF(str);
5876 return (PyObject*) str;
5877 }
Tim Peters8f422462000-09-09 06:13:41 +00005878
5879 /* ensure # of chars needed doesn't overflow int and # of bytes
5880 * needed doesn't overflow size_t
5881 */
5882 nchars = len * str->length;
5883 if (len && nchars / len != str->length) {
5884 PyErr_SetString(PyExc_OverflowError,
5885 "repeated string is too long");
5886 return NULL;
5887 }
5888 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5889 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5890 PyErr_SetString(PyExc_OverflowError,
5891 "repeated string is too long");
5892 return NULL;
5893 }
5894 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 if (!u)
5896 return NULL;
5897
5898 p = u->str;
5899
5900 while (len-- > 0) {
5901 Py_UNICODE_COPY(p, str->str, str->length);
5902 p += str->length;
5903 }
5904
5905 return (PyObject*) u;
5906}
5907
5908PyObject *PyUnicode_Replace(PyObject *obj,
5909 PyObject *subobj,
5910 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005911 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912{
5913 PyObject *self;
5914 PyObject *str1;
5915 PyObject *str2;
5916 PyObject *result;
5917
5918 self = PyUnicode_FromObject(obj);
5919 if (self == NULL)
5920 return NULL;
5921 str1 = PyUnicode_FromObject(subobj);
5922 if (str1 == NULL) {
5923 Py_DECREF(self);
5924 return NULL;
5925 }
5926 str2 = PyUnicode_FromObject(replobj);
5927 if (str2 == NULL) {
5928 Py_DECREF(self);
5929 Py_DECREF(str1);
5930 return NULL;
5931 }
Tim Petersced69f82003-09-16 20:30:58 +00005932 result = replace((PyUnicodeObject *)self,
5933 (PyUnicodeObject *)str1,
5934 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 maxcount);
5936 Py_DECREF(self);
5937 Py_DECREF(str1);
5938 Py_DECREF(str2);
5939 return result;
5940}
5941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005942PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943"S.replace (old, new[, maxsplit]) -> unicode\n\
5944\n\
5945Return a copy of S with all occurrences of substring\n\
5946old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005947given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948
5949static PyObject*
5950unicode_replace(PyUnicodeObject *self, PyObject *args)
5951{
5952 PyUnicodeObject *str1;
5953 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005954 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 PyObject *result;
5956
Martin v. Löwis18e16552006-02-15 17:27:45 +00005957 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 return NULL;
5959 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5960 if (str1 == NULL)
5961 return NULL;
5962 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005963 if (str2 == NULL) {
5964 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005966 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
5968 result = replace(self, str1, str2, maxcount);
5969
5970 Py_DECREF(str1);
5971 Py_DECREF(str2);
5972 return result;
5973}
5974
5975static
5976PyObject *unicode_repr(PyObject *unicode)
5977{
5978 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5979 PyUnicode_GET_SIZE(unicode),
5980 1);
5981}
5982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005983PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984"S.rfind(sub [,start [,end]]) -> int\n\
5985\n\
5986Return the highest index in S where substring sub is found,\n\
5987such that sub is contained within s[start,end]. Optional\n\
5988arguments start and end are interpreted as in slice notation.\n\
5989\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005990Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
5992static PyObject *
5993unicode_rfind(PyUnicodeObject *self, PyObject *args)
5994{
5995 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005996 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005997 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 PyObject *result;
5999
Guido van Rossumb8872e62000-05-09 14:14:27 +00006000 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6001 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 return NULL;
6003 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6004 (PyObject *)substring);
6005 if (substring == NULL)
6006 return NULL;
6007
Martin v. Löwis18e16552006-02-15 17:27:45 +00006008 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
6010 Py_DECREF(substring);
6011 return result;
6012}
6013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006014PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015"S.rindex(sub [,start [,end]]) -> int\n\
6016\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006017Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018
6019static PyObject *
6020unicode_rindex(PyUnicodeObject *self, PyObject *args)
6021{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006022 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006024 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006025 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026
Guido van Rossumb8872e62000-05-09 14:14:27 +00006027 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6028 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 return NULL;
6030 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6031 (PyObject *)substring);
6032 if (substring == NULL)
6033 return NULL;
6034
6035 result = findstring(self, substring, start, end, -1);
6036
6037 Py_DECREF(substring);
6038 if (result < 0) {
6039 PyErr_SetString(PyExc_ValueError, "substring not found");
6040 return NULL;
6041 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006042 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043}
6044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006045PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006046"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047\n\
6048Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006049done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
6051static PyObject *
6052unicode_rjust(PyUnicodeObject *self, PyObject *args)
6053{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006054 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006055 Py_UNICODE fillchar = ' ';
6056
Martin v. Löwis412fb672006-04-13 06:34:32 +00006057 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 return NULL;
6059
Tim Peters7a29bd52001-09-12 03:03:31 +00006060 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 Py_INCREF(self);
6062 return (PyObject*) self;
6063 }
6064
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006065 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066}
6067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006069unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070{
6071 /* standard clamping */
6072 if (start < 0)
6073 start = 0;
6074 if (end < 0)
6075 end = 0;
6076 if (end > self->length)
6077 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006078 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 /* full slice, return original string */
6080 Py_INCREF(self);
6081 return (PyObject*) self;
6082 }
6083 if (start > end)
6084 start = end;
6085 /* copy slice */
6086 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6087 end - start);
6088}
6089
6090PyObject *PyUnicode_Split(PyObject *s,
6091 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006092 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093{
6094 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006095
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 s = PyUnicode_FromObject(s);
6097 if (s == NULL)
6098 return NULL;
6099 if (sep != NULL) {
6100 sep = PyUnicode_FromObject(sep);
6101 if (sep == NULL) {
6102 Py_DECREF(s);
6103 return NULL;
6104 }
6105 }
6106
6107 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6108
6109 Py_DECREF(s);
6110 Py_XDECREF(sep);
6111 return result;
6112}
6113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006114PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115"S.split([sep [,maxsplit]]) -> list of strings\n\
6116\n\
6117Return a list of the words in S, using sep as the\n\
6118delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006119splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006120any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
6122static PyObject*
6123unicode_split(PyUnicodeObject *self, PyObject *args)
6124{
6125 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006126 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
Martin v. Löwis18e16552006-02-15 17:27:45 +00006128 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 return NULL;
6130
6131 if (substring == Py_None)
6132 return split(self, NULL, maxcount);
6133 else if (PyUnicode_Check(substring))
6134 return split(self, (PyUnicodeObject *)substring, maxcount);
6135 else
6136 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6137}
6138
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006139PyObject *PyUnicode_RSplit(PyObject *s,
6140 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006141 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006142{
6143 PyObject *result;
6144
6145 s = PyUnicode_FromObject(s);
6146 if (s == NULL)
6147 return NULL;
6148 if (sep != NULL) {
6149 sep = PyUnicode_FromObject(sep);
6150 if (sep == NULL) {
6151 Py_DECREF(s);
6152 return NULL;
6153 }
6154 }
6155
6156 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6157
6158 Py_DECREF(s);
6159 Py_XDECREF(sep);
6160 return result;
6161}
6162
6163PyDoc_STRVAR(rsplit__doc__,
6164"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6165\n\
6166Return a list of the words in S, using sep as the\n\
6167delimiter string, starting at the end of the string and\n\
6168working to the front. If maxsplit is given, at most maxsplit\n\
6169splits are done. If sep is not specified, any whitespace string\n\
6170is a separator.");
6171
6172static PyObject*
6173unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6174{
6175 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006176 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006177
Martin v. Löwis18e16552006-02-15 17:27:45 +00006178 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006179 return NULL;
6180
6181 if (substring == Py_None)
6182 return rsplit(self, NULL, maxcount);
6183 else if (PyUnicode_Check(substring))
6184 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6185 else
6186 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6187}
6188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006189PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006190"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191\n\
6192Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006193Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006194is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195
6196static PyObject*
6197unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6198{
Guido van Rossum86662912000-04-11 15:38:46 +00006199 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200
Guido van Rossum86662912000-04-11 15:38:46 +00006201 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 return NULL;
6203
Guido van Rossum86662912000-04-11 15:38:46 +00006204 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205}
6206
6207static
6208PyObject *unicode_str(PyUnicodeObject *self)
6209{
Fred Drakee4315f52000-05-09 19:53:39 +00006210 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211}
6212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006213PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214"S.swapcase() -> unicode\n\
6215\n\
6216Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006217and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218
6219static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006220unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 return fixup(self, fixswapcase);
6223}
6224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006225PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226"S.translate(table) -> unicode\n\
6227\n\
6228Return a copy of the string S, where all characters have been mapped\n\
6229through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006230Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6231Unmapped characters are left untouched. Characters mapped to None\n\
6232are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
6234static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006235unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236{
Tim Petersced69f82003-09-16 20:30:58 +00006237 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006239 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 "ignore");
6241}
6242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006243PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244"S.upper() -> unicode\n\
6245\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006246Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247
6248static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006249unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 return fixup(self, fixupper);
6252}
6253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006254PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255"S.zfill(width) -> unicode\n\
6256\n\
6257Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006258of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259
6260static PyObject *
6261unicode_zfill(PyUnicodeObject *self, PyObject *args)
6262{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006263 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 PyUnicodeObject *u;
6265
Martin v. Löwis18e16552006-02-15 17:27:45 +00006266 Py_ssize_t width;
6267 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 return NULL;
6269
6270 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006271 if (PyUnicode_CheckExact(self)) {
6272 Py_INCREF(self);
6273 return (PyObject*) self;
6274 }
6275 else
6276 return PyUnicode_FromUnicode(
6277 PyUnicode_AS_UNICODE(self),
6278 PyUnicode_GET_SIZE(self)
6279 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 }
6281
6282 fill = width - self->length;
6283
6284 u = pad(self, fill, 0, '0');
6285
Walter Dörwald068325e2002-04-15 13:36:47 +00006286 if (u == NULL)
6287 return NULL;
6288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 if (u->str[fill] == '+' || u->str[fill] == '-') {
6290 /* move sign to beginning of string */
6291 u->str[0] = u->str[fill];
6292 u->str[fill] = '0';
6293 }
6294
6295 return (PyObject*) u;
6296}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006306PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006307"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006309Return True if S starts with the specified prefix, False otherwise.\n\
6310With optional start, test S beginning at that position.\n\
6311With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
6313static PyObject *
6314unicode_startswith(PyUnicodeObject *self,
6315 PyObject *args)
6316{
6317 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006318 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006319 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 PyObject *result;
6321
Guido van Rossumb8872e62000-05-09 14:14:27 +00006322 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6323 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 return NULL;
6325 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6326 (PyObject *)substring);
6327 if (substring == NULL)
6328 return NULL;
6329
Guido van Rossum77f6a652002-04-03 22:41:51 +00006330 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331
6332 Py_DECREF(substring);
6333 return result;
6334}
6335
6336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006337PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006338"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006340Return True if S ends with the specified suffix, False otherwise.\n\
6341With optional start, test S beginning at that position.\n\
6342With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
6344static PyObject *
6345unicode_endswith(PyUnicodeObject *self,
6346 PyObject *args)
6347{
6348 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006349 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006350 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 PyObject *result;
6352
Guido van Rossumb8872e62000-05-09 14:14:27 +00006353 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6354 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355 return NULL;
6356 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6357 (PyObject *)substring);
6358 if (substring == NULL)
6359 return NULL;
6360
Guido van Rossum77f6a652002-04-03 22:41:51 +00006361 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362
6363 Py_DECREF(substring);
6364 return result;
6365}
6366
6367
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006368
6369static PyObject *
6370unicode_getnewargs(PyUnicodeObject *v)
6371{
6372 return Py_BuildValue("(u#)", v->str, v->length);
6373}
6374
6375
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376static PyMethodDef unicode_methods[] = {
6377
6378 /* Order is according to common usage: often used methods should
6379 appear first, since lookup is done sequentially. */
6380
Georg Brandlecdc0a92006-03-30 12:19:07 +00006381 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006382 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6383 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006384 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006385 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6386 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6387 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6388 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6389 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6390 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6391 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6392 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6393 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6394 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006395 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006396 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006397/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6398 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6399 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6400 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006401 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006402 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006403 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006404 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6405 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6406 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6407 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6408 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6409 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6410 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6411 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6412 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6413 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6414 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6415 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6416 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6417 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006418 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006419#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006420 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421#endif
6422
6423#if 0
6424 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006425 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426#endif
6427
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006428 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 {NULL, NULL}
6430};
6431
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006432static PyObject *
6433unicode_mod(PyObject *v, PyObject *w)
6434{
6435 if (!PyUnicode_Check(v)) {
6436 Py_INCREF(Py_NotImplemented);
6437 return Py_NotImplemented;
6438 }
6439 return PyUnicode_Format(v, w);
6440}
6441
6442static PyNumberMethods unicode_as_number = {
6443 0, /*nb_add*/
6444 0, /*nb_subtract*/
6445 0, /*nb_multiply*/
6446 0, /*nb_divide*/
6447 unicode_mod, /*nb_remainder*/
6448};
6449
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006451 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006452 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006453 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6454 (ssizeargfunc) unicode_getitem, /* sq_item */
6455 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 0, /* sq_ass_item */
6457 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006458 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459};
6460
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006461#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6462
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006463static PyObject*
6464unicode_subscript(PyUnicodeObject* self, PyObject* item)
6465{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006466 PyNumberMethods *nb = item->ob_type->tp_as_number;
6467 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6468 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006469 if (i == -1 && PyErr_Occurred())
6470 return NULL;
6471 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006472 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006473 return unicode_getitem(self, i);
6474 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006475 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006476 Py_UNICODE* source_buf;
6477 Py_UNICODE* result_buf;
6478 PyObject* result;
6479
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006480 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006481 &start, &stop, &step, &slicelength) < 0) {
6482 return NULL;
6483 }
6484
6485 if (slicelength <= 0) {
6486 return PyUnicode_FromUnicode(NULL, 0);
6487 } else {
6488 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006489 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6490 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006491
6492 if (result_buf == NULL)
6493 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006494
6495 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6496 result_buf[i] = source_buf[cur];
6497 }
Tim Petersced69f82003-09-16 20:30:58 +00006498
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006499 result = PyUnicode_FromUnicode(result_buf, slicelength);
6500 PyMem_FREE(result_buf);
6501 return result;
6502 }
6503 } else {
6504 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6505 return NULL;
6506 }
6507}
6508
6509static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006510 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006511 (binaryfunc)unicode_subscript, /* mp_subscript */
6512 (objobjargproc)0, /* mp_ass_subscript */
6513};
6514
Martin v. Löwis18e16552006-02-15 17:27:45 +00006515static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006517 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 const void **ptr)
6519{
6520 if (index != 0) {
6521 PyErr_SetString(PyExc_SystemError,
6522 "accessing non-existent unicode segment");
6523 return -1;
6524 }
6525 *ptr = (void *) self->str;
6526 return PyUnicode_GET_DATA_SIZE(self);
6527}
6528
Martin v. Löwis18e16552006-02-15 17:27:45 +00006529static Py_ssize_t
6530unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 const void **ptr)
6532{
6533 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006534 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 return -1;
6536}
6537
6538static int
6539unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006540 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541{
6542 if (lenp)
6543 *lenp = PyUnicode_GET_DATA_SIZE(self);
6544 return 1;
6545}
6546
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006547static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006549 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 const void **ptr)
6551{
6552 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 if (index != 0) {
6555 PyErr_SetString(PyExc_SystemError,
6556 "accessing non-existent unicode segment");
6557 return -1;
6558 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006559 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 if (str == NULL)
6561 return -1;
6562 *ptr = (void *) PyString_AS_STRING(str);
6563 return PyString_GET_SIZE(str);
6564}
6565
6566/* Helpers for PyUnicode_Format() */
6567
6568static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006571 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 if (argidx < arglen) {
6573 (*p_argidx)++;
6574 if (arglen < 0)
6575 return args;
6576 else
6577 return PyTuple_GetItem(args, argidx);
6578 }
6579 PyErr_SetString(PyExc_TypeError,
6580 "not enough arguments for format string");
6581 return NULL;
6582}
6583
/* Bit flags passed to the formatting helpers in their `flags` argument.
   F_ALT is the one tested by formatfloat()/formatint() to add '#' (or an
   explicit 0x/0X prefix) to the generated C format. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6589
Martin v. Löwis18e16552006-02-15 17:27:45 +00006590static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006591strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006593 register Py_ssize_t i;
6594 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 for (i = len - 1; i >= 0; i--)
6596 buffer[i] = (Py_UNICODE) charbuffer[i];
6597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 return len;
6599}
6600
Neal Norwitzfc76d632006-01-10 06:03:13 +00006601static int
6602doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6603{
Tim Peters15231542006-02-16 01:08:01 +00006604 Py_ssize_t result;
6605
Neal Norwitzfc76d632006-01-10 06:03:13 +00006606 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006607 result = strtounicode(buffer, (char *)buffer);
6608 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006609}
6610
6611static int
6612longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6613{
Tim Peters15231542006-02-16 01:08:01 +00006614 Py_ssize_t result;
6615
Neal Norwitzfc76d632006-01-10 06:03:13 +00006616 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006617 result = strtounicode(buffer, (char *)buffer);
6618 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006619}
6620
Guido van Rossum078151d2002-08-11 04:24:12 +00006621/* XXX To save some code duplication, formatfloat/long/int could have been
6622 shared with stringobject.c, converting from 8-bit to Unicode after the
6623 formatting is done. */
6624
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625static int
6626formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006627 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 int flags,
6629 int prec,
6630 int type,
6631 PyObject *v)
6632{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006633 /* fmt = '%#.' + `prec` + `type`
6634 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 char fmt[20];
6636 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006637
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 x = PyFloat_AsDouble(v);
6639 if (x == -1.0 && PyErr_Occurred())
6640 return -1;
6641 if (prec < 0)
6642 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6644 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006645 /* Worst case length calc to ensure no buffer overrun:
6646
6647 'g' formats:
6648 fmt = %#.<prec>g
6649 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6650 for any double rep.)
6651 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6652
6653 'f' formats:
6654 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6655 len = 1 + 50 + 1 + prec = 52 + prec
6656
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006657 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006658 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006659
6660 */
6661 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6662 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006663 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006664 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006665 return -1;
6666 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006667 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6668 (flags&F_ALT) ? "#" : "",
6669 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006670 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671}
6672
Tim Peters38fd5b62000-09-21 05:43:11 +00006673static PyObject*
6674formatlong(PyObject *val, int flags, int prec, int type)
6675{
6676 char *buf;
6677 int i, len;
6678 PyObject *str; /* temporary string object. */
6679 PyUnicodeObject *result;
6680
6681 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6682 if (!str)
6683 return NULL;
6684 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006685 if (!result) {
6686 Py_DECREF(str);
6687 return NULL;
6688 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006689 for (i = 0; i < len; i++)
6690 result->str[i] = buf[i];
6691 result->str[len] = 0;
6692 Py_DECREF(str);
6693 return (PyObject*)result;
6694}
6695
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696static int
6697formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006698 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 int flags,
6700 int prec,
6701 int type,
6702 PyObject *v)
6703{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006704 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006705 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6706 * + 1 + 1
6707 * = 24
6708 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006709 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006710 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 long x;
6712
6713 x = PyInt_AsLong(v);
6714 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006715 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006716 if (x < 0 && type == 'u') {
6717 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006718 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006719 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6720 sign = "-";
6721 else
6722 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006724 prec = 1;
6725
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006726 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6727 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006728 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006729 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006730 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006731 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006732 return -1;
6733 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006734
6735 if ((flags & F_ALT) &&
6736 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006737 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006738 * of issues that cause pain:
6739 * - when 0 is being converted, the C standard leaves off
6740 * the '0x' or '0X', which is inconsistent with other
6741 * %#x/%#X conversions and inconsistent with Python's
6742 * hex() function
6743 * - there are platforms that violate the standard and
6744 * convert 0 with the '0x' or '0X'
6745 * (Metrowerks, Compaq Tru64)
6746 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006747 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006748 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006749 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006750 * We can achieve the desired consistency by inserting our
6751 * own '0x' or '0X' prefix, and substituting %x/%X in place
6752 * of %#x/%#X.
6753 *
6754 * Note that this is the same approach as used in
6755 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006756 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006757 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6758 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006759 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006760 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006761 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6762 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006763 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006764 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006765 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006766 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006767 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006768 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769}
6770
6771static int
6772formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006773 size_t buflen,
6774 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006776 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006777 if (PyUnicode_Check(v)) {
6778 if (PyUnicode_GET_SIZE(v) != 1)
6779 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006783 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006784 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006785 goto onError;
6786 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6787 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788
6789 else {
6790 /* Integer input truncated to a character */
6791 long x;
6792 x = PyInt_AsLong(v);
6793 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006794 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006795#ifdef Py_UNICODE_WIDE
6796 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006797 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006798 "%c arg not in range(0x110000) "
6799 "(wide Python build)");
6800 return -1;
6801 }
6802#else
6803 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006804 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006805 "%c arg not in range(0x10000) "
6806 "(narrow Python build)");
6807 return -1;
6808 }
6809#endif
6810 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 }
6812 buf[1] = '\0';
6813 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006814
6815 onError:
6816 PyErr_SetString(PyExc_TypeError,
6817 "%c requires int or char");
6818 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819}
6820
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
6830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831PyObject *PyUnicode_Format(PyObject *format,
6832 PyObject *args)
6833{
6834 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006835 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 int args_owned = 0;
6837 PyUnicodeObject *result = NULL;
6838 PyObject *dict = NULL;
6839 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006840
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 if (format == NULL || args == NULL) {
6842 PyErr_BadInternalCall();
6843 return NULL;
6844 }
6845 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006846 if (uformat == NULL)
6847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 fmt = PyUnicode_AS_UNICODE(uformat);
6849 fmtcnt = PyUnicode_GET_SIZE(uformat);
6850
6851 reslen = rescnt = fmtcnt + 100;
6852 result = _PyUnicode_New(reslen);
6853 if (result == NULL)
6854 goto onError;
6855 res = PyUnicode_AS_UNICODE(result);
6856
6857 if (PyTuple_Check(args)) {
6858 arglen = PyTuple_Size(args);
6859 argidx = 0;
6860 }
6861 else {
6862 arglen = -1;
6863 argidx = -2;
6864 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006865 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6866 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 dict = args;
6868
6869 while (--fmtcnt >= 0) {
6870 if (*fmt != '%') {
6871 if (--rescnt < 0) {
6872 rescnt = fmtcnt + 100;
6873 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006874 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006875 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6877 --rescnt;
6878 }
6879 *res++ = *fmt++;
6880 }
6881 else {
6882 /* Got a format specifier */
6883 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006884 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 Py_UNICODE c = '\0';
6887 Py_UNICODE fill;
6888 PyObject *v = NULL;
6889 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006890 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006892 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006893 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894
6895 fmt++;
6896 if (*fmt == '(') {
6897 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 PyObject *key;
6900 int pcount = 1;
6901
6902 if (dict == NULL) {
6903 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006904 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 goto onError;
6906 }
6907 ++fmt;
6908 --fmtcnt;
6909 keystart = fmt;
6910 /* Skip over balanced parentheses */
6911 while (pcount > 0 && --fmtcnt >= 0) {
6912 if (*fmt == ')')
6913 --pcount;
6914 else if (*fmt == '(')
6915 ++pcount;
6916 fmt++;
6917 }
6918 keylen = fmt - keystart - 1;
6919 if (fmtcnt < 0 || pcount > 0) {
6920 PyErr_SetString(PyExc_ValueError,
6921 "incomplete format key");
6922 goto onError;
6923 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006924#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006925 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 then looked up since Python uses strings to hold
6927 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006928 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 key = PyUnicode_EncodeUTF8(keystart,
6930 keylen,
6931 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006932#else
6933 key = PyUnicode_FromUnicode(keystart, keylen);
6934#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 if (key == NULL)
6936 goto onError;
6937 if (args_owned) {
6938 Py_DECREF(args);
6939 args_owned = 0;
6940 }
6941 args = PyObject_GetItem(dict, key);
6942 Py_DECREF(key);
6943 if (args == NULL) {
6944 goto onError;
6945 }
6946 args_owned = 1;
6947 arglen = -1;
6948 argidx = -2;
6949 }
6950 while (--fmtcnt >= 0) {
6951 switch (c = *fmt++) {
6952 case '-': flags |= F_LJUST; continue;
6953 case '+': flags |= F_SIGN; continue;
6954 case ' ': flags |= F_BLANK; continue;
6955 case '#': flags |= F_ALT; continue;
6956 case '0': flags |= F_ZERO; continue;
6957 }
6958 break;
6959 }
6960 if (c == '*') {
6961 v = getnextarg(args, arglen, &argidx);
6962 if (v == NULL)
6963 goto onError;
6964 if (!PyInt_Check(v)) {
6965 PyErr_SetString(PyExc_TypeError,
6966 "* wants int");
6967 goto onError;
6968 }
6969 width = PyInt_AsLong(v);
6970 if (width < 0) {
6971 flags |= F_LJUST;
6972 width = -width;
6973 }
6974 if (--fmtcnt >= 0)
6975 c = *fmt++;
6976 }
6977 else if (c >= '0' && c <= '9') {
6978 width = c - '0';
6979 while (--fmtcnt >= 0) {
6980 c = *fmt++;
6981 if (c < '0' || c > '9')
6982 break;
6983 if ((width*10) / 10 != width) {
6984 PyErr_SetString(PyExc_ValueError,
6985 "width too big");
6986 goto onError;
6987 }
6988 width = width*10 + (c - '0');
6989 }
6990 }
6991 if (c == '.') {
6992 prec = 0;
6993 if (--fmtcnt >= 0)
6994 c = *fmt++;
6995 if (c == '*') {
6996 v = getnextarg(args, arglen, &argidx);
6997 if (v == NULL)
6998 goto onError;
6999 if (!PyInt_Check(v)) {
7000 PyErr_SetString(PyExc_TypeError,
7001 "* wants int");
7002 goto onError;
7003 }
7004 prec = PyInt_AsLong(v);
7005 if (prec < 0)
7006 prec = 0;
7007 if (--fmtcnt >= 0)
7008 c = *fmt++;
7009 }
7010 else if (c >= '0' && c <= '9') {
7011 prec = c - '0';
7012 while (--fmtcnt >= 0) {
7013 c = Py_CHARMASK(*fmt++);
7014 if (c < '0' || c > '9')
7015 break;
7016 if ((prec*10) / 10 != prec) {
7017 PyErr_SetString(PyExc_ValueError,
7018 "prec too big");
7019 goto onError;
7020 }
7021 prec = prec*10 + (c - '0');
7022 }
7023 }
7024 } /* prec */
7025 if (fmtcnt >= 0) {
7026 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 if (--fmtcnt >= 0)
7028 c = *fmt++;
7029 }
7030 }
7031 if (fmtcnt < 0) {
7032 PyErr_SetString(PyExc_ValueError,
7033 "incomplete format");
7034 goto onError;
7035 }
7036 if (c != '%') {
7037 v = getnextarg(args, arglen, &argidx);
7038 if (v == NULL)
7039 goto onError;
7040 }
7041 sign = 0;
7042 fill = ' ';
7043 switch (c) {
7044
7045 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007046 pbuf = formatbuf;
7047 /* presume that buffer length is at least 1 */
7048 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 len = 1;
7050 break;
7051
7052 case 's':
7053 case 'r':
7054 if (PyUnicode_Check(v) && c == 's') {
7055 temp = v;
7056 Py_INCREF(temp);
7057 }
7058 else {
7059 PyObject *unicode;
7060 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007061 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 else
7063 temp = PyObject_Repr(v);
7064 if (temp == NULL)
7065 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007066 if (PyUnicode_Check(temp))
7067 /* nothing to do */;
7068 else if (PyString_Check(temp)) {
7069 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007070 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007072 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007074 Py_DECREF(temp);
7075 temp = unicode;
7076 if (temp == NULL)
7077 goto onError;
7078 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007079 else {
7080 Py_DECREF(temp);
7081 PyErr_SetString(PyExc_TypeError,
7082 "%s argument has non-string str()");
7083 goto onError;
7084 }
7085 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007086 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 len = PyUnicode_GET_SIZE(temp);
7088 if (prec >= 0 && len > prec)
7089 len = prec;
7090 break;
7091
7092 case 'i':
7093 case 'd':
7094 case 'u':
7095 case 'o':
7096 case 'x':
7097 case 'X':
7098 if (c == 'i')
7099 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007100 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007101 temp = formatlong(v, flags, prec, c);
7102 if (!temp)
7103 goto onError;
7104 pbuf = PyUnicode_AS_UNICODE(temp);
7105 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007106 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007108 else {
7109 pbuf = formatbuf;
7110 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7111 flags, prec, c, v);
7112 if (len < 0)
7113 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007114 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007115 }
7116 if (flags & F_ZERO)
7117 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 break;
7119
7120 case 'e':
7121 case 'E':
7122 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007123 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 case 'g':
7125 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007126 if (c == 'F')
7127 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007128 pbuf = formatbuf;
7129 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7130 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 if (len < 0)
7132 goto onError;
7133 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007134 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 fill = '0';
7136 break;
7137
7138 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007139 pbuf = formatbuf;
7140 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 if (len < 0)
7142 goto onError;
7143 break;
7144
7145 default:
7146 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007147 "unsupported format character '%c' (0x%x) "
7148 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007149 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007150 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007151 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 goto onError;
7153 }
7154 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007155 if (*pbuf == '-' || *pbuf == '+') {
7156 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 len--;
7158 }
7159 else if (flags & F_SIGN)
7160 sign = '+';
7161 else if (flags & F_BLANK)
7162 sign = ' ';
7163 else
7164 sign = 0;
7165 }
7166 if (width < len)
7167 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007168 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 reslen -= rescnt;
7170 rescnt = width + fmtcnt + 100;
7171 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007172 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007173 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007174 PyErr_NoMemory();
7175 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007176 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007177 if (_PyUnicode_Resize(&result, reslen) < 0) {
7178 Py_XDECREF(temp);
7179 goto onError;
7180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 res = PyUnicode_AS_UNICODE(result)
7182 + reslen - rescnt;
7183 }
7184 if (sign) {
7185 if (fill != ' ')
7186 *res++ = sign;
7187 rescnt--;
7188 if (width > len)
7189 width--;
7190 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007191 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7192 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007193 assert(pbuf[1] == c);
7194 if (fill != ' ') {
7195 *res++ = *pbuf++;
7196 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007197 }
Tim Petersfff53252001-04-12 18:38:48 +00007198 rescnt -= 2;
7199 width -= 2;
7200 if (width < 0)
7201 width = 0;
7202 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 if (width > len && !(flags & F_LJUST)) {
7205 do {
7206 --rescnt;
7207 *res++ = fill;
7208 } while (--width > len);
7209 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007210 if (fill == ' ') {
7211 if (sign)
7212 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007213 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007214 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007215 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007216 *res++ = *pbuf++;
7217 *res++ = *pbuf++;
7218 }
7219 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007220 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 res += len;
7222 rescnt -= len;
7223 while (--width >= len) {
7224 --rescnt;
7225 *res++ = ' ';
7226 }
7227 if (dict && (argidx < arglen) && c != '%') {
7228 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007229 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007230 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 goto onError;
7232 }
7233 Py_XDECREF(temp);
7234 } /* '%' */
7235 } /* until end */
7236 if (argidx < arglen && !dict) {
7237 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007238 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 goto onError;
7240 }
7241
Thomas Woutersa96affe2006-03-12 00:29:36 +00007242 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244 if (args_owned) {
7245 Py_DECREF(args);
7246 }
7247 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 return (PyObject *)result;
7249
7250 onError:
7251 Py_XDECREF(result);
7252 Py_DECREF(uformat);
7253 if (args_owned) {
7254 Py_DECREF(args);
7255 }
7256 return NULL;
7257}
7258
7259static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007260 (readbufferproc) unicode_buffer_getreadbuf,
7261 (writebufferproc) unicode_buffer_getwritebuf,
7262 (segcountproc) unicode_buffer_getsegcount,
7263 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264};
7265
Jeremy Hylton938ace62002-07-17 16:30:39 +00007266static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007267unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7268
Tim Peters6d6c1a32001-08-02 04:15:00 +00007269static PyObject *
7270unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7271{
7272 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007273 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007274 char *encoding = NULL;
7275 char *errors = NULL;
7276
Guido van Rossume023fe02001-08-30 03:12:59 +00007277 if (type != &PyUnicode_Type)
7278 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007279 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7280 kwlist, &x, &encoding, &errors))
7281 return NULL;
7282 if (x == NULL)
7283 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007284 if (encoding == NULL && errors == NULL)
7285 return PyObject_Unicode(x);
7286 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007287 return PyUnicode_FromEncodedObject(x, encoding, errors);
7288}
7289
Guido van Rossume023fe02001-08-30 03:12:59 +00007290static PyObject *
7291unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7292{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007293 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007294 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007295
7296 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7297 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7298 if (tmp == NULL)
7299 return NULL;
7300 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007301 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007302 if (pnew == NULL) {
7303 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007304 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007305 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007306 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7307 if (pnew->str == NULL) {
7308 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007309 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007310 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007311 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007312 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007313 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7314 pnew->length = n;
7315 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007316 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007317 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007318}
7319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007320PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007321"unicode(string [, encoding[, errors]]) -> object\n\
7322\n\
7323Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007324encoding defaults to the current default string encoding.\n\
7325errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007326
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327PyTypeObject PyUnicode_Type = {
7328 PyObject_HEAD_INIT(&PyType_Type)
7329 0, /* ob_size */
7330 "unicode", /* tp_name */
7331 sizeof(PyUnicodeObject), /* tp_size */
7332 0, /* tp_itemsize */
7333 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007334 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007336 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 0, /* tp_setattr */
7338 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007339 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007340 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007342 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 (hashfunc) unicode_hash, /* tp_hash*/
7344 0, /* tp_call*/
7345 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007346 PyObject_GenericGetAttr, /* tp_getattro */
7347 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007349 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7350 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007351 unicode_doc, /* tp_doc */
7352 0, /* tp_traverse */
7353 0, /* tp_clear */
7354 0, /* tp_richcompare */
7355 0, /* tp_weaklistoffset */
7356 0, /* tp_iter */
7357 0, /* tp_iternext */
7358 unicode_methods, /* tp_methods */
7359 0, /* tp_members */
7360 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007361 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007362 0, /* tp_dict */
7363 0, /* tp_descr_get */
7364 0, /* tp_descr_set */
7365 0, /* tp_dictoffset */
7366 0, /* tp_init */
7367 0, /* tp_alloc */
7368 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007369 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370};
7371
7372/* Initialize the Unicode implementation */
7373
Thomas Wouters78890102000-07-22 19:25:51 +00007374void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007376 int i;
7377
Fred Drakee4315f52000-05-09 19:53:39 +00007378 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007379 unicode_freelist = NULL;
7380 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007382 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007383 for (i = 0; i < 256; i++)
7384 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007385 if (PyType_Ready(&PyUnicode_Type) < 0)
7386 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387}
7388
7389/* Finalize the Unicode implementation */
7390
7391void
Thomas Wouters78890102000-07-22 19:25:51 +00007392_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007394 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007395 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007397 Py_XDECREF(unicode_empty);
7398 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007399
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007400 for (i = 0; i < 256; i++) {
7401 if (unicode_latin1[i]) {
7402 Py_DECREF(unicode_latin1[i]);
7403 unicode_latin1[i] = NULL;
7404 }
7405 }
7406
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007407 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408 PyUnicodeObject *v = u;
7409 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007410 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007411 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007412 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007413 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007415 unicode_freelist = NULL;
7416 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007418
Anthony Baxterac6bd462006-04-13 02:06:09 +00007419#ifdef __cplusplus
7420}
7421#endif
7422
7423
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007424/*
7425Local variables:
7426c-basic-offset: 4
7427indent-tabs-mode: nil
7428End:
7429*/