blob: 30ae6f011ff07375803d7d48b2d655c7345ae457 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Anthony Baxterac6bd462006-04-13 02:06:09 +000086
87#ifdef __cplusplus
88extern "C" {
89#endif
90
Guido van Rossumd57fd912000-03-10 22:53:23 +000091/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000092static PyUnicodeObject *unicode_freelist;
93static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000094
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000095/* The empty Unicode object is shared to improve performance. */
96static PyUnicodeObject *unicode_empty;
97
98/* Single character Unicode strings in the Latin-1 range are being
99 shared as well. */
100static PyUnicodeObject *unicode_latin1[256];
101
Fred Drakee4315f52000-05-09 19:53:39 +0000102/* Default encoding to use and assume when NULL is passed as encoding
103 parameter; it is initialized by _PyUnicode_Init().
104
105 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000106 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000107
108*/
Fred Drakee4315f52000-05-09 19:53:39 +0000109static char unicode_default_encoding[100];
110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Guido van Rossumd57fd912000-03-10 22:53:23 +0000123/* --- Unicode Object ----------------------------------------------------- */
124
125static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000127 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000128{
129 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000130
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000131 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000132 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000133 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000134
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000135 /* Resizing shared object (unicode_empty or single character
136 objects) in-place is not allowed. Use PyUnicode_Resize()
137 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000138 if (unicode == unicode_empty ||
139 (unicode->length == 1 &&
140 /* MvL said unicode->str[] may be signed. Python generally assumes
141 * an int contains at least 32 bits, and we don't use more than
142 * 32 bits even in a UCS4 build, so casting to unsigned int should
143 * be correct.
144 */
145 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000146 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000148 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000149 return -1;
150 }
151
152 /* We allocate one more byte to make sure the string is
153 Ux0000 terminated -- XXX is this needed ? */
154 oldstr = unicode->str;
155 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
156 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000157 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000158 PyErr_NoMemory();
159 return -1;
160 }
161 unicode->str[length] = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000162 assert(length < INT_MAX);
163 unicode->length = (int)length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000165 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000166 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000167 if (unicode->defenc) {
168 Py_DECREF(unicode->defenc);
169 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000170 }
171 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000172
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173 return 0;
174}
175
176/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000177 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178
179 XXX This allocator could further be enhanced by assuring that the
180 free list never reduces its size below 1.
181
182*/
183
184static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186{
187 register PyUnicodeObject *unicode;
188
Tim Petersced69f82003-09-16 20:30:58 +0000189 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 if (length == 0 && unicode_empty != NULL) {
191 Py_INCREF(unicode_empty);
192 return unicode_empty;
193 }
194
195 /* Unicode freelist & memory allocation */
196 if (unicode_freelist) {
197 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000198 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000201 /* Keep-Alive optimization: we only upsize the buffer,
202 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000203 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000204 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000205 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000209 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000211 }
212 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000215 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 if (unicode == NULL)
217 return NULL;
218 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
219 }
220
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000221 if (!unicode->str) {
222 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000224 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000225 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000226 * the caller fails before initializing str -- unicode_resize()
227 * reads str[0], and the Keep-Alive optimization can keep memory
228 * allocated for str alive across a call to unicode_dealloc(unicode).
229 * We don't want unicode_resize to read uninitialized memory in
230 * that case.
231 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000232 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str[length] = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000234 assert(length<INT_MAX);
235 unicode->length = (int)length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000239
240 onError:
241 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000242 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244}
245
246static
Guido van Rossum9475a232001-10-05 20:51:39 +0000247void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 if (PyUnicode_CheckExact(unicode) &&
250 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000251 /* Keep-Alive optimization */
252 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000253 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 unicode->str = NULL;
255 unicode->length = 0;
256 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000257 if (unicode->defenc) {
258 Py_DECREF(unicode->defenc);
259 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 }
261 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 *(PyUnicodeObject **)unicode = unicode_freelist;
263 unicode_freelist = unicode;
264 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 }
266 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000267 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000268 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000269 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 }
271}
272
Martin v. Löwis18e16552006-02-15 17:27:45 +0000273int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000274{
275 register PyUnicodeObject *v;
276
277 /* Argument checks */
278 if (unicode == NULL) {
279 PyErr_BadInternalCall();
280 return -1;
281 }
282 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000283 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284 PyErr_BadInternalCall();
285 return -1;
286 }
287
288 /* Resizing unicode_empty and single character objects is not
289 possible since these are being shared. We simply return a fresh
290 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000291 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 (v == unicode_empty || v->length == 1)) {
293 PyUnicodeObject *w = _PyUnicode_New(length);
294 if (w == NULL)
295 return -1;
296 Py_UNICODE_COPY(w->str, v->str,
297 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000298 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000299 *unicode = (PyObject *)w;
300 return 0;
301 }
302
303 /* Note that we don't have to modify *unicode for unshared Unicode
304 objects, since we can modify them in-place. */
305 return unicode_resize(v, length);
306}
307
/* Internal convenience wrapper around PyUnicode_Resize() that accepts
   the address of any object pointer variable.  For use in
   unicodeobject.c only ! */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize((PyObject **)(unicodevar), (length))
311
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000313 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314{
315 PyUnicodeObject *unicode;
316
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000317 /* If the Unicode data is known at construction time, we can apply
318 some optimizations which share commonly used objects. */
319 if (u != NULL) {
320
321 /* Optimization for empty strings */
322 if (size == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return (PyObject *)unicode_empty;
325 }
326
327 /* Single character Unicode objects in the Latin-1 range are
328 shared when using this constructor */
329 if (size == 1 && *u < 256) {
330 unicode = unicode_latin1[*u];
331 if (!unicode) {
332 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000333 if (!unicode)
334 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000335 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000336 unicode_latin1[*u] = unicode;
337 }
338 Py_INCREF(unicode);
339 return (PyObject *)unicode;
340 }
341 }
Tim Petersced69f82003-09-16 20:30:58 +0000342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 unicode = _PyUnicode_New(size);
344 if (!unicode)
345 return NULL;
346
347 /* Copy the Unicode data into the new object */
348 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350
351 return (PyObject *)unicode;
352}
353
354#ifdef HAVE_WCHAR_H
355
356PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000357 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358{
359 PyUnicodeObject *unicode;
360
361 if (w == NULL) {
362 PyErr_BadInternalCall();
363 return NULL;
364 }
365
366 unicode = _PyUnicode_New(size);
367 if (!unicode)
368 return NULL;
369
370 /* Copy the wchar_t data into the new object */
371#ifdef HAVE_USABLE_WCHAR_T
372 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000373#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 {
375 register Py_UNICODE *u;
376 register int i;
377 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000378 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 *u++ = *w++;
380 }
381#endif
382
383 return (PyObject *)unicode;
384}
385
Martin v. Löwis18e16552006-02-15 17:27:45 +0000386Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
387 wchar_t *w,
388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 if (unicode == NULL) {
391 PyErr_BadInternalCall();
392 return -1;
393 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000394
395 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000397 size = PyUnicode_GET_SIZE(unicode) + 1;
398
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399#ifdef HAVE_USABLE_WCHAR_T
400 memcpy(w, unicode->str, size * sizeof(wchar_t));
401#else
402 {
403 register Py_UNICODE *u;
404 register int i;
405 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000406 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000407 *w++ = *u++;
408 }
409#endif
410
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000411 if (size > PyUnicode_GET_SIZE(unicode))
412 return PyUnicode_GET_SIZE(unicode);
413 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000414 return size;
415}
416
417#endif
418
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000419PyObject *PyUnicode_FromOrdinal(int ordinal)
420{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000421 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000422
423#ifdef Py_UNICODE_WIDE
424 if (ordinal < 0 || ordinal > 0x10ffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x110000) "
427 "(wide Python build)");
428 return NULL;
429 }
430#else
431 if (ordinal < 0 || ordinal > 0xffff) {
432 PyErr_SetString(PyExc_ValueError,
433 "unichr() arg not in range(0x10000) "
434 "(narrow Python build)");
435 return NULL;
436 }
437#endif
438
Hye-Shik Chang40574832004-04-06 07:24:51 +0000439 s[0] = (Py_UNICODE)ordinal;
440 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000441}
442
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443PyObject *PyUnicode_FromObject(register PyObject *obj)
444{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000445 /* XXX Perhaps we should make this API an alias of
446 PyObject_Unicode() instead ?! */
447 if (PyUnicode_CheckExact(obj)) {
448 Py_INCREF(obj);
449 return obj;
450 }
451 if (PyUnicode_Check(obj)) {
452 /* For a Unicode subtype that's not a Unicode object,
453 return a true Unicode object with the same data. */
454 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
455 PyUnicode_GET_SIZE(obj));
456 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000457 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
458}
459
460PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
461 const char *encoding,
462 const char *errors)
463{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000464 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000465 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000467
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468 if (obj == NULL) {
469 PyErr_BadInternalCall();
470 return NULL;
471 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000472
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000473#if 0
474 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000475 that no encodings is given and then redirect to
476 PyObject_Unicode() which then applies the additional logic for
477 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 NOTE: This API should really only be used for object which
480 represent *encoded* Unicode !
481
482 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 if (PyUnicode_Check(obj)) {
484 if (encoding) {
485 PyErr_SetString(PyExc_TypeError,
486 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000489 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491#else
492 if (PyUnicode_Check(obj)) {
493 PyErr_SetString(PyExc_TypeError,
494 "decoding Unicode is not supported");
495 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000496 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000497#endif
498
499 /* Coerce object */
500 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 s = PyString_AS_STRING(obj);
502 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000504 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
505 /* Overwrite the error message with something more useful in
506 case of a TypeError. */
507 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000508 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000509 "coercing to Unicode: need string or buffer, "
510 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000511 obj->ob_type->tp_name);
512 goto onError;
513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000515 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516 if (len == 0) {
517 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519 }
Tim Petersced69f82003-09-16 20:30:58 +0000520 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000521 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000522
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523 return v;
524
525 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000527}
528
529PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000530 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531 const char *encoding,
532 const char *errors)
533{
534 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000535
536 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000537 encoding = PyUnicode_GetDefaultEncoding();
538
539 /* Shortcuts for common default encodings */
540 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000542 else if (strcmp(encoding, "latin-1") == 0)
543 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000544#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
545 else if (strcmp(encoding, "mbcs") == 0)
546 return PyUnicode_DecodeMBCS(s, size, errors);
547#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000548 else if (strcmp(encoding, "ascii") == 0)
549 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550
551 /* Decode via the codec registry */
552 buffer = PyBuffer_FromMemory((void *)s, size);
553 if (buffer == NULL)
554 goto onError;
555 unicode = PyCodec_Decode(buffer, encoding, errors);
556 if (unicode == NULL)
557 goto onError;
558 if (!PyUnicode_Check(unicode)) {
559 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000560 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 unicode->ob_type->tp_name);
562 Py_DECREF(unicode);
563 goto onError;
564 }
565 Py_DECREF(buffer);
566 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000567
Guido van Rossumd57fd912000-03-10 22:53:23 +0000568 onError:
569 Py_XDECREF(buffer);
570 return NULL;
571}
572
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000573PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
574 const char *encoding,
575 const char *errors)
576{
577 PyObject *v;
578
579 if (!PyUnicode_Check(unicode)) {
580 PyErr_BadArgument();
581 goto onError;
582 }
583
584 if (encoding == NULL)
585 encoding = PyUnicode_GetDefaultEncoding();
586
587 /* Decode via the codec registry */
588 v = PyCodec_Decode(unicode, encoding, errors);
589 if (v == NULL)
590 goto onError;
591 return v;
592
593 onError:
594 return NULL;
595}
596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000598 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599 const char *encoding,
600 const char *errors)
601{
602 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604 unicode = PyUnicode_FromUnicode(s, size);
605 if (unicode == NULL)
606 return NULL;
607 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
608 Py_DECREF(unicode);
609 return v;
610}
611
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000612PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
613 const char *encoding,
614 const char *errors)
615{
616 PyObject *v;
617
618 if (!PyUnicode_Check(unicode)) {
619 PyErr_BadArgument();
620 goto onError;
621 }
622
623 if (encoding == NULL)
624 encoding = PyUnicode_GetDefaultEncoding();
625
626 /* Encode via the codec registry */
627 v = PyCodec_Encode(unicode, encoding, errors);
628 if (v == NULL)
629 goto onError;
630 return v;
631
632 onError:
633 return NULL;
634}
635
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
637 const char *encoding,
638 const char *errors)
639{
640 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000641
Guido van Rossumd57fd912000-03-10 22:53:23 +0000642 if (!PyUnicode_Check(unicode)) {
643 PyErr_BadArgument();
644 goto onError;
645 }
Fred Drakee4315f52000-05-09 19:53:39 +0000646
Tim Petersced69f82003-09-16 20:30:58 +0000647 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000648 encoding = PyUnicode_GetDefaultEncoding();
649
650 /* Shortcuts for common default encodings */
651 if (errors == NULL) {
652 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000653 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000654 else if (strcmp(encoding, "latin-1") == 0)
655 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000656#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
657 else if (strcmp(encoding, "mbcs") == 0)
658 return PyUnicode_AsMBCSString(unicode);
659#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000660 else if (strcmp(encoding, "ascii") == 0)
661 return PyUnicode_AsASCIIString(unicode);
662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663
664 /* Encode via the codec registry */
665 v = PyCodec_Encode(unicode, encoding, errors);
666 if (v == NULL)
667 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668 if (!PyString_Check(v)) {
669 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000670 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 v->ob_type->tp_name);
672 Py_DECREF(v);
673 goto onError;
674 }
675 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000676
Guido van Rossumd57fd912000-03-10 22:53:23 +0000677 onError:
678 return NULL;
679}
680
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000681PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
682 const char *errors)
683{
684 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
685
686 if (v)
687 return v;
688 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
689 if (v && errors == NULL)
690 ((PyUnicodeObject *)unicode)->defenc = v;
691 return v;
692}
693
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
695{
696 if (!PyUnicode_Check(unicode)) {
697 PyErr_BadArgument();
698 goto onError;
699 }
700 return PyUnicode_AS_UNICODE(unicode);
701
702 onError:
703 return NULL;
704}
705
Martin v. Löwis18e16552006-02-15 17:27:45 +0000706Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000707{
708 if (!PyUnicode_Check(unicode)) {
709 PyErr_BadArgument();
710 goto onError;
711 }
712 return PyUnicode_GET_SIZE(unicode);
713
714 onError:
715 return -1;
716}
717
Thomas Wouters78890102000-07-22 19:25:51 +0000718const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000719{
720 return unicode_default_encoding;
721}
722
723int PyUnicode_SetDefaultEncoding(const char *encoding)
724{
725 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000726
Fred Drakee4315f52000-05-09 19:53:39 +0000727 /* Make sure the encoding is valid. As side effect, this also
728 loads the encoding into the codec registry cache. */
729 v = _PyCodec_Lookup(encoding);
730 if (v == NULL)
731 goto onError;
732 Py_DECREF(v);
733 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000734 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000735 sizeof(unicode_default_encoding));
736 return 0;
737
738 onError:
739 return -1;
740}
741
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000742/* error handling callback helper:
743 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000744 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000745 and adjust various state variables.
746 return 0 on success, -1 on error
747*/
748
749static
750int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
751 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
753 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000754{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000755 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000756
757 PyObject *restuple = NULL;
758 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000759 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
760 Py_ssize_t requiredsize;
761 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000762 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000763 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000764 int res = -1;
765
766 if (*errorHandler == NULL) {
767 *errorHandler = PyCodec_LookupError(errors);
768 if (*errorHandler == NULL)
769 goto onError;
770 }
771
772 if (*exceptionObject == NULL) {
773 *exceptionObject = PyUnicodeDecodeError_Create(
774 encoding, input, insize, *startinpos, *endinpos, reason);
775 if (*exceptionObject == NULL)
776 goto onError;
777 }
778 else {
779 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
780 goto onError;
781 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
782 goto onError;
783 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
784 goto onError;
785 }
786
787 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
788 if (restuple == NULL)
789 goto onError;
790 if (!PyTuple_Check(restuple)) {
791 PyErr_Format(PyExc_TypeError, &argparse[4]);
792 goto onError;
793 }
794 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
795 goto onError;
796 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000797 newpos = insize+newpos;
798 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000799 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000800 goto onError;
801 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 /* need more space? (at least enough for what we
804 have+the replacement+the rest of the string (starting
805 at the new input position), so we won't have to check space
806 when there are no errors in the rest of the string) */
807 repptr = PyUnicode_AS_UNICODE(repunicode);
808 repsize = PyUnicode_GET_SIZE(repunicode);
809 requiredsize = *outpos + repsize + insize-newpos;
810 if (requiredsize > outsize) {
811 if (requiredsize<2*outsize)
812 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000813 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000814 goto onError;
815 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
816 }
817 *endinpos = newpos;
818 *inptr = input + newpos;
819 Py_UNICODE_COPY(*outptr, repptr, repsize);
820 *outptr += repsize;
821 *outpos += repsize;
822 /* we made it! */
823 res = 0;
824
825 onError:
826 Py_XDECREF(restuple);
827 return res;
828}
829
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000830/* --- UTF-7 Codec -------------------------------------------------------- */
831
832/* see RFC2152 for details */
833
/* Per-character classification for the UTF-7 codec, indexed by the
   7-bit character value.  It tells whether a character is special,
   i.e. cannot be directly encoded:
     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
852
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: it is outside 1..127, is classified
   as special (1) in utf7_special, or belongs to the optional whitespace
   (2) / Set O (3) classes and the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 alphabet characters.  Explicit range
   checks are used instead of isalnum(): isalnum() is locale dependent and
   its behaviour is undefined for arguments that are not representable as
   unsigned char (this macro is applied to Py_UNICODE values). */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64: map a base64 character to its 6-bit value.  Only
   meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for every complete group of 6 bits held
   in ch; bits is the number of valid low bits in ch and is decremented. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit a Py_UNICODE to out for every complete group of 16 bits held in
   ch.  Uses errmsg and the utf7Error label of the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000895
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000896PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000897 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 const char *errors)
899{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000900 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000901 Py_ssize_t startinpos;
902 Py_ssize_t endinpos;
903 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000904 const char *e;
905 PyUnicodeObject *unicode;
906 Py_UNICODE *p;
907 const char *errmsg = "";
908 int inShift = 0;
909 unsigned int bitsleft = 0;
910 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000911 int surrogate = 0;
912 PyObject *errorHandler = NULL;
913 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000914
915 unicode = _PyUnicode_New(size);
916 if (!unicode)
917 return NULL;
918 if (size == 0)
919 return (PyObject *)unicode;
920
921 p = unicode->str;
922 e = s + size;
923
924 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000925 Py_UNICODE ch;
926 restart:
927 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000928
929 if (inShift) {
930 if ((ch == '-') || !B64CHAR(ch)) {
931 inShift = 0;
932 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000933
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
935 if (bitsleft >= 6) {
936 /* The shift sequence has a partial character in it. If
937 bitsleft < 6 then we could just classify it as padding
938 but that is not the case here */
939
940 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000941 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942 }
943 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000944 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 here so indicate the potential of a misencoded character. */
946
947 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
948 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
949 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000950 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951 }
952
953 if (ch == '-') {
954 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000955 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000956 inShift = 1;
957 }
958 } else if (SPECIAL(ch,0,0)) {
959 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000960 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000961 } else {
962 *p++ = ch;
963 }
964 } else {
965 charsleft = (charsleft << 6) | UB64(ch);
966 bitsleft += 6;
967 s++;
968 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
969 }
970 }
971 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000973 s++;
974 if (s < e && *s == '-') {
975 s++;
976 *p++ = '+';
977 } else
978 {
979 inShift = 1;
980 bitsleft = 0;
981 }
982 }
983 else if (SPECIAL(ch,0,0)) {
984 errmsg = "unexpected special character";
985 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000986 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000987 }
988 else {
989 *p++ = ch;
990 s++;
991 }
992 continue;
993 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000994 outpos = p-PyUnicode_AS_UNICODE(unicode);
995 endinpos = s-starts;
996 if (unicode_decode_call_errorhandler(
997 errors, &errorHandler,
998 "utf7", errmsg,
999 starts, size, &startinpos, &endinpos, &exc, &s,
1000 (PyObject **)&unicode, &outpos, &p))
1001 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 }
1003
1004 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001005 outpos = p-PyUnicode_AS_UNICODE(unicode);
1006 endinpos = size;
1007 if (unicode_decode_call_errorhandler(
1008 errors, &errorHandler,
1009 "utf7", "unterminated shift sequence",
1010 starts, size, &startinpos, &endinpos, &exc, &s,
1011 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001013 if (s < e)
1014 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 }
1016
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001017 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001018 goto onError;
1019
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001020 Py_XDECREF(errorHandler);
1021 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001022 return (PyObject *)unicode;
1023
1024onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001025 Py_XDECREF(errorHandler);
1026 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001027 Py_DECREF(unicode);
1028 return NULL;
1029}
1030
1031
1032PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001033 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001034 int encodeSetO,
1035 int encodeWhiteSpace,
1036 const char *errors)
1037{
1038 PyObject *v;
1039 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001040 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001042 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001043 unsigned int bitsleft = 0;
1044 unsigned long charsleft = 0;
1045 char * out;
1046 char * start;
1047
1048 if (size == 0)
1049 return PyString_FromStringAndSize(NULL, 0);
1050
1051 v = PyString_FromStringAndSize(NULL, cbAllocated);
1052 if (v == NULL)
1053 return NULL;
1054
1055 start = out = PyString_AS_STRING(v);
1056 for (;i < size; ++i) {
1057 Py_UNICODE ch = s[i];
1058
1059 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001060 if (ch == '+') {
1061 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062 *out++ = '-';
1063 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1064 charsleft = ch;
1065 bitsleft = 16;
1066 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001067 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001069 } else {
1070 *out++ = (char) ch;
1071 }
1072 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1074 *out++ = B64(charsleft << (6-bitsleft));
1075 charsleft = 0;
1076 bitsleft = 0;
1077 /* Characters not in the BASE64 set implicitly unshift the sequence
1078 so no '-' is required, except if the character is itself a '-' */
1079 if (B64CHAR(ch) || ch == '-') {
1080 *out++ = '-';
1081 }
1082 inShift = 0;
1083 *out++ = (char) ch;
1084 } else {
1085 bitsleft += 16;
1086 charsleft = (charsleft << 16) | ch;
1087 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1088
1089 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001090 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001091 or '-' then the shift sequence will be terminated implicitly and we
1092 don't have to insert a '-'. */
1093
1094 if (bitsleft == 0) {
1095 if (i + 1 < size) {
1096 Py_UNICODE ch2 = s[i+1];
1097
1098 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001099
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001100 } else if (B64CHAR(ch2) || ch2 == '-') {
1101 *out++ = '-';
1102 inShift = 0;
1103 } else {
1104 inShift = 0;
1105 }
1106
1107 }
1108 else {
1109 *out++ = '-';
1110 inShift = 0;
1111 }
1112 }
Tim Petersced69f82003-09-16 20:30:58 +00001113 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001116 if (bitsleft) {
1117 *out++= B64(charsleft << (6-bitsleft) );
1118 *out++ = '-';
1119 }
1120
Tim Peters5de98422002-04-27 18:44:32 +00001121 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 return v;
1123}
1124
1125#undef SPECIAL
1126#undef B64
1127#undef B64CHAR
1128#undef UB64
1129#undef ENCODE
1130#undef DECODE
1131
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132/* --- UTF-8 Codec -------------------------------------------------------- */
1133
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it introduces.  Zero means illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1155
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001157 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 const char *errors)
1159{
Walter Dörwald69652032004-09-07 20:24:22 +00001160 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1161}
1162
1163PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001164 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001165 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001166 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001167{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001168 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001170 Py_ssize_t startinpos;
1171 Py_ssize_t endinpos;
1172 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 const char *e;
1174 PyUnicodeObject *unicode;
1175 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001176 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001177 PyObject *errorHandler = NULL;
1178 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179
1180 /* Note: size will always be longer than the resulting Unicode
1181 character count */
1182 unicode = _PyUnicode_New(size);
1183 if (!unicode)
1184 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001185 if (size == 0) {
1186 if (consumed)
1187 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001189 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 /* Unpack UTF-8 encoded data */
1192 p = unicode->str;
1193 e = s + size;
1194
1195 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001196 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197
1198 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001199 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 s++;
1201 continue;
1202 }
1203
1204 n = utf8_code_length[ch];
1205
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001206 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001207 if (consumed)
1208 break;
1209 else {
1210 errmsg = "unexpected end of data";
1211 startinpos = s-starts;
1212 endinpos = size;
1213 goto utf8Error;
1214 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216
1217 switch (n) {
1218
1219 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001220 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001221 startinpos = s-starts;
1222 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001223 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224
1225 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001226 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 startinpos = s-starts;
1228 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001229 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230
1231 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001232 if ((s[1] & 0xc0) != 0x80) {
1233 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001234 startinpos = s-starts;
1235 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001236 goto utf8Error;
1237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001239 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001240 startinpos = s-starts;
1241 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001242 errmsg = "illegal encoding";
1243 goto utf8Error;
1244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001246 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 break;
1248
1249 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001250 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001251 (s[2] & 0xc0) != 0x80) {
1252 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001253 startinpos = s-starts;
1254 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001255 goto utf8Error;
1256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001258 if (ch < 0x0800) {
1259 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001260 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001261
1262 XXX For wide builds (UCS-4) we should probably try
1263 to recombine the surrogates into a single code
1264 unit.
1265 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
1270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001272 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001273 break;
1274
1275 case 4:
1276 if ((s[1] & 0xc0) != 0x80 ||
1277 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 (s[3] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001284 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1285 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1286 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001287 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001288 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001289 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001290 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001291 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 startinpos = s-starts;
1294 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001295 goto utf8Error;
1296 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001297#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001298 *p++ = (Py_UNICODE)ch;
1299#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001300 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001301
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 /* translate from 10000..10FFFF to 0..FFFF */
1303 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001304
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001305 /* high surrogate = top 10 bits added to D800 */
1306 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001307
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001308 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001309 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001310#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 break;
1312
1313 default:
1314 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001316 startinpos = s-starts;
1317 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001318 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319 }
1320 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001321 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001322
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001323 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001324 outpos = p-PyUnicode_AS_UNICODE(unicode);
1325 if (unicode_decode_call_errorhandler(
1326 errors, &errorHandler,
1327 "utf8", errmsg,
1328 starts, size, &startinpos, &endinpos, &exc, &s,
1329 (PyObject **)&unicode, &outpos, &p))
1330 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 }
Walter Dörwald69652032004-09-07 20:24:22 +00001332 if (consumed)
1333 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334
1335 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001336 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337 goto onError;
1338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 Py_XDECREF(errorHandler);
1340 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 return (PyObject *)unicode;
1342
1343onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 Py_XDECREF(errorHandler);
1345 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 Py_DECREF(unicode);
1347 return NULL;
1348}
1349
Tim Peters602f7402002-04-27 18:03:26 +00001350/* Allocation strategy: if the string is short, convert into a stack buffer
1351 and allocate exactly as much space needed at the end. Else allocate the
1352 maximum possible needed (4 result bytes per Unicode character), and return
1353 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001354*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001355PyObject *
1356PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001358 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359{
Tim Peters602f7402002-04-27 18:03:26 +00001360#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001361
Martin v. Löwis18e16552006-02-15 17:27:45 +00001362 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001363 PyObject *v; /* result string object */
1364 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001365 Py_ssize_t nallocated; /* number of result bytes allocated */
Tim Peters602f7402002-04-27 18:03:26 +00001366 int nneeded; /* number of result bytes needed */
1367 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001368
Tim Peters602f7402002-04-27 18:03:26 +00001369 assert(s != NULL);
1370 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371
Tim Peters602f7402002-04-27 18:03:26 +00001372 if (size <= MAX_SHORT_UNICHARS) {
1373 /* Write into the stack buffer; nallocated can't overflow.
1374 * At the end, we'll allocate exactly as much heap space as it
1375 * turns out we need.
1376 */
1377 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1378 v = NULL; /* will allocate after we're done */
1379 p = stackbuf;
1380 }
1381 else {
1382 /* Overallocate on the heap, and give the excess back at the end. */
1383 nallocated = size * 4;
1384 if (nallocated / 4 != size) /* overflow! */
1385 return PyErr_NoMemory();
1386 v = PyString_FromStringAndSize(NULL, nallocated);
1387 if (v == NULL)
1388 return NULL;
1389 p = PyString_AS_STRING(v);
1390 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001391
Tim Peters602f7402002-04-27 18:03:26 +00001392 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001393 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001394
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001395 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001396 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001398
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001400 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001401 *p++ = (char)(0xc0 | (ch >> 6));
1402 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001403 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001404 else {
Tim Peters602f7402002-04-27 18:03:26 +00001405 /* Encode UCS2 Unicode ordinals */
1406 if (ch < 0x10000) {
1407 /* Special case: check for high surrogate */
1408 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1409 Py_UCS4 ch2 = s[i];
1410 /* Check for low surrogate and combine the two to
1411 form a UCS4 value */
1412 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001413 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001414 i++;
1415 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001416 }
Tim Peters602f7402002-04-27 18:03:26 +00001417 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001418 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001419 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001420 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1421 *p++ = (char)(0x80 | (ch & 0x3f));
1422 continue;
1423 }
1424encodeUCS4:
1425 /* Encode UCS4 Unicode ordinals */
1426 *p++ = (char)(0xf0 | (ch >> 18));
1427 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1428 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1429 *p++ = (char)(0x80 | (ch & 0x3f));
1430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001432
Tim Peters602f7402002-04-27 18:03:26 +00001433 if (v == NULL) {
1434 /* This was stack allocated. */
1435 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1436 assert(nneeded <= nallocated);
1437 v = PyString_FromStringAndSize(stackbuf, nneeded);
1438 }
1439 else {
1440 /* Cut back to size actually needed. */
1441 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1442 assert(nneeded <= nallocated);
1443 _PyString_Resize(&v, nneeded);
1444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001446
Tim Peters602f7402002-04-27 18:03:26 +00001447#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448}
1449
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1451{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452 if (!PyUnicode_Check(unicode)) {
1453 PyErr_BadArgument();
1454 return NULL;
1455 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001456 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1457 PyUnicode_GET_SIZE(unicode),
1458 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459}
1460
1461/* --- UTF-16 Codec ------------------------------------------------------- */
1462
Tim Peters772747b2001-08-09 22:21:55 +00001463PyObject *
1464PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001465 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001466 const char *errors,
1467 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468{
Walter Dörwald69652032004-09-07 20:24:22 +00001469 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1470}
1471
1472PyObject *
1473PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001474 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001475 const char *errors,
1476 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001477 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001479 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001480 Py_ssize_t startinpos;
1481 Py_ssize_t endinpos;
1482 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483 PyUnicodeObject *unicode;
1484 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001485 const unsigned char *q, *e;
1486 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001487 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001488 /* Offsets from q for retrieving byte pairs in the right order. */
1489#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1490 int ihi = 1, ilo = 0;
1491#else
1492 int ihi = 0, ilo = 1;
1493#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 PyObject *errorHandler = NULL;
1495 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496
1497 /* Note: size will always be longer than the resulting Unicode
1498 character count */
1499 unicode = _PyUnicode_New(size);
1500 if (!unicode)
1501 return NULL;
1502 if (size == 0)
1503 return (PyObject *)unicode;
1504
1505 /* Unpack UTF-16 encoded data */
1506 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001507 q = (unsigned char *)s;
1508 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509
1510 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001511 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001513 /* Check for BOM marks (U+FEFF) in the input and adjust current
1514 byte order setting accordingly. In native mode, the leading BOM
1515 mark is skipped, in all other modes, it is copied to the output
1516 stream as-is (giving a ZWNBSP character). */
1517 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001518 if (size >= 2) {
1519 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001520#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001521 if (bom == 0xFEFF) {
1522 q += 2;
1523 bo = -1;
1524 }
1525 else if (bom == 0xFFFE) {
1526 q += 2;
1527 bo = 1;
1528 }
Tim Petersced69f82003-09-16 20:30:58 +00001529#else
Walter Dörwald69652032004-09-07 20:24:22 +00001530 if (bom == 0xFEFF) {
1531 q += 2;
1532 bo = 1;
1533 }
1534 else if (bom == 0xFFFE) {
1535 q += 2;
1536 bo = -1;
1537 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001538#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001539 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541
Tim Peters772747b2001-08-09 22:21:55 +00001542 if (bo == -1) {
1543 /* force LE */
1544 ihi = 1;
1545 ilo = 0;
1546 }
1547 else if (bo == 1) {
1548 /* force BE */
1549 ihi = 0;
1550 ilo = 1;
1551 }
1552
1553 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001554 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001555 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001557 if (consumed)
1558 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001559 errmsg = "truncated data";
1560 startinpos = ((const char *)q)-starts;
1561 endinpos = ((const char *)e)-starts;
1562 goto utf16Error;
1563 /* The remaining input chars are ignored if the callback
1564 chooses to skip the input */
1565 }
1566 ch = (q[ihi] << 8) | q[ilo];
1567
Tim Peters772747b2001-08-09 22:21:55 +00001568 q += 2;
1569
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 if (ch < 0xD800 || ch > 0xDFFF) {
1571 *p++ = ch;
1572 continue;
1573 }
1574
1575 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001576 if (q >= e) {
1577 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 startinpos = (((const char *)q)-2)-starts;
1579 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001580 goto utf16Error;
1581 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001582 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001583 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1584 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001585 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001586#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001587 *p++ = ch;
1588 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001589#else
1590 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001591#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001592 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001593 }
1594 else {
1595 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 startinpos = (((const char *)q)-4)-starts;
1597 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001598 goto utf16Error;
1599 }
1600
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001602 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001603 startinpos = (((const char *)q)-2)-starts;
1604 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001605 /* Fall through to report the error */
1606
1607 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001608 outpos = p-PyUnicode_AS_UNICODE(unicode);
1609 if (unicode_decode_call_errorhandler(
1610 errors, &errorHandler,
1611 "utf16", errmsg,
1612 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1613 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615 }
1616
1617 if (byteorder)
1618 *byteorder = bo;
1619
Walter Dörwald69652032004-09-07 20:24:22 +00001620 if (consumed)
1621 *consumed = (const char *)q-starts;
1622
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001624 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001625 goto onError;
1626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 Py_XDECREF(errorHandler);
1628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 return (PyObject *)unicode;
1630
1631onError:
1632 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 Py_XDECREF(errorHandler);
1634 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001635 return NULL;
1636}
1637
Tim Peters772747b2001-08-09 22:21:55 +00001638PyObject *
1639PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001640 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001641 const char *errors,
1642 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643{
1644 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001645 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001646#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001647 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001648#else
1649 const int pairs = 0;
1650#endif
Tim Peters772747b2001-08-09 22:21:55 +00001651 /* Offsets from p for storing byte pairs in the right order. */
1652#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1653 int ihi = 1, ilo = 0;
1654#else
1655 int ihi = 0, ilo = 1;
1656#endif
1657
1658#define STORECHAR(CH) \
1659 do { \
1660 p[ihi] = ((CH) >> 8) & 0xff; \
1661 p[ilo] = (CH) & 0xff; \
1662 p += 2; \
1663 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001665#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001666 for (i = pairs = 0; i < size; i++)
1667 if (s[i] >= 0x10000)
1668 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001669#endif
Tim Petersced69f82003-09-16 20:30:58 +00001670 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001671 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672 if (v == NULL)
1673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674
Tim Peters772747b2001-08-09 22:21:55 +00001675 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001677 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001678 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001679 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001680
1681 if (byteorder == -1) {
1682 /* force LE */
1683 ihi = 1;
1684 ilo = 0;
1685 }
1686 else if (byteorder == 1) {
1687 /* force BE */
1688 ihi = 0;
1689 ilo = 1;
1690 }
1691
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001692 while (size-- > 0) {
1693 Py_UNICODE ch = *s++;
1694 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001695#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001696 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001697 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1698 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001700#endif
Tim Peters772747b2001-08-09 22:21:55 +00001701 STORECHAR(ch);
1702 if (ch2)
1703 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001706#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707}
1708
1709PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1710{
1711 if (!PyUnicode_Check(unicode)) {
1712 PyErr_BadArgument();
1713 return NULL;
1714 }
1715 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1716 PyUnicode_GET_SIZE(unicode),
1717 NULL,
1718 0);
1719}
1720
1721/* --- Unicode Escape Codec ----------------------------------------------- */
1722
Fredrik Lundh06d12682001-01-24 07:59:11 +00001723static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001724
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001726 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 const char *errors)
1728{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001730 Py_ssize_t startinpos;
1731 Py_ssize_t endinpos;
1732 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001737 char* message;
1738 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 PyObject *errorHandler = NULL;
1740 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001741
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 /* Escaped strings will always be longer than the resulting
1743 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001744 length after conversion to the true value.
1745 (but if the error callback returns a long replacement string
1746 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 v = _PyUnicode_New(size);
1748 if (v == NULL)
1749 goto onError;
1750 if (size == 0)
1751 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001752
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 while (s < end) {
1757 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001758 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760
1761 /* Non-escape characters are interpreted as Unicode ordinals */
1762 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001763 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 continue;
1765 }
1766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001767 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 /* \ - Escapes */
1769 s++;
1770 switch (*s++) {
1771
1772 /* \x escapes */
1773 case '\n': break;
1774 case '\\': *p++ = '\\'; break;
1775 case '\'': *p++ = '\''; break;
1776 case '\"': *p++ = '\"'; break;
1777 case 'b': *p++ = '\b'; break;
1778 case 'f': *p++ = '\014'; break; /* FF */
1779 case 't': *p++ = '\t'; break;
1780 case 'n': *p++ = '\n'; break;
1781 case 'r': *p++ = '\r'; break;
1782 case 'v': *p++ = '\013'; break; /* VT */
1783 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1784
1785 /* \OOO (octal) escapes */
1786 case '0': case '1': case '2': case '3':
1787 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001788 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001790 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001792 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001794 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 break;
1796
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 /* hex escapes */
1798 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800 digits = 2;
1801 message = "truncated \\xXX escape";
1802 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
Fredrik Lundhccc74732001-02-18 22:13:49 +00001804 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806 digits = 4;
1807 message = "truncated \\uXXXX escape";
1808 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809
Fredrik Lundhccc74732001-02-18 22:13:49 +00001810 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001811 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001812 digits = 8;
1813 message = "truncated \\UXXXXXXXX escape";
1814 hexescape:
1815 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 outpos = p-PyUnicode_AS_UNICODE(v);
1817 if (s+digits>end) {
1818 endinpos = size;
1819 if (unicode_decode_call_errorhandler(
1820 errors, &errorHandler,
1821 "unicodeescape", "end of string in escape sequence",
1822 starts, size, &startinpos, &endinpos, &exc, &s,
1823 (PyObject **)&v, &outpos, &p))
1824 goto onError;
1825 goto nextByte;
1826 }
1827 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001828 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001829 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 endinpos = (s+i+1)-starts;
1831 if (unicode_decode_call_errorhandler(
1832 errors, &errorHandler,
1833 "unicodeescape", message,
1834 starts, size, &startinpos, &endinpos, &exc, &s,
1835 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001836 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001838 }
1839 chr = (chr<<4) & ~0xF;
1840 if (c >= '0' && c <= '9')
1841 chr += c - '0';
1842 else if (c >= 'a' && c <= 'f')
1843 chr += 10 + c - 'a';
1844 else
1845 chr += 10 + c - 'A';
1846 }
1847 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001848 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 /* _decoding_error will have already written into the
1850 target buffer. */
1851 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001853 /* when we get here, chr is a 32-bit unicode character */
1854 if (chr <= 0xffff)
1855 /* UCS-2 character */
1856 *p++ = (Py_UNICODE) chr;
1857 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001858 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001859 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001860#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001861 *p++ = chr;
1862#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001863 chr -= 0x10000L;
1864 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001865 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001866#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001867 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 endinpos = s-starts;
1869 outpos = p-PyUnicode_AS_UNICODE(v);
1870 if (unicode_decode_call_errorhandler(
1871 errors, &errorHandler,
1872 "unicodeescape", "illegal Unicode character",
1873 starts, size, &startinpos, &endinpos, &exc, &s,
1874 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 goto onError;
1876 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001877 break;
1878
1879 /* \N{name} */
1880 case 'N':
1881 message = "malformed \\N character escape";
1882 if (ucnhash_CAPI == NULL) {
1883 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001884 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001885 m = PyImport_ImportModule("unicodedata");
1886 if (m == NULL)
1887 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001888 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001889 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001890 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001891 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001892 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001893 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001894 if (ucnhash_CAPI == NULL)
1895 goto ucnhashError;
1896 }
1897 if (*s == '{') {
1898 const char *start = s+1;
1899 /* look for the closing brace */
1900 while (*s != '}' && s < end)
1901 s++;
1902 if (s > start && s < end && *s == '}') {
1903 /* found a name. look it up in the unicode database */
1904 message = "unknown Unicode character name";
1905 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001906 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 goto store;
1908 }
1909 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001910 endinpos = s-starts;
1911 outpos = p-PyUnicode_AS_UNICODE(v);
1912 if (unicode_decode_call_errorhandler(
1913 errors, &errorHandler,
1914 "unicodeescape", message,
1915 starts, size, &startinpos, &endinpos, &exc, &s,
1916 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001917 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001918 break;
1919
1920 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001921 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 message = "\\ at end of string";
1923 s--;
1924 endinpos = s-starts;
1925 outpos = p-PyUnicode_AS_UNICODE(v);
1926 if (unicode_decode_call_errorhandler(
1927 errors, &errorHandler,
1928 "unicodeescape", message,
1929 starts, size, &startinpos, &endinpos, &exc, &s,
1930 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001931 goto onError;
1932 }
1933 else {
1934 *p++ = '\\';
1935 *p++ = (unsigned char)s[-1];
1936 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001939 nextByte:
1940 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001942 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001943 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001944 Py_XDECREF(errorHandler);
1945 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001947
Fredrik Lundhccc74732001-02-18 22:13:49 +00001948ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001949 PyErr_SetString(
1950 PyExc_UnicodeError,
1951 "\\N escapes not supported (can't load unicodedata module)"
1952 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001953 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 Py_XDECREF(errorHandler);
1955 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001956 return NULL;
1957
Fredrik Lundhccc74732001-02-18 22:13:49 +00001958onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001960 Py_XDECREF(errorHandler);
1961 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 return NULL;
1963}
1964
1965/* Return a Unicode-Escape string version of the Unicode object.
1966
1967 If quotes is true, the string is enclosed in u"" or u'' quotes as
1968 appropriate.
1969
1970*/
1971
Barry Warsaw51ac5802000-03-20 16:36:48 +00001972static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00001974 Py_UNICODE ch);
1975
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976static
1977PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001978 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 int quotes)
1980{
1981 PyObject *repr;
1982 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001984 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985
1986 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1987 if (repr == NULL)
1988 return NULL;
1989
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001990 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991
1992 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001994 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 !findchar(s, size, '"')) ? '"' : '\'';
1996 }
1997 while (size-- > 0) {
1998 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001999
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002000 /* Escape quotes and backslashes */
2001 if ((quotes &&
2002 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 *p++ = '\\';
2004 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002005 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002006 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002007
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002008#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002009 /* Map 21-bit characters to '\U00xxxxxx' */
2010 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002011 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002012
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002013 /* Resize the string if necessary */
2014 if (offset + 12 > PyString_GET_SIZE(repr)) {
2015 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002016 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002017 p = PyString_AS_STRING(repr) + offset;
2018 }
2019
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002020 *p++ = '\\';
2021 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002022 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2023 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2024 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2025 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2026 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2027 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2028 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002029 *p++ = hexdigit[ch & 0x0000000F];
2030 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002031 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002032#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002033 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2034 else if (ch >= 0xD800 && ch < 0xDC00) {
2035 Py_UNICODE ch2;
2036 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002037
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002038 ch2 = *s++;
2039 size--;
2040 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2041 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2042 *p++ = '\\';
2043 *p++ = 'U';
2044 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2045 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2046 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2047 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2048 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2049 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2050 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2051 *p++ = hexdigit[ucs & 0x0000000F];
2052 continue;
2053 }
2054 /* Fall through: isolated surrogates are copied as-is */
2055 s--;
2056 size++;
2057 }
2058
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002060 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 *p++ = '\\';
2062 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002063 *p++ = hexdigit[(ch >> 12) & 0x000F];
2064 *p++ = hexdigit[(ch >> 8) & 0x000F];
2065 *p++ = hexdigit[(ch >> 4) & 0x000F];
2066 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002068
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002069 /* Map special whitespace to '\t', \n', '\r' */
2070 else if (ch == '\t') {
2071 *p++ = '\\';
2072 *p++ = 't';
2073 }
2074 else if (ch == '\n') {
2075 *p++ = '\\';
2076 *p++ = 'n';
2077 }
2078 else if (ch == '\r') {
2079 *p++ = '\\';
2080 *p++ = 'r';
2081 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002082
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002083 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002084 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002086 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002087 *p++ = hexdigit[(ch >> 4) & 0x000F];
2088 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002089 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002090
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 /* Copy everything else as-is */
2092 else
2093 *p++ = (char) ch;
2094 }
2095 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002096 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097
2098 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002099 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 return repr;
2101}
2102
2103PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002104 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105{
2106 return unicodeescape_string(s, size, 0);
2107}
2108
2109PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2110{
2111 if (!PyUnicode_Check(unicode)) {
2112 PyErr_BadArgument();
2113 return NULL;
2114 }
2115 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2116 PyUnicode_GET_SIZE(unicode));
2117}
2118
2119/* --- Raw Unicode Escape Codec ------------------------------------------- */
2120
2121PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002122 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 const char *errors)
2124{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002126 Py_ssize_t startinpos;
2127 Py_ssize_t endinpos;
2128 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 const char *end;
2132 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 PyObject *errorHandler = NULL;
2134 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002135
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 /* Escaped strings will always be longer than the resulting
2137 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 length after conversion to the true value. (But decoding error
2139 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 v = _PyUnicode_New(size);
2141 if (v == NULL)
2142 goto onError;
2143 if (size == 0)
2144 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002145 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146 end = s + size;
2147 while (s < end) {
2148 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002149 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002151 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152
2153 /* Non-escape characters are interpreted as Unicode ordinals */
2154 if (*s != '\\') {
2155 *p++ = (unsigned char)*s++;
2156 continue;
2157 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002158 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159
2160 /* \u-escapes are only interpreted iff the number of leading
2161 backslashes if odd */
2162 bs = s;
2163 for (;s < end;) {
2164 if (*s != '\\')
2165 break;
2166 *p++ = (unsigned char)*s++;
2167 }
2168 if (((s - bs) & 1) == 0 ||
2169 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002170 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 continue;
2172 }
2173 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002174 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 s++;
2176
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002177 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002178 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002179 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002180 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002182 endinpos = s-starts;
2183 if (unicode_decode_call_errorhandler(
2184 errors, &errorHandler,
2185 "rawunicodeescape", "truncated \\uXXXX",
2186 starts, size, &startinpos, &endinpos, &exc, &s,
2187 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002189 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 }
2191 x = (x<<4) & ~0xF;
2192 if (c >= '0' && c <= '9')
2193 x += c - '0';
2194 else if (c >= 'a' && c <= 'f')
2195 x += 10 + c - 'a';
2196 else
2197 x += 10 + c - 'A';
2198 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002199#ifndef Py_UNICODE_WIDE
2200 if (x > 0x10000) {
2201 if (unicode_decode_call_errorhandler(
2202 errors, &errorHandler,
2203 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2204 starts, size, &startinpos, &endinpos, &exc, &s,
2205 (PyObject **)&v, &outpos, &p))
2206 goto onError;
2207 }
2208#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 *p++ = x;
2210 nextByte:
2211 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002213 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002214 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 Py_XDECREF(errorHandler);
2216 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002218
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 onError:
2220 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002221 Py_XDECREF(errorHandler);
2222 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 return NULL;
2224}
2225
2226PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002227 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228{
2229 PyObject *repr;
2230 char *p;
2231 char *q;
2232
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002233 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002235#ifdef Py_UNICODE_WIDE
2236 repr = PyString_FromStringAndSize(NULL, 10 * size);
2237#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002239#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002240 if (repr == NULL)
2241 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002242 if (size == 0)
2243 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244
2245 p = q = PyString_AS_STRING(repr);
2246 while (size-- > 0) {
2247 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002248#ifdef Py_UNICODE_WIDE
2249 /* Map 32-bit characters to '\Uxxxxxxxx' */
2250 if (ch >= 0x10000) {
2251 *p++ = '\\';
2252 *p++ = 'U';
2253 *p++ = hexdigit[(ch >> 28) & 0xf];
2254 *p++ = hexdigit[(ch >> 24) & 0xf];
2255 *p++ = hexdigit[(ch >> 20) & 0xf];
2256 *p++ = hexdigit[(ch >> 16) & 0xf];
2257 *p++ = hexdigit[(ch >> 12) & 0xf];
2258 *p++ = hexdigit[(ch >> 8) & 0xf];
2259 *p++ = hexdigit[(ch >> 4) & 0xf];
2260 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002261 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002262 else
2263#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264 /* Map 16-bit characters to '\uxxxx' */
2265 if (ch >= 256) {
2266 *p++ = '\\';
2267 *p++ = 'u';
2268 *p++ = hexdigit[(ch >> 12) & 0xf];
2269 *p++ = hexdigit[(ch >> 8) & 0xf];
2270 *p++ = hexdigit[(ch >> 4) & 0xf];
2271 *p++ = hexdigit[ch & 15];
2272 }
2273 /* Copy everything else as-is */
2274 else
2275 *p++ = (char) ch;
2276 }
2277 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002278 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 return repr;
2280}
2281
2282PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2283{
2284 if (!PyUnicode_Check(unicode)) {
2285 PyErr_BadArgument();
2286 return NULL;
2287 }
2288 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2289 PyUnicode_GET_SIZE(unicode));
2290}
2291
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002292/* --- Unicode Internal Codec ------------------------------------------- */
2293
2294PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002295 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002296 const char *errors)
2297{
2298 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002299 Py_ssize_t startinpos;
2300 Py_ssize_t endinpos;
2301 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002302 PyUnicodeObject *v;
2303 Py_UNICODE *p;
2304 const char *end;
2305 const char *reason;
2306 PyObject *errorHandler = NULL;
2307 PyObject *exc = NULL;
2308
Neal Norwitzd43069c2006-01-08 01:12:10 +00002309#ifdef Py_UNICODE_WIDE
2310 Py_UNICODE unimax = PyUnicode_GetMax();
2311#endif
2312
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002313 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2314 if (v == NULL)
2315 goto onError;
2316 if (PyUnicode_GetSize((PyObject *)v) == 0)
2317 return (PyObject *)v;
2318 p = PyUnicode_AS_UNICODE(v);
2319 end = s + size;
2320
2321 while (s < end) {
2322 *p = *(Py_UNICODE *)s;
2323 /* We have to sanity check the raw data, otherwise doom looms for
2324 some malformed UCS-4 data. */
2325 if (
2326 #ifdef Py_UNICODE_WIDE
2327 *p > unimax || *p < 0 ||
2328 #endif
2329 end-s < Py_UNICODE_SIZE
2330 )
2331 {
2332 startinpos = s - starts;
2333 if (end-s < Py_UNICODE_SIZE) {
2334 endinpos = end-starts;
2335 reason = "truncated input";
2336 }
2337 else {
2338 endinpos = s - starts + Py_UNICODE_SIZE;
2339 reason = "illegal code point (> 0x10FFFF)";
2340 }
2341 outpos = p - PyUnicode_AS_UNICODE(v);
2342 if (unicode_decode_call_errorhandler(
2343 errors, &errorHandler,
2344 "unicode_internal", reason,
2345 starts, size, &startinpos, &endinpos, &exc, &s,
2346 (PyObject **)&v, &outpos, &p)) {
2347 goto onError;
2348 }
2349 }
2350 else {
2351 p++;
2352 s += Py_UNICODE_SIZE;
2353 }
2354 }
2355
2356 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2357 goto onError;
2358 Py_XDECREF(errorHandler);
2359 Py_XDECREF(exc);
2360 return (PyObject *)v;
2361
2362 onError:
2363 Py_XDECREF(v);
2364 Py_XDECREF(errorHandler);
2365 Py_XDECREF(exc);
2366 return NULL;
2367}
2368
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369/* --- Latin-1 Codec ------------------------------------------------------ */
2370
2371PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002372 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373 const char *errors)
2374{
2375 PyUnicodeObject *v;
2376 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002377
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002379 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002380 Py_UNICODE r = *(unsigned char*)s;
2381 return PyUnicode_FromUnicode(&r, 1);
2382 }
2383
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 v = _PyUnicode_New(size);
2385 if (v == NULL)
2386 goto onError;
2387 if (size == 0)
2388 return (PyObject *)v;
2389 p = PyUnicode_AS_UNICODE(v);
2390 while (size-- > 0)
2391 *p++ = (unsigned char)*s++;
2392 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002393
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 onError:
2395 Py_XDECREF(v);
2396 return NULL;
2397}
2398
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002399/* create or adjust a UnicodeEncodeError */
2400static void make_encode_exception(PyObject **exceptionObject,
2401 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002402 const Py_UNICODE *unicode, Py_ssize_t size,
2403 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002404 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002406 if (*exceptionObject == NULL) {
2407 *exceptionObject = PyUnicodeEncodeError_Create(
2408 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 }
2410 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002411 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2412 goto onError;
2413 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2414 goto onError;
2415 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2416 goto onError;
2417 return;
2418 onError:
2419 Py_DECREF(*exceptionObject);
2420 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 }
2422}
2423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002424/* raises a UnicodeEncodeError */
2425static void raise_encode_exception(PyObject **exceptionObject,
2426 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002427 const Py_UNICODE *unicode, Py_ssize_t size,
2428 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002429 const char *reason)
2430{
2431 make_encode_exception(exceptionObject,
2432 encoding, unicode, size, startpos, endpos, reason);
2433 if (*exceptionObject != NULL)
2434 PyCodec_StrictErrors(*exceptionObject);
2435}
2436
2437/* error handling callback helper:
2438 build arguments, call the callback and check the arguments,
2439 put the result into newpos and return the replacement string, which
2440 has to be freed by the caller */
2441static PyObject *unicode_encode_call_errorhandler(const char *errors,
2442 PyObject **errorHandler,
2443 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002444 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2445 Py_ssize_t startpos, Py_ssize_t endpos,
2446 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002447{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002448 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002449
2450 PyObject *restuple;
2451 PyObject *resunicode;
2452
2453 if (*errorHandler == NULL) {
2454 *errorHandler = PyCodec_LookupError(errors);
2455 if (*errorHandler == NULL)
2456 return NULL;
2457 }
2458
2459 make_encode_exception(exceptionObject,
2460 encoding, unicode, size, startpos, endpos, reason);
2461 if (*exceptionObject == NULL)
2462 return NULL;
2463
2464 restuple = PyObject_CallFunctionObjArgs(
2465 *errorHandler, *exceptionObject, NULL);
2466 if (restuple == NULL)
2467 return NULL;
2468 if (!PyTuple_Check(restuple)) {
2469 PyErr_Format(PyExc_TypeError, &argparse[4]);
2470 Py_DECREF(restuple);
2471 return NULL;
2472 }
2473 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2474 &resunicode, newpos)) {
2475 Py_DECREF(restuple);
2476 return NULL;
2477 }
2478 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002479 *newpos = size+*newpos;
2480 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002481 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002482 Py_DECREF(restuple);
2483 return NULL;
2484 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 Py_INCREF(resunicode);
2486 Py_DECREF(restuple);
2487 return resunicode;
2488}
2489
2490static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002491 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 const char *errors,
2493 int limit)
2494{
2495 /* output object */
2496 PyObject *res;
2497 /* pointers to the beginning and end+1 of input */
2498 const Py_UNICODE *startp = p;
2499 const Py_UNICODE *endp = p + size;
2500 /* pointer to the beginning of the unencodable characters */
2501 /* const Py_UNICODE *badp = NULL; */
2502 /* pointer into the output */
2503 char *str;
2504 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002505 Py_ssize_t respos = 0;
2506 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002507 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2508 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002509 PyObject *errorHandler = NULL;
2510 PyObject *exc = NULL;
2511 /* the following variable is used for caching string comparisons
2512 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2513 int known_errorHandler = -1;
2514
2515 /* allocate enough for a simple encoding without
2516 replacements, if we need more, we'll resize */
2517 res = PyString_FromStringAndSize(NULL, size);
2518 if (res == NULL)
2519 goto onError;
2520 if (size == 0)
2521 return res;
2522 str = PyString_AS_STRING(res);
2523 ressize = size;
2524
2525 while (p<endp) {
2526 Py_UNICODE c = *p;
2527
2528 /* can we encode this? */
2529 if (c<limit) {
2530 /* no overflow check, because we know that the space is enough */
2531 *str++ = (char)c;
2532 ++p;
2533 }
2534 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002535 Py_ssize_t unicodepos = p-startp;
2536 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002538 Py_ssize_t repsize;
2539 Py_ssize_t newpos;
2540 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541 Py_UNICODE *uni2;
2542 /* startpos for collecting unencodable chars */
2543 const Py_UNICODE *collstart = p;
2544 const Py_UNICODE *collend = p;
2545 /* find all unecodable characters */
2546 while ((collend < endp) && ((*collend)>=limit))
2547 ++collend;
2548 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2549 if (known_errorHandler==-1) {
2550 if ((errors==NULL) || (!strcmp(errors, "strict")))
2551 known_errorHandler = 1;
2552 else if (!strcmp(errors, "replace"))
2553 known_errorHandler = 2;
2554 else if (!strcmp(errors, "ignore"))
2555 known_errorHandler = 3;
2556 else if (!strcmp(errors, "xmlcharrefreplace"))
2557 known_errorHandler = 4;
2558 else
2559 known_errorHandler = 0;
2560 }
2561 switch (known_errorHandler) {
2562 case 1: /* strict */
2563 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2564 goto onError;
2565 case 2: /* replace */
2566 while (collstart++<collend)
2567 *str++ = '?'; /* fall through */
2568 case 3: /* ignore */
2569 p = collend;
2570 break;
2571 case 4: /* xmlcharrefreplace */
2572 respos = str-PyString_AS_STRING(res);
2573 /* determine replacement size (temporarily (mis)uses p) */
2574 for (p = collstart, repsize = 0; p < collend; ++p) {
2575 if (*p<10)
2576 repsize += 2+1+1;
2577 else if (*p<100)
2578 repsize += 2+2+1;
2579 else if (*p<1000)
2580 repsize += 2+3+1;
2581 else if (*p<10000)
2582 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002583#ifndef Py_UNICODE_WIDE
2584 else
2585 repsize += 2+5+1;
2586#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587 else if (*p<100000)
2588 repsize += 2+5+1;
2589 else if (*p<1000000)
2590 repsize += 2+6+1;
2591 else
2592 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002593#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 }
2595 requiredsize = respos+repsize+(endp-collend);
2596 if (requiredsize > ressize) {
2597 if (requiredsize<2*ressize)
2598 requiredsize = 2*ressize;
2599 if (_PyString_Resize(&res, requiredsize))
2600 goto onError;
2601 str = PyString_AS_STRING(res) + respos;
2602 ressize = requiredsize;
2603 }
2604 /* generate replacement (temporarily (mis)uses p) */
2605 for (p = collstart; p < collend; ++p) {
2606 str += sprintf(str, "&#%d;", (int)*p);
2607 }
2608 p = collend;
2609 break;
2610 default:
2611 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2612 encoding, reason, startp, size, &exc,
2613 collstart-startp, collend-startp, &newpos);
2614 if (repunicode == NULL)
2615 goto onError;
2616 /* need more space? (at least enough for what we
2617 have+the replacement+the rest of the string, so
2618 we won't have to check space for encodable characters) */
2619 respos = str-PyString_AS_STRING(res);
2620 repsize = PyUnicode_GET_SIZE(repunicode);
2621 requiredsize = respos+repsize+(endp-collend);
2622 if (requiredsize > ressize) {
2623 if (requiredsize<2*ressize)
2624 requiredsize = 2*ressize;
2625 if (_PyString_Resize(&res, requiredsize)) {
2626 Py_DECREF(repunicode);
2627 goto onError;
2628 }
2629 str = PyString_AS_STRING(res) + respos;
2630 ressize = requiredsize;
2631 }
2632 /* check if there is anything unencodable in the replacement
2633 and copy it to the output */
2634 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2635 c = *uni2;
2636 if (c >= limit) {
2637 raise_encode_exception(&exc, encoding, startp, size,
2638 unicodepos, unicodepos+1, reason);
2639 Py_DECREF(repunicode);
2640 goto onError;
2641 }
2642 *str = (char)c;
2643 }
2644 p = startp + newpos;
2645 Py_DECREF(repunicode);
2646 }
2647 }
2648 }
2649 /* Resize if we allocated to much */
2650 respos = str-PyString_AS_STRING(res);
2651 if (respos<ressize)
2652 /* If this falls res will be NULL */
2653 _PyString_Resize(&res, respos);
2654 Py_XDECREF(errorHandler);
2655 Py_XDECREF(exc);
2656 return res;
2657
2658 onError:
2659 Py_XDECREF(res);
2660 Py_XDECREF(errorHandler);
2661 Py_XDECREF(exc);
2662 return NULL;
2663}
2664
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002666 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667 const char *errors)
2668{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002669 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670}
2671
2672PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2673{
2674 if (!PyUnicode_Check(unicode)) {
2675 PyErr_BadArgument();
2676 return NULL;
2677 }
2678 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2679 PyUnicode_GET_SIZE(unicode),
2680 NULL);
2681}
2682
2683/* --- 7-bit ASCII Codec -------------------------------------------------- */
2684
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002686 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 const char *errors)
2688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 PyUnicodeObject *v;
2691 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002692 Py_ssize_t startinpos;
2693 Py_ssize_t endinpos;
2694 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002695 const char *e;
2696 PyObject *errorHandler = NULL;
2697 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002700 if (size == 1 && *(unsigned char*)s < 128) {
2701 Py_UNICODE r = *(unsigned char*)s;
2702 return PyUnicode_FromUnicode(&r, 1);
2703 }
Tim Petersced69f82003-09-16 20:30:58 +00002704
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 v = _PyUnicode_New(size);
2706 if (v == NULL)
2707 goto onError;
2708 if (size == 0)
2709 return (PyObject *)v;
2710 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 e = s + size;
2712 while (s < e) {
2713 register unsigned char c = (unsigned char)*s;
2714 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 ++s;
2717 }
2718 else {
2719 startinpos = s-starts;
2720 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002721 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 if (unicode_decode_call_errorhandler(
2723 errors, &errorHandler,
2724 "ascii", "ordinal not in range(128)",
2725 starts, size, &startinpos, &endinpos, &exc, &s,
2726 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002730 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002731 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002732 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 Py_XDECREF(errorHandler);
2734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002736
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 onError:
2738 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 Py_XDECREF(errorHandler);
2740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 return NULL;
2742}
2743
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002745 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 const char *errors)
2747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749}
2750
2751PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2752{
2753 if (!PyUnicode_Check(unicode)) {
2754 PyErr_BadArgument();
2755 return NULL;
2756 }
2757 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2758 PyUnicode_GET_SIZE(unicode),
2759 NULL);
2760}
2761
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002762#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002763
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002764/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002765
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002766PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002767 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002768 const char *errors)
2769{
2770 PyUnicodeObject *v;
2771 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002772 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002773
2774 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002775 assert(size < INT_MAX);
2776 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002777 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002778 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2779
2780 v = _PyUnicode_New(usize);
2781 if (v == NULL)
2782 return NULL;
2783 if (usize == 0)
2784 return (PyObject *)v;
2785 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002786 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002787 Py_DECREF(v);
2788 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2789 }
2790
2791 return (PyObject *)v;
2792}
2793
2794PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002795 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002796 const char *errors)
2797{
2798 PyObject *repr;
2799 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002800 DWORD mbcssize;
2801
2802 /* If there are no characters, bail now! */
2803 if (size==0)
2804 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002805
2806 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002807 assert(size<INT_MAX);
2808 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002809 if (mbcssize==0)
2810 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2811
2812 repr = PyString_FromStringAndSize(NULL, mbcssize);
2813 if (repr == NULL)
2814 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002815 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002816 return repr;
2817
2818 /* Do the conversion */
2819 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002820 assert(size < INT_MAX);
2821 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002822 Py_DECREF(repr);
2823 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2824 }
2825 return repr;
2826}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002827
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002828PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2829{
2830 if (!PyUnicode_Check(unicode)) {
2831 PyErr_BadArgument();
2832 return NULL;
2833 }
2834 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2835 PyUnicode_GET_SIZE(unicode),
2836 NULL);
2837}
2838
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002839#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002840
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841/* --- Character Mapping Codec -------------------------------------------- */
2842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002844 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 PyObject *mapping,
2846 const char *errors)
2847{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t startinpos;
2850 Py_ssize_t endinpos;
2851 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 PyUnicodeObject *v;
2854 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002855 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 PyObject *errorHandler = NULL;
2857 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002858 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002859 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002860
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 /* Default to Latin-1 */
2862 if (mapping == NULL)
2863 return PyUnicode_DecodeLatin1(s, size, errors);
2864
2865 v = _PyUnicode_New(size);
2866 if (v == NULL)
2867 goto onError;
2868 if (size == 0)
2869 return (PyObject *)v;
2870 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002871 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002872 if (PyUnicode_CheckExact(mapping)) {
2873 mapstring = PyUnicode_AS_UNICODE(mapping);
2874 maplen = PyUnicode_GET_SIZE(mapping);
2875 while (s < e) {
2876 unsigned char ch = *s;
2877 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002879 if (ch < maplen)
2880 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002882 if (x == 0xfffe) {
2883 /* undefined mapping */
2884 outpos = p-PyUnicode_AS_UNICODE(v);
2885 startinpos = s-starts;
2886 endinpos = startinpos+1;
2887 if (unicode_decode_call_errorhandler(
2888 errors, &errorHandler,
2889 "charmap", "character maps to <undefined>",
2890 starts, size, &startinpos, &endinpos, &exc, &s,
2891 (PyObject **)&v, &outpos, &p)) {
2892 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002893 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002894 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002895 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002896 *p++ = x;
2897 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002899 }
2900 else {
2901 while (s < e) {
2902 unsigned char ch = *s;
2903 PyObject *w, *x;
2904
2905 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2906 w = PyInt_FromLong((long)ch);
2907 if (w == NULL)
2908 goto onError;
2909 x = PyObject_GetItem(mapping, w);
2910 Py_DECREF(w);
2911 if (x == NULL) {
2912 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2913 /* No mapping found means: mapping is undefined. */
2914 PyErr_Clear();
2915 x = Py_None;
2916 Py_INCREF(x);
2917 } else
2918 goto onError;
2919 }
2920
2921 /* Apply mapping */
2922 if (PyInt_Check(x)) {
2923 long value = PyInt_AS_LONG(x);
2924 if (value < 0 || value > 65535) {
2925 PyErr_SetString(PyExc_TypeError,
2926 "character mapping must be in range(65536)");
2927 Py_DECREF(x);
2928 goto onError;
2929 }
2930 *p++ = (Py_UNICODE)value;
2931 }
2932 else if (x == Py_None) {
2933 /* undefined mapping */
2934 outpos = p-PyUnicode_AS_UNICODE(v);
2935 startinpos = s-starts;
2936 endinpos = startinpos+1;
2937 if (unicode_decode_call_errorhandler(
2938 errors, &errorHandler,
2939 "charmap", "character maps to <undefined>",
2940 starts, size, &startinpos, &endinpos, &exc, &s,
2941 (PyObject **)&v, &outpos, &p)) {
2942 Py_DECREF(x);
2943 goto onError;
2944 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002945 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002946 continue;
2947 }
2948 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002949 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002950
2951 if (targetsize == 1)
2952 /* 1-1 mapping */
2953 *p++ = *PyUnicode_AS_UNICODE(x);
2954
2955 else if (targetsize > 1) {
2956 /* 1-n mapping */
2957 if (targetsize > extrachars) {
2958 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002959 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
2960 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002961 (targetsize << 2);
2962 extrachars += needed;
2963 if (_PyUnicode_Resize(&v,
2964 PyUnicode_GET_SIZE(v) + needed) < 0) {
2965 Py_DECREF(x);
2966 goto onError;
2967 }
2968 p = PyUnicode_AS_UNICODE(v) + oldpos;
2969 }
2970 Py_UNICODE_COPY(p,
2971 PyUnicode_AS_UNICODE(x),
2972 targetsize);
2973 p += targetsize;
2974 extrachars -= targetsize;
2975 }
2976 /* 1-0 mapping: skip the character */
2977 }
2978 else {
2979 /* wrong return value */
2980 PyErr_SetString(PyExc_TypeError,
2981 "character mapping must return integer, None or unicode");
2982 Py_DECREF(x);
2983 goto onError;
2984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002986 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 }
2989 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002990 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002992 Py_XDECREF(errorHandler);
2993 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002995
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997 Py_XDECREF(errorHandler);
2998 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 Py_XDECREF(v);
3000 return NULL;
3001}
3002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003/* Lookup the character ch in the mapping. If the character
3004 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003005 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003008 PyObject *w = PyInt_FromLong((long)c);
3009 PyObject *x;
3010
3011 if (w == NULL)
3012 return NULL;
3013 x = PyObject_GetItem(mapping, w);
3014 Py_DECREF(w);
3015 if (x == NULL) {
3016 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3017 /* No mapping found means: mapping is undefined. */
3018 PyErr_Clear();
3019 x = Py_None;
3020 Py_INCREF(x);
3021 return x;
3022 } else
3023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003025 else if (x == Py_None)
3026 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003027 else if (PyInt_Check(x)) {
3028 long value = PyInt_AS_LONG(x);
3029 if (value < 0 || value > 255) {
3030 PyErr_SetString(PyExc_TypeError,
3031 "character mapping must be in range(256)");
3032 Py_DECREF(x);
3033 return NULL;
3034 }
3035 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 else if (PyString_Check(x))
3038 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 /* wrong return value */
3041 PyErr_SetString(PyExc_TypeError,
3042 "character mapping must return integer, None or str");
3043 Py_DECREF(x);
3044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 }
3046}
3047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048/* lookup the character, put the result in the output string and adjust
3049 various state variables. Reallocate the output string if not enough
3050 space is available. Return a new reference to the object that
3051 was put in the output buffer, or Py_None, if the mapping was undefined
3052 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003053 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054static
3055PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003056 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057{
3058 PyObject *rep = charmapencode_lookup(c, mapping);
3059
3060 if (rep==NULL)
3061 return NULL;
3062 else if (rep==Py_None)
3063 return rep;
3064 else {
3065 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003066 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003067 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003068 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003069 if (outsize<requiredsize) {
3070 /* exponentially overallocate to minimize reallocations */
3071 if (requiredsize < 2*outsize)
3072 requiredsize = 2*outsize;
3073 if (_PyString_Resize(outobj, requiredsize)) {
3074 Py_DECREF(rep);
3075 return NULL;
3076 }
3077 outstart = PyString_AS_STRING(*outobj);
3078 }
3079 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3080 }
3081 else {
3082 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003083 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3084 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 if (outsize<requiredsize) {
3086 /* exponentially overallocate to minimize reallocations */
3087 if (requiredsize < 2*outsize)
3088 requiredsize = 2*outsize;
3089 if (_PyString_Resize(outobj, requiredsize)) {
3090 Py_DECREF(rep);
3091 return NULL;
3092 }
3093 outstart = PyString_AS_STRING(*outobj);
3094 }
3095 memcpy(outstart + *outpos, repchars, repsize);
3096 *outpos += repsize;
3097 }
3098 }
3099 return rep;
3100}
3101
3102/* handle an error in PyUnicode_EncodeCharmap
3103 Return 0 on success, -1 on error */
3104static
3105int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003106 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003108 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003109 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110{
3111 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003112 Py_ssize_t repsize;
3113 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 Py_UNICODE *uni2;
3115 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003116 Py_ssize_t collstartpos = *inpos;
3117 Py_ssize_t collendpos = *inpos+1;
3118 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 char *encoding = "charmap";
3120 char *reason = "character maps to <undefined>";
3121
3122 PyObject *x;
3123 /* find all unencodable characters */
3124 while (collendpos < size) {
3125 x = charmapencode_lookup(p[collendpos], mapping);
3126 if (x==NULL)
3127 return -1;
3128 else if (x!=Py_None) {
3129 Py_DECREF(x);
3130 break;
3131 }
3132 Py_DECREF(x);
3133 ++collendpos;
3134 }
3135 /* cache callback name lookup
3136 * (if not done yet, i.e. it's the first error) */
3137 if (*known_errorHandler==-1) {
3138 if ((errors==NULL) || (!strcmp(errors, "strict")))
3139 *known_errorHandler = 1;
3140 else if (!strcmp(errors, "replace"))
3141 *known_errorHandler = 2;
3142 else if (!strcmp(errors, "ignore"))
3143 *known_errorHandler = 3;
3144 else if (!strcmp(errors, "xmlcharrefreplace"))
3145 *known_errorHandler = 4;
3146 else
3147 *known_errorHandler = 0;
3148 }
3149 switch (*known_errorHandler) {
3150 case 1: /* strict */
3151 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3152 return -1;
3153 case 2: /* replace */
3154 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3155 x = charmapencode_output('?', mapping, res, respos);
3156 if (x==NULL) {
3157 return -1;
3158 }
3159 else if (x==Py_None) {
3160 Py_DECREF(x);
3161 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3162 return -1;
3163 }
3164 Py_DECREF(x);
3165 }
3166 /* fall through */
3167 case 3: /* ignore */
3168 *inpos = collendpos;
3169 break;
3170 case 4: /* xmlcharrefreplace */
3171 /* generate replacement (temporarily (mis)uses p) */
3172 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3173 char buffer[2+29+1+1];
3174 char *cp;
3175 sprintf(buffer, "&#%d;", (int)p[collpos]);
3176 for (cp = buffer; *cp; ++cp) {
3177 x = charmapencode_output(*cp, mapping, res, respos);
3178 if (x==NULL)
3179 return -1;
3180 else if (x==Py_None) {
3181 Py_DECREF(x);
3182 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3183 return -1;
3184 }
3185 Py_DECREF(x);
3186 }
3187 }
3188 *inpos = collendpos;
3189 break;
3190 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003191 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 encoding, reason, p, size, exceptionObject,
3193 collstartpos, collendpos, &newpos);
3194 if (repunicode == NULL)
3195 return -1;
3196 /* generate replacement */
3197 repsize = PyUnicode_GET_SIZE(repunicode);
3198 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3199 x = charmapencode_output(*uni2, mapping, res, respos);
3200 if (x==NULL) {
3201 Py_DECREF(repunicode);
3202 return -1;
3203 }
3204 else if (x==Py_None) {
3205 Py_DECREF(repunicode);
3206 Py_DECREF(x);
3207 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3208 return -1;
3209 }
3210 Py_DECREF(x);
3211 }
3212 *inpos = newpos;
3213 Py_DECREF(repunicode);
3214 }
3215 return 0;
3216}
3217
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003219 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 PyObject *mapping,
3221 const char *errors)
3222{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 /* output object */
3224 PyObject *res = NULL;
3225 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003226 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003228 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003229 PyObject *errorHandler = NULL;
3230 PyObject *exc = NULL;
3231 /* the following variable is used for caching string comparisons
3232 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3233 * 3=ignore, 4=xmlcharrefreplace */
3234 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235
3236 /* Default to Latin-1 */
3237 if (mapping == NULL)
3238 return PyUnicode_EncodeLatin1(p, size, errors);
3239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 /* allocate enough for a simple encoding without
3241 replacements, if we need more, we'll resize */
3242 res = PyString_FromStringAndSize(NULL, size);
3243 if (res == NULL)
3244 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003245 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 while (inpos<size) {
3249 /* try to encode it */
3250 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3251 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 if (x==Py_None) { /* unencodable character */
3254 if (charmap_encoding_error(p, size, &inpos, mapping,
3255 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003256 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003257 &res, &respos)) {
3258 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003259 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 else
3263 /* done with this character => adjust input position */
3264 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 Py_DECREF(x);
3266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 /* Resize if we allocated to much */
3269 if (respos<PyString_GET_SIZE(res)) {
3270 if (_PyString_Resize(&res, respos))
3271 goto onError;
3272 }
3273 Py_XDECREF(exc);
3274 Py_XDECREF(errorHandler);
3275 return res;
3276
3277 onError:
3278 Py_XDECREF(res);
3279 Py_XDECREF(exc);
3280 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 return NULL;
3282}
3283
3284PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3285 PyObject *mapping)
3286{
3287 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3288 PyErr_BadArgument();
3289 return NULL;
3290 }
3291 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3292 PyUnicode_GET_SIZE(unicode),
3293 mapping,
3294 NULL);
3295}
3296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297/* create or adjust a UnicodeTranslateError */
3298static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003299 const Py_UNICODE *unicode, Py_ssize_t size,
3300 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003301 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 if (*exceptionObject == NULL) {
3304 *exceptionObject = PyUnicodeTranslateError_Create(
3305 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 }
3307 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3309 goto onError;
3310 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3311 goto onError;
3312 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3313 goto onError;
3314 return;
3315 onError:
3316 Py_DECREF(*exceptionObject);
3317 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 }
3319}
3320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321/* raises a UnicodeTranslateError */
3322static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003323 const Py_UNICODE *unicode, Py_ssize_t size,
3324 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003325 const char *reason)
3326{
3327 make_translate_exception(exceptionObject,
3328 unicode, size, startpos, endpos, reason);
3329 if (*exceptionObject != NULL)
3330 PyCodec_StrictErrors(*exceptionObject);
3331}
3332
3333/* error handling callback helper:
3334 build arguments, call the callback and check the arguments,
3335 put the result into newpos and return the replacement string, which
3336 has to be freed by the caller */
3337static PyObject *unicode_translate_call_errorhandler(const char *errors,
3338 PyObject **errorHandler,
3339 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003340 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3341 Py_ssize_t startpos, Py_ssize_t endpos,
3342 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343{
3344 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3345
Martin v. Löwis18e16552006-02-15 17:27:45 +00003346 int i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347 PyObject *restuple;
3348 PyObject *resunicode;
3349
3350 if (*errorHandler == NULL) {
3351 *errorHandler = PyCodec_LookupError(errors);
3352 if (*errorHandler == NULL)
3353 return NULL;
3354 }
3355
3356 make_translate_exception(exceptionObject,
3357 unicode, size, startpos, endpos, reason);
3358 if (*exceptionObject == NULL)
3359 return NULL;
3360
3361 restuple = PyObject_CallFunctionObjArgs(
3362 *errorHandler, *exceptionObject, NULL);
3363 if (restuple == NULL)
3364 return NULL;
3365 if (!PyTuple_Check(restuple)) {
3366 PyErr_Format(PyExc_TypeError, &argparse[4]);
3367 Py_DECREF(restuple);
3368 return NULL;
3369 }
3370 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003371 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 Py_DECREF(restuple);
3373 return NULL;
3374 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 if (i_newpos<0)
3376 *newpos = size+i_newpos;
3377 else
3378 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003379 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003380 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003381 Py_DECREF(restuple);
3382 return NULL;
3383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 Py_INCREF(resunicode);
3385 Py_DECREF(restuple);
3386 return resunicode;
3387}
3388
3389/* Lookup the character ch in the mapping and put the result in result,
3390 which must be decrefed by the caller.
3391 Return 0 on success, -1 on error */
3392static
3393int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3394{
3395 PyObject *w = PyInt_FromLong((long)c);
3396 PyObject *x;
3397
3398 if (w == NULL)
3399 return -1;
3400 x = PyObject_GetItem(mapping, w);
3401 Py_DECREF(w);
3402 if (x == NULL) {
3403 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3404 /* No mapping found means: use 1:1 mapping. */
3405 PyErr_Clear();
3406 *result = NULL;
3407 return 0;
3408 } else
3409 return -1;
3410 }
3411 else if (x == Py_None) {
3412 *result = x;
3413 return 0;
3414 }
3415 else if (PyInt_Check(x)) {
3416 long value = PyInt_AS_LONG(x);
3417 long max = PyUnicode_GetMax();
3418 if (value < 0 || value > max) {
3419 PyErr_Format(PyExc_TypeError,
3420 "character mapping must be in range(0x%lx)", max+1);
3421 Py_DECREF(x);
3422 return -1;
3423 }
3424 *result = x;
3425 return 0;
3426 }
3427 else if (PyUnicode_Check(x)) {
3428 *result = x;
3429 return 0;
3430 }
3431 else {
3432 /* wrong return value */
3433 PyErr_SetString(PyExc_TypeError,
3434 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003435 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 return -1;
3437 }
3438}
3439/* ensure that *outobj is at least requiredsize characters long,
3440if not reallocate and adjust various state variables.
3441Return 0 on success, -1 on error */
3442static
Walter Dörwald4894c302003-10-24 14:25:28 +00003443int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003444 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003446 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003447 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003449 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003451 if (requiredsize < 2 * oldsize)
3452 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003453 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 return -1;
3455 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 }
3457 return 0;
3458}
3459/* lookup the character, put the result in the output string and adjust
3460 various state variables. Return a new reference to the object that
3461 was put in the output buffer in *result, or Py_None, if the mapping was
3462 undefined (in which case no character was written).
3463 The called must decref result.
3464 Return 0 on success, -1 on error. */
3465static
Walter Dörwald4894c302003-10-24 14:25:28 +00003466int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003467 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003468 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469{
Walter Dörwald4894c302003-10-24 14:25:28 +00003470 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 return -1;
3472 if (*res==NULL) {
3473 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003474 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 }
3476 else if (*res==Py_None)
3477 ;
3478 else if (PyInt_Check(*res)) {
3479 /* no overflow check, because we know that the space is enough */
3480 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3481 }
3482 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003483 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 if (repsize==1) {
3485 /* no overflow check, because we know that the space is enough */
3486 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3487 }
3488 else if (repsize!=0) {
3489 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003490 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003491 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003492 repsize - 1;
3493 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 return -1;
3495 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3496 *outp += repsize;
3497 }
3498 }
3499 else
3500 return -1;
3501 return 0;
3502}
3503
3504PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003505 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 PyObject *mapping,
3507 const char *errors)
3508{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 /* output object */
3510 PyObject *res = NULL;
3511 /* pointers to the beginning and end+1 of input */
3512 const Py_UNICODE *startp = p;
3513 const Py_UNICODE *endp = p + size;
3514 /* pointer into the output */
3515 Py_UNICODE *str;
3516 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003517 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 char *reason = "character maps to <undefined>";
3519 PyObject *errorHandler = NULL;
3520 PyObject *exc = NULL;
3521 /* the following variable is used for caching string comparisons
3522 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3523 * 3=ignore, 4=xmlcharrefreplace */
3524 int known_errorHandler = -1;
3525
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 if (mapping == NULL) {
3527 PyErr_BadArgument();
3528 return NULL;
3529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530
3531 /* allocate enough for a simple 1:1 translation without
3532 replacements, if we need more, we'll resize */
3533 res = PyUnicode_FromUnicode(NULL, size);
3534 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003535 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 return res;
3538 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 while (p<endp) {
3541 /* try to encode it */
3542 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003543 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545 goto onError;
3546 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003547 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (x!=Py_None) /* it worked => adjust input pointer */
3549 ++p;
3550 else { /* untranslatable character */
3551 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003552 Py_ssize_t repsize;
3553 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 Py_UNICODE *uni2;
3555 /* startpos for collecting untranslatable chars */
3556 const Py_UNICODE *collstart = p;
3557 const Py_UNICODE *collend = p+1;
3558 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 /* find all untranslatable characters */
3561 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003562 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 goto onError;
3564 Py_XDECREF(x);
3565 if (x!=Py_None)
3566 break;
3567 ++collend;
3568 }
3569 /* cache callback name lookup
3570 * (if not done yet, i.e. it's the first error) */
3571 if (known_errorHandler==-1) {
3572 if ((errors==NULL) || (!strcmp(errors, "strict")))
3573 known_errorHandler = 1;
3574 else if (!strcmp(errors, "replace"))
3575 known_errorHandler = 2;
3576 else if (!strcmp(errors, "ignore"))
3577 known_errorHandler = 3;
3578 else if (!strcmp(errors, "xmlcharrefreplace"))
3579 known_errorHandler = 4;
3580 else
3581 known_errorHandler = 0;
3582 }
3583 switch (known_errorHandler) {
3584 case 1: /* strict */
3585 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3586 goto onError;
3587 case 2: /* replace */
3588 /* No need to check for space, this is a 1:1 replacement */
3589 for (coll = collstart; coll<collend; ++coll)
3590 *str++ = '?';
3591 /* fall through */
3592 case 3: /* ignore */
3593 p = collend;
3594 break;
3595 case 4: /* xmlcharrefreplace */
3596 /* generate replacement (temporarily (mis)uses p) */
3597 for (p = collstart; p < collend; ++p) {
3598 char buffer[2+29+1+1];
3599 char *cp;
3600 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003601 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3603 goto onError;
3604 for (cp = buffer; *cp; ++cp)
3605 *str++ = *cp;
3606 }
3607 p = collend;
3608 break;
3609 default:
3610 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3611 reason, startp, size, &exc,
3612 collstart-startp, collend-startp, &newpos);
3613 if (repunicode == NULL)
3614 goto onError;
3615 /* generate replacement */
3616 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003617 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3619 Py_DECREF(repunicode);
3620 goto onError;
3621 }
3622 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3623 *str++ = *uni2;
3624 p = startp + newpos;
3625 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 }
3627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 /* Resize if we allocated to much */
3630 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003631 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003632 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003633 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 }
3635 Py_XDECREF(exc);
3636 Py_XDECREF(errorHandler);
3637 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 onError:
3640 Py_XDECREF(res);
3641 Py_XDECREF(exc);
3642 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 return NULL;
3644}
3645
3646PyObject *PyUnicode_Translate(PyObject *str,
3647 PyObject *mapping,
3648 const char *errors)
3649{
3650 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 str = PyUnicode_FromObject(str);
3653 if (str == NULL)
3654 goto onError;
3655 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3656 PyUnicode_GET_SIZE(str),
3657 mapping,
3658 errors);
3659 Py_DECREF(str);
3660 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003661
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 onError:
3663 Py_XDECREF(str);
3664 return NULL;
3665}
Tim Petersced69f82003-09-16 20:30:58 +00003666
Guido van Rossum9e896b32000-04-05 20:11:21 +00003667/* --- Decimal Encoder ---------------------------------------------------- */
3668
3669int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003670 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003671 char *output,
3672 const char *errors)
3673{
3674 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 PyObject *errorHandler = NULL;
3676 PyObject *exc = NULL;
3677 const char *encoding = "decimal";
3678 const char *reason = "invalid decimal Unicode string";
3679 /* the following variable is used for caching string comparisons
3680 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3681 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003682
3683 if (output == NULL) {
3684 PyErr_BadArgument();
3685 return -1;
3686 }
3687
3688 p = s;
3689 end = s + length;
3690 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003692 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003694 Py_ssize_t repsize;
3695 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 Py_UNICODE *uni2;
3697 Py_UNICODE *collstart;
3698 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003699
Guido van Rossum9e896b32000-04-05 20:11:21 +00003700 if (Py_UNICODE_ISSPACE(ch)) {
3701 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003703 continue;
3704 }
3705 decimal = Py_UNICODE_TODECIMAL(ch);
3706 if (decimal >= 0) {
3707 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003709 continue;
3710 }
Guido van Rossumba477042000-04-06 18:18:10 +00003711 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003712 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003714 continue;
3715 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 /* All other characters are considered unencodable */
3717 collstart = p;
3718 collend = p+1;
3719 while (collend < end) {
3720 if ((0 < *collend && *collend < 256) ||
3721 !Py_UNICODE_ISSPACE(*collend) ||
3722 Py_UNICODE_TODECIMAL(*collend))
3723 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 /* cache callback name lookup
3726 * (if not done yet, i.e. it's the first error) */
3727 if (known_errorHandler==-1) {
3728 if ((errors==NULL) || (!strcmp(errors, "strict")))
3729 known_errorHandler = 1;
3730 else if (!strcmp(errors, "replace"))
3731 known_errorHandler = 2;
3732 else if (!strcmp(errors, "ignore"))
3733 known_errorHandler = 3;
3734 else if (!strcmp(errors, "xmlcharrefreplace"))
3735 known_errorHandler = 4;
3736 else
3737 known_errorHandler = 0;
3738 }
3739 switch (known_errorHandler) {
3740 case 1: /* strict */
3741 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3742 goto onError;
3743 case 2: /* replace */
3744 for (p = collstart; p < collend; ++p)
3745 *output++ = '?';
3746 /* fall through */
3747 case 3: /* ignore */
3748 p = collend;
3749 break;
3750 case 4: /* xmlcharrefreplace */
3751 /* generate replacement (temporarily (mis)uses p) */
3752 for (p = collstart; p < collend; ++p)
3753 output += sprintf(output, "&#%d;", (int)*p);
3754 p = collend;
3755 break;
3756 default:
3757 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3758 encoding, reason, s, length, &exc,
3759 collstart-s, collend-s, &newpos);
3760 if (repunicode == NULL)
3761 goto onError;
3762 /* generate replacement */
3763 repsize = PyUnicode_GET_SIZE(repunicode);
3764 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3765 Py_UNICODE ch = *uni2;
3766 if (Py_UNICODE_ISSPACE(ch))
3767 *output++ = ' ';
3768 else {
3769 decimal = Py_UNICODE_TODECIMAL(ch);
3770 if (decimal >= 0)
3771 *output++ = '0' + decimal;
3772 else if (0 < ch && ch < 256)
3773 *output++ = (char)ch;
3774 else {
3775 Py_DECREF(repunicode);
3776 raise_encode_exception(&exc, encoding,
3777 s, length, collstart-s, collend-s, reason);
3778 goto onError;
3779 }
3780 }
3781 }
3782 p = s + newpos;
3783 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003784 }
3785 }
3786 /* 0-terminate the output string */
3787 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 Py_XDECREF(exc);
3789 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003790 return 0;
3791
3792 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 Py_XDECREF(exc);
3794 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003795 return -1;
3796}
3797
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798/* --- Helpers ------------------------------------------------------------ */
3799
Tim Petersced69f82003-09-16 20:30:58 +00003800static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003801Py_ssize_t count(PyUnicodeObject *self,
3802 Py_ssize_t start,
3803 Py_ssize_t end,
3804 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805{
3806 int count = 0;
3807
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003808 if (start < 0)
3809 start += self->length;
3810 if (start < 0)
3811 start = 0;
3812 if (end > self->length)
3813 end = self->length;
3814 if (end < 0)
3815 end += self->length;
3816 if (end < 0)
3817 end = 0;
3818
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003819 if (substring->length == 0)
3820 return (end - start + 1);
3821
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 end -= substring->length;
3823
3824 while (start <= end)
3825 if (Py_UNICODE_MATCH(self, start, substring)) {
3826 count++;
3827 start += substring->length;
3828 } else
3829 start++;
3830
3831 return count;
3832}
3833
Martin v. Löwis18e16552006-02-15 17:27:45 +00003834Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003836 Py_ssize_t start,
3837 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003839 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003840
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 str = PyUnicode_FromObject(str);
3842 if (str == NULL)
3843 return -1;
3844 substr = PyUnicode_FromObject(substr);
3845 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003846 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 return -1;
3848 }
Tim Petersced69f82003-09-16 20:30:58 +00003849
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 result = count((PyUnicodeObject *)str,
3851 start, end,
3852 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003853
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 Py_DECREF(str);
3855 Py_DECREF(substr);
3856 return result;
3857}
3858
Tim Petersced69f82003-09-16 20:30:58 +00003859static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003860Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003862 Py_ssize_t start,
3863 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 int direction)
3865{
3866 if (start < 0)
3867 start += self->length;
3868 if (start < 0)
3869 start = 0;
3870
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 if (end > self->length)
3872 end = self->length;
3873 if (end < 0)
3874 end += self->length;
3875 if (end < 0)
3876 end = 0;
3877
Guido van Rossum76afbd92002-08-20 17:29:29 +00003878 if (substring->length == 0)
3879 return (direction > 0) ? start : end;
3880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 end -= substring->length;
3882
3883 if (direction < 0) {
3884 for (; end >= start; end--)
3885 if (Py_UNICODE_MATCH(self, end, substring))
3886 return end;
3887 } else {
3888 for (; start <= end; start++)
3889 if (Py_UNICODE_MATCH(self, start, substring))
3890 return start;
3891 }
3892
3893 return -1;
3894}
3895
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003898 Py_ssize_t start,
3899 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 int direction)
3901{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003902 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003903
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 str = PyUnicode_FromObject(str);
3905 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003906 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 substr = PyUnicode_FromObject(substr);
3908 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003909 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003910 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 }
Tim Petersced69f82003-09-16 20:30:58 +00003912
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 result = findstring((PyUnicodeObject *)str,
3914 (PyUnicodeObject *)substr,
3915 start, end, direction);
3916 Py_DECREF(str);
3917 Py_DECREF(substr);
3918 return result;
3919}
3920
Tim Petersced69f82003-09-16 20:30:58 +00003921static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922int tailmatch(PyUnicodeObject *self,
3923 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003924 Py_ssize_t start,
3925 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 int direction)
3927{
3928 if (start < 0)
3929 start += self->length;
3930 if (start < 0)
3931 start = 0;
3932
3933 if (substring->length == 0)
3934 return 1;
3935
3936 if (end > self->length)
3937 end = self->length;
3938 if (end < 0)
3939 end += self->length;
3940 if (end < 0)
3941 end = 0;
3942
3943 end -= substring->length;
3944 if (end < start)
3945 return 0;
3946
3947 if (direction > 0) {
3948 if (Py_UNICODE_MATCH(self, end, substring))
3949 return 1;
3950 } else {
3951 if (Py_UNICODE_MATCH(self, start, substring))
3952 return 1;
3953 }
3954
3955 return 0;
3956}
3957
Martin v. Löwis18e16552006-02-15 17:27:45 +00003958Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003960 Py_ssize_t start,
3961 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 int direction)
3963{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003964 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003965
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 str = PyUnicode_FromObject(str);
3967 if (str == NULL)
3968 return -1;
3969 substr = PyUnicode_FromObject(substr);
3970 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003971 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 return -1;
3973 }
Tim Petersced69f82003-09-16 20:30:58 +00003974
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 result = tailmatch((PyUnicodeObject *)str,
3976 (PyUnicodeObject *)substr,
3977 start, end, direction);
3978 Py_DECREF(str);
3979 Py_DECREF(substr);
3980 return result;
3981}
3982
Tim Petersced69f82003-09-16 20:30:58 +00003983static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003985 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 Py_UNICODE ch)
3987{
3988 /* like wcschr, but doesn't stop at NULL characters */
3989
3990 while (size-- > 0) {
3991 if (*s == ch)
3992 return s;
3993 s++;
3994 }
3995
3996 return NULL;
3997}
3998
3999/* Apply fixfct filter to the Unicode object self and return a
4000 reference to the modified object */
4001
Tim Petersced69f82003-09-16 20:30:58 +00004002static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003PyObject *fixup(PyUnicodeObject *self,
4004 int (*fixfct)(PyUnicodeObject *s))
4005{
4006
4007 PyUnicodeObject *u;
4008
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004009 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 if (u == NULL)
4011 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004012
4013 Py_UNICODE_COPY(u->str, self->str, self->length);
4014
Tim Peters7a29bd52001-09-12 03:03:31 +00004015 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 /* fixfct should return TRUE if it modified the buffer. If
4017 FALSE, return a reference to the original buffer instead
4018 (to save space, not time) */
4019 Py_INCREF(self);
4020 Py_DECREF(u);
4021 return (PyObject*) self;
4022 }
4023 return (PyObject*) u;
4024}
4025
Tim Petersced69f82003-09-16 20:30:58 +00004026static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027int fixupper(PyUnicodeObject *self)
4028{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004029 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 Py_UNICODE *s = self->str;
4031 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004032
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 while (len-- > 0) {
4034 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 ch = Py_UNICODE_TOUPPER(*s);
4037 if (ch != *s) {
4038 status = 1;
4039 *s = ch;
4040 }
4041 s++;
4042 }
4043
4044 return status;
4045}
4046
Tim Petersced69f82003-09-16 20:30:58 +00004047static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048int fixlower(PyUnicodeObject *self)
4049{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004050 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 Py_UNICODE *s = self->str;
4052 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004053
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054 while (len-- > 0) {
4055 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 ch = Py_UNICODE_TOLOWER(*s);
4058 if (ch != *s) {
4059 status = 1;
4060 *s = ch;
4061 }
4062 s++;
4063 }
4064
4065 return status;
4066}
4067
Tim Petersced69f82003-09-16 20:30:58 +00004068static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069int fixswapcase(PyUnicodeObject *self)
4070{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004071 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 Py_UNICODE *s = self->str;
4073 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004074
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 while (len-- > 0) {
4076 if (Py_UNICODE_ISUPPER(*s)) {
4077 *s = Py_UNICODE_TOLOWER(*s);
4078 status = 1;
4079 } else if (Py_UNICODE_ISLOWER(*s)) {
4080 *s = Py_UNICODE_TOUPPER(*s);
4081 status = 1;
4082 }
4083 s++;
4084 }
4085
4086 return status;
4087}
4088
Tim Petersced69f82003-09-16 20:30:58 +00004089static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090int fixcapitalize(PyUnicodeObject *self)
4091{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004092 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004093 Py_UNICODE *s = self->str;
4094 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004095
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004096 if (len == 0)
4097 return 0;
4098 if (Py_UNICODE_ISLOWER(*s)) {
4099 *s = Py_UNICODE_TOUPPER(*s);
4100 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004102 s++;
4103 while (--len > 0) {
4104 if (Py_UNICODE_ISUPPER(*s)) {
4105 *s = Py_UNICODE_TOLOWER(*s);
4106 status = 1;
4107 }
4108 s++;
4109 }
4110 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111}
4112
4113static
4114int fixtitle(PyUnicodeObject *self)
4115{
4116 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4117 register Py_UNICODE *e;
4118 int previous_is_cased;
4119
4120 /* Shortcut for single character strings */
4121 if (PyUnicode_GET_SIZE(self) == 1) {
4122 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4123 if (*p != ch) {
4124 *p = ch;
4125 return 1;
4126 }
4127 else
4128 return 0;
4129 }
Tim Petersced69f82003-09-16 20:30:58 +00004130
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 e = p + PyUnicode_GET_SIZE(self);
4132 previous_is_cased = 0;
4133 for (; p < e; p++) {
4134 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004135
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 if (previous_is_cased)
4137 *p = Py_UNICODE_TOLOWER(ch);
4138 else
4139 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004140
4141 if (Py_UNICODE_ISLOWER(ch) ||
4142 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 Py_UNICODE_ISTITLE(ch))
4144 previous_is_cased = 1;
4145 else
4146 previous_is_cased = 0;
4147 }
4148 return 1;
4149}
4150
Tim Peters8ce9f162004-08-27 01:49:32 +00004151PyObject *
4152PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153{
Tim Peters8ce9f162004-08-27 01:49:32 +00004154 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004155 const Py_UNICODE blank = ' ';
4156 const Py_UNICODE *sep = &blank;
4157 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004158 PyUnicodeObject *res = NULL; /* the result */
4159 size_t res_alloc = 100; /* # allocated bytes for string in res */
4160 size_t res_used; /* # used bytes */
4161 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4162 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004163 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004164 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 int i;
4166
Tim Peters05eba1f2004-08-27 21:32:02 +00004167 fseq = PySequence_Fast(seq, "");
4168 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004169 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004170 }
4171
Tim Peters91879ab2004-08-27 22:35:44 +00004172 /* Grrrr. A codec may be invoked to convert str objects to
4173 * Unicode, and so it's possible to call back into Python code
4174 * during PyUnicode_FromObject(), and so it's possible for a sick
4175 * codec to change the size of fseq (if seq is a list). Therefore
4176 * we have to keep refetching the size -- can't assume seqlen
4177 * is invariant.
4178 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004179 seqlen = PySequence_Fast_GET_SIZE(fseq);
4180 /* If empty sequence, return u"". */
4181 if (seqlen == 0) {
4182 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4183 goto Done;
4184 }
4185 /* If singleton sequence with an exact Unicode, return that. */
4186 if (seqlen == 1) {
4187 item = PySequence_Fast_GET_ITEM(fseq, 0);
4188 if (PyUnicode_CheckExact(item)) {
4189 Py_INCREF(item);
4190 res = (PyUnicodeObject *)item;
4191 goto Done;
4192 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004193 }
4194
Tim Peters05eba1f2004-08-27 21:32:02 +00004195 /* At least two items to join, or one that isn't exact Unicode. */
4196 if (seqlen > 1) {
4197 /* Set up sep and seplen -- they're needed. */
4198 if (separator == NULL) {
4199 sep = &blank;
4200 seplen = 1;
4201 }
4202 else {
4203 internal_separator = PyUnicode_FromObject(separator);
4204 if (internal_separator == NULL)
4205 goto onError;
4206 sep = PyUnicode_AS_UNICODE(internal_separator);
4207 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004208 /* In case PyUnicode_FromObject() mutated seq. */
4209 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004210 }
4211 }
4212
4213 /* Get space. */
4214 res = _PyUnicode_New((int)res_alloc);
4215 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004216 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004217 res_p = PyUnicode_AS_UNICODE(res);
4218 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004219
Tim Peters05eba1f2004-08-27 21:32:02 +00004220 for (i = 0; i < seqlen; ++i) {
4221 size_t itemlen;
4222 size_t new_res_used;
4223
4224 item = PySequence_Fast_GET_ITEM(fseq, i);
4225 /* Convert item to Unicode. */
4226 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4227 PyErr_Format(PyExc_TypeError,
4228 "sequence item %i: expected string or Unicode,"
4229 " %.80s found",
4230 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004231 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004232 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004233 item = PyUnicode_FromObject(item);
4234 if (item == NULL)
4235 goto onError;
4236 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004237
Tim Peters91879ab2004-08-27 22:35:44 +00004238 /* In case PyUnicode_FromObject() mutated seq. */
4239 seqlen = PySequence_Fast_GET_SIZE(fseq);
4240
Tim Peters8ce9f162004-08-27 01:49:32 +00004241 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004243 new_res_used = res_used + itemlen;
4244 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004245 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004246 if (i < seqlen - 1) {
4247 new_res_used += seplen;
4248 if (new_res_used < res_used || new_res_used > INT_MAX)
4249 goto Overflow;
4250 }
4251 if (new_res_used > res_alloc) {
4252 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004253 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004254 size_t oldsize = res_alloc;
4255 res_alloc += res_alloc;
4256 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004257 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004258 } while (new_res_used > res_alloc);
4259 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004260 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004262 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004263 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004265
4266 /* Copy item, and maybe the separator. */
4267 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4268 res_p += itemlen;
4269 if (i < seqlen - 1) {
4270 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4271 res_p += seplen;
4272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004274 res_used = new_res_used;
4275 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004276
Tim Peters05eba1f2004-08-27 21:32:02 +00004277 /* Shrink res to match the used area; this probably can't fail,
4278 * but it's cheap to check.
4279 */
4280 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004281 goto onError;
4282
4283 Done:
4284 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004285 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 return (PyObject *)res;
4287
Tim Peters8ce9f162004-08-27 01:49:32 +00004288 Overflow:
4289 PyErr_SetString(PyExc_OverflowError,
4290 "join() is too long for a Python string");
4291 Py_DECREF(item);
4292 /* fall through */
4293
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004295 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004296 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004297 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 return NULL;
4299}
4300
Tim Petersced69f82003-09-16 20:30:58 +00004301static
4302PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004303 Py_ssize_t left,
4304 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 Py_UNICODE fill)
4306{
4307 PyUnicodeObject *u;
4308
4309 if (left < 0)
4310 left = 0;
4311 if (right < 0)
4312 right = 0;
4313
Tim Peters7a29bd52001-09-12 03:03:31 +00004314 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 Py_INCREF(self);
4316 return self;
4317 }
4318
4319 u = _PyUnicode_New(left + self->length + right);
4320 if (u) {
4321 if (left)
4322 Py_UNICODE_FILL(u->str, fill, left);
4323 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4324 if (right)
4325 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4326 }
4327
4328 return u;
4329}
4330
/* Append data[left:right] as a new Unicode object to `list`.

   Expects the using function to declare `PyObject *str` and `list` and
   to provide an `onError` label, which is jumped to if the object
   cannot be created or appended.  Wrapped in do { } while (0) so that
   the expansion is a single statement wherever it is used. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4341
/* Insert data[left:right] as a new Unicode object at the front of
   `list`.

   Expects the using function to declare `PyObject *str` and `list` and
   to provide an `onError` label, which is jumped to if the object
   cannot be created or inserted.  Wrapped in do { } while (0) so that
   the expansion is a single statement wherever it is used. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4352
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353static
4354PyObject *split_whitespace(PyUnicodeObject *self,
4355 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004356 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004358 register Py_ssize_t i;
4359 register Py_ssize_t j;
4360 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 PyObject *str;
4362
4363 for (i = j = 0; i < len; ) {
4364 /* find a token */
4365 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4366 i++;
4367 j = i;
4368 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4369 i++;
4370 if (j < i) {
4371 if (maxcount-- <= 0)
4372 break;
4373 SPLIT_APPEND(self->str, j, i);
4374 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4375 i++;
4376 j = i;
4377 }
4378 }
4379 if (j < len) {
4380 SPLIT_APPEND(self->str, j, len);
4381 }
4382 return list;
4383
4384 onError:
4385 Py_DECREF(list);
4386 return NULL;
4387}
4388
4389PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004390 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004392 register Py_ssize_t i;
4393 register Py_ssize_t j;
4394 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 PyObject *list;
4396 PyObject *str;
4397 Py_UNICODE *data;
4398
4399 string = PyUnicode_FromObject(string);
4400 if (string == NULL)
4401 return NULL;
4402 data = PyUnicode_AS_UNICODE(string);
4403 len = PyUnicode_GET_SIZE(string);
4404
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 list = PyList_New(0);
4406 if (!list)
4407 goto onError;
4408
4409 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004410 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004411
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 /* Find a line and append it */
4413 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4414 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415
4416 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004417 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 if (i < len) {
4419 if (data[i] == '\r' && i + 1 < len &&
4420 data[i+1] == '\n')
4421 i += 2;
4422 else
4423 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004424 if (keepends)
4425 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 }
Guido van Rossum86662912000-04-11 15:38:46 +00004427 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 j = i;
4429 }
4430 if (j < len) {
4431 SPLIT_APPEND(data, j, len);
4432 }
4433
4434 Py_DECREF(string);
4435 return list;
4436
4437 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004438 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 Py_DECREF(string);
4440 return NULL;
4441}
4442
Tim Petersced69f82003-09-16 20:30:58 +00004443static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444PyObject *split_char(PyUnicodeObject *self,
4445 PyObject *list,
4446 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004447 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 register Py_ssize_t i;
4450 register Py_ssize_t j;
4451 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 PyObject *str;
4453
4454 for (i = j = 0; i < len; ) {
4455 if (self->str[i] == ch) {
4456 if (maxcount-- <= 0)
4457 break;
4458 SPLIT_APPEND(self->str, j, i);
4459 i = j = i + 1;
4460 } else
4461 i++;
4462 }
4463 if (j <= len) {
4464 SPLIT_APPEND(self->str, j, len);
4465 }
4466 return list;
4467
4468 onError:
4469 Py_DECREF(list);
4470 return NULL;
4471}
4472
Tim Petersced69f82003-09-16 20:30:58 +00004473static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474PyObject *split_substring(PyUnicodeObject *self,
4475 PyObject *list,
4476 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004477 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004479 register Py_ssize_t i;
4480 register Py_ssize_t j;
4481 Py_ssize_t len = self->length;
4482 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 PyObject *str;
4484
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004485 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 if (Py_UNICODE_MATCH(self, i, substring)) {
4487 if (maxcount-- <= 0)
4488 break;
4489 SPLIT_APPEND(self->str, j, i);
4490 i = j = i + sublen;
4491 } else
4492 i++;
4493 }
4494 if (j <= len) {
4495 SPLIT_APPEND(self->str, j, len);
4496 }
4497 return list;
4498
4499 onError:
4500 Py_DECREF(list);
4501 return NULL;
4502}
4503
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004504static
4505PyObject *rsplit_whitespace(PyUnicodeObject *self,
4506 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004507 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004509 register Py_ssize_t i;
4510 register Py_ssize_t j;
4511 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004512 PyObject *str;
4513
4514 for (i = j = len - 1; i >= 0; ) {
4515 /* find a token */
4516 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4517 i--;
4518 j = i;
4519 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4520 i--;
4521 if (j > i) {
4522 if (maxcount-- <= 0)
4523 break;
4524 SPLIT_INSERT(self->str, i + 1, j + 1);
4525 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4526 i--;
4527 j = i;
4528 }
4529 }
4530 if (j >= 0) {
4531 SPLIT_INSERT(self->str, 0, j + 1);
4532 }
4533 return list;
4534
4535 onError:
4536 Py_DECREF(list);
4537 return NULL;
4538}
4539
4540static
4541PyObject *rsplit_char(PyUnicodeObject *self,
4542 PyObject *list,
4543 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004544 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004545{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004546 register Py_ssize_t i;
4547 register Py_ssize_t j;
4548 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004549 PyObject *str;
4550
4551 for (i = j = len - 1; i >= 0; ) {
4552 if (self->str[i] == ch) {
4553 if (maxcount-- <= 0)
4554 break;
4555 SPLIT_INSERT(self->str, i + 1, j + 1);
4556 j = i = i - 1;
4557 } else
4558 i--;
4559 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004560 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004561 SPLIT_INSERT(self->str, 0, j + 1);
4562 }
4563 return list;
4564
4565 onError:
4566 Py_DECREF(list);
4567 return NULL;
4568}
4569
4570static
4571PyObject *rsplit_substring(PyUnicodeObject *self,
4572 PyObject *list,
4573 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004574 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004575{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004576 register Py_ssize_t i;
4577 register Py_ssize_t j;
4578 Py_ssize_t len = self->length;
4579 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004580 PyObject *str;
4581
4582 for (i = len - sublen, j = len; i >= 0; ) {
4583 if (Py_UNICODE_MATCH(self, i, substring)) {
4584 if (maxcount-- <= 0)
4585 break;
4586 SPLIT_INSERT(self->str, i + sublen, j);
4587 j = i;
4588 i -= sublen;
4589 } else
4590 i--;
4591 }
4592 if (j >= 0) {
4593 SPLIT_INSERT(self->str, 0, j);
4594 }
4595 return list;
4596
4597 onError:
4598 Py_DECREF(list);
4599 return NULL;
4600}
4601
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004603#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604
4605static
4606PyObject *split(PyUnicodeObject *self,
4607 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004608 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609{
4610 PyObject *list;
4611
4612 if (maxcount < 0)
4613 maxcount = INT_MAX;
4614
4615 list = PyList_New(0);
4616 if (!list)
4617 return NULL;
4618
4619 if (substring == NULL)
4620 return split_whitespace(self,list,maxcount);
4621
4622 else if (substring->length == 1)
4623 return split_char(self,list,substring->str[0],maxcount);
4624
4625 else if (substring->length == 0) {
4626 Py_DECREF(list);
4627 PyErr_SetString(PyExc_ValueError, "empty separator");
4628 return NULL;
4629 }
4630 else
4631 return split_substring(self,list,substring,maxcount);
4632}
4633
Tim Petersced69f82003-09-16 20:30:58 +00004634static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004635PyObject *rsplit(PyUnicodeObject *self,
4636 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004637 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004638{
4639 PyObject *list;
4640
4641 if (maxcount < 0)
4642 maxcount = INT_MAX;
4643
4644 list = PyList_New(0);
4645 if (!list)
4646 return NULL;
4647
4648 if (substring == NULL)
4649 return rsplit_whitespace(self,list,maxcount);
4650
4651 else if (substring->length == 1)
4652 return rsplit_char(self,list,substring->str[0],maxcount);
4653
4654 else if (substring->length == 0) {
4655 Py_DECREF(list);
4656 PyErr_SetString(PyExc_ValueError, "empty separator");
4657 return NULL;
4658 }
4659 else
4660 return rsplit_substring(self,list,substring,maxcount);
4661}
4662
4663static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664PyObject *replace(PyUnicodeObject *self,
4665 PyUnicodeObject *str1,
4666 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004667 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668{
4669 PyUnicodeObject *u;
4670
4671 if (maxcount < 0)
4672 maxcount = INT_MAX;
4673
4674 if (str1->length == 1 && str2->length == 1) {
4675 int i;
4676
4677 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004678 if (!findchar(self->str, self->length, str1->str[0]) &&
4679 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 /* nothing to replace, return original string */
4681 Py_INCREF(self);
4682 u = self;
4683 } else {
4684 Py_UNICODE u1 = str1->str[0];
4685 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004686
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004688 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 self->length
4690 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004691 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004692 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004693 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 for (i = 0; i < u->length; i++)
4695 if (u->str[i] == u1) {
4696 if (--maxcount < 0)
4697 break;
4698 u->str[i] = u2;
4699 }
4700 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702
4703 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004704 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 Py_UNICODE *p;
4706
4707 /* replace strings */
4708 n = count(self, 0, self->length, str1);
4709 if (n > maxcount)
4710 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004711 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004713 if (PyUnicode_CheckExact(self)) {
4714 Py_INCREF(self);
4715 u = self;
4716 }
4717 else {
4718 u = (PyUnicodeObject *)
4719 PyUnicode_FromUnicode(self->str, self->length);
4720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 } else {
4722 u = _PyUnicode_New(
4723 self->length + n * (str2->length - str1->length));
4724 if (u) {
4725 i = 0;
4726 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004727 if (str1->length > 0) {
4728 while (i <= self->length - str1->length)
4729 if (Py_UNICODE_MATCH(self, i, str1)) {
4730 /* replace string segment */
4731 Py_UNICODE_COPY(p, str2->str, str2->length);
4732 p += str2->length;
4733 i += str1->length;
4734 if (--n <= 0) {
4735 /* copy remaining part */
4736 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4737 break;
4738 }
4739 } else
4740 *p++ = self->str[i++];
4741 } else {
4742 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 Py_UNICODE_COPY(p, str2->str, str2->length);
4744 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004745 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004748 }
4749 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751 }
4752 }
4753 }
Tim Petersced69f82003-09-16 20:30:58 +00004754
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 return (PyObject *) u;
4756}
4757
/* --- Unicode Object Methods --------------------------------------------- */
4759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004760PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761"S.title() -> unicode\n\
4762\n\
4763Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004764characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765
4766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004767unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 return fixup(self, fixtitle);
4770}
4771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004772PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773"S.capitalize() -> unicode\n\
4774\n\
4775Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004776have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777
4778static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004779unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 return fixup(self, fixcapitalize);
4782}
4783
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of unicode.capwords(): split self into a list,
   replace every list item by its fixup(..., fixcapitalize) result and join
   the list again.  Returns NULL if splitting, capitalizing or joining
   fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL)
            goto done;          /* joined is still NULL */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
4821
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004822/* Argument converter. Coerces to a single unicode character */
4823
4824static int
4825convert_uc(PyObject *obj, void *addr)
4826{
4827 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4828 PyObject *uniobj;
4829 Py_UNICODE *unistr;
4830
4831 uniobj = PyUnicode_FromObject(obj);
4832 if (uniobj == NULL) {
4833 PyErr_SetString(PyExc_TypeError,
4834 "The fill character cannot be converted to Unicode");
4835 return 0;
4836 }
4837 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4838 PyErr_SetString(PyExc_TypeError,
4839 "The fill character must be exactly one character long");
4840 Py_DECREF(uniobj);
4841 return 0;
4842 }
4843 unistr = PyUnicode_AS_UNICODE(uniobj);
4844 *fillcharloc = unistr[0];
4845 Py_DECREF(uniobj);
4846 return 1;
4847}
4848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004849PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004850"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004852Return S centered in a Unicode string of length width. Padding is\n\
4853done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
4855static PyObject *
4856unicode_center(PyUnicodeObject *self, PyObject *args)
4857{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004858 Py_ssize_t marg, left;
4859 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004860 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861
Thomas Woutersde017742006-02-16 19:34:37 +00004862 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 return NULL;
4864
Tim Peters7a29bd52001-09-12 03:03:31 +00004865 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 Py_INCREF(self);
4867 return (PyObject*) self;
4868 }
4869
4870 marg = width - self->length;
4871 left = marg / 2 + (marg & width & 1);
4872
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004873 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874}
4875
Marc-André Lemburge5034372000-08-08 08:04:29 +00004876#if 0
4877
4878/* This code should go into some future Unicode collation support
4879 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004880 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004881
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004882/* speedy UTF-16 code point order comparison */
4883/* gleaned from: */
4884/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4885
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004886static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004887{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004888 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004889 0, 0, 0, 0, 0, 0, 0, 0,
4890 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004891 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004892};
4893
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894static int
4895unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4896{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004897 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004898
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 Py_UNICODE *s1 = str1->str;
4900 Py_UNICODE *s2 = str2->str;
4901
4902 len1 = str1->length;
4903 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004904
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004906 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004907
4908 c1 = *s1++;
4909 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004910
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004911 if (c1 > (1<<11) * 26)
4912 c1 += utf16Fixup[c1>>11];
4913 if (c2 > (1<<11) * 26)
4914 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004915 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004916
4917 if (c1 != c2)
4918 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004919
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004920 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 }
4922
4923 return (len1 < len2) ? -1 : (len1 != len2);
4924}
4925
Marc-André Lemburge5034372000-08-08 08:04:29 +00004926#else
4927
4928static int
4929unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004931 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004932
4933 Py_UNICODE *s1 = str1->str;
4934 Py_UNICODE *s2 = str2->str;
4935
4936 len1 = str1->length;
4937 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004938
Marc-André Lemburge5034372000-08-08 08:04:29 +00004939 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004940 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004941
Fredrik Lundh45714e92001-06-26 16:39:36 +00004942 c1 = *s1++;
4943 c2 = *s2++;
4944
4945 if (c1 != c2)
4946 return (c1 < c2) ? -1 : 1;
4947
Marc-André Lemburge5034372000-08-08 08:04:29 +00004948 len1--; len2--;
4949 }
4950
4951 return (len1 < len2) ? -1 : (len1 != len2);
4952}
4953
4954#endif
4955
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956int PyUnicode_Compare(PyObject *left,
4957 PyObject *right)
4958{
4959 PyUnicodeObject *u = NULL, *v = NULL;
4960 int result;
4961
4962 /* Coerce the two arguments */
4963 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4964 if (u == NULL)
4965 goto onError;
4966 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4967 if (v == NULL)
4968 goto onError;
4969
Thomas Wouters7e474022000-07-16 12:04:32 +00004970 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 if (v == u) {
4972 Py_DECREF(u);
4973 Py_DECREF(v);
4974 return 0;
4975 }
4976
4977 result = unicode_compare(u, v);
4978
4979 Py_DECREF(u);
4980 Py_DECREF(v);
4981 return result;
4982
4983onError:
4984 Py_XDECREF(u);
4985 Py_XDECREF(v);
4986 return -1;
4987}
4988
Guido van Rossum403d68b2000-03-13 15:55:09 +00004989int PyUnicode_Contains(PyObject *container,
4990 PyObject *element)
4991{
4992 PyUnicodeObject *u = NULL, *v = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004993 int result;
4994 Py_ssize_t size;
Barry Warsaw817918c2002-08-06 16:58:21 +00004995 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004996
4997 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004998 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004999 if (v == NULL) {
5000 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005001 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00005002 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005003 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005004 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00005005 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005006 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005007
Barry Warsaw817918c2002-08-06 16:58:21 +00005008 size = PyUnicode_GET_SIZE(v);
5009 rhs = PyUnicode_AS_UNICODE(v);
5010 lhs = PyUnicode_AS_UNICODE(u);
5011
Guido van Rossum403d68b2000-03-13 15:55:09 +00005012 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00005013 if (size == 1) {
5014 end = lhs + PyUnicode_GET_SIZE(u);
5015 while (lhs < end) {
5016 if (*lhs++ == *rhs) {
5017 result = 1;
5018 break;
5019 }
5020 }
5021 }
5022 else {
5023 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5024 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005025 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005026 result = 1;
5027 break;
5028 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005029 }
5030 }
5031
5032 Py_DECREF(u);
5033 Py_DECREF(v);
5034 return result;
5035
5036onError:
5037 Py_XDECREF(u);
5038 Py_XDECREF(v);
5039 return -1;
5040}
5041
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042/* Concat to string or Unicode object giving a new Unicode object. */
5043
5044PyObject *PyUnicode_Concat(PyObject *left,
5045 PyObject *right)
5046{
5047 PyUnicodeObject *u = NULL, *v = NULL, *w;
5048
5049 /* Coerce the two arguments */
5050 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5051 if (u == NULL)
5052 goto onError;
5053 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5054 if (v == NULL)
5055 goto onError;
5056
5057 /* Shortcuts */
5058 if (v == unicode_empty) {
5059 Py_DECREF(v);
5060 return (PyObject *)u;
5061 }
5062 if (u == unicode_empty) {
5063 Py_DECREF(u);
5064 return (PyObject *)v;
5065 }
5066
5067 /* Concat the two Unicode strings */
5068 w = _PyUnicode_New(u->length + v->length);
5069 if (w == NULL)
5070 goto onError;
5071 Py_UNICODE_COPY(w->str, u->str, u->length);
5072 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5073
5074 Py_DECREF(u);
5075 Py_DECREF(v);
5076 return (PyObject *)w;
5077
5078onError:
5079 Py_XDECREF(u);
5080 Py_XDECREF(v);
5081 return NULL;
5082}
5083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005084PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085"S.count(sub[, start[, end]]) -> int\n\
5086\n\
5087Return the number of occurrences of substring sub in Unicode string\n\
5088S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005089interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090
5091static PyObject *
5092unicode_count(PyUnicodeObject *self, PyObject *args)
5093{
5094 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005095 Py_ssize_t start = 0;
5096 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 PyObject *result;
5098
Guido van Rossumb8872e62000-05-09 14:14:27 +00005099 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5100 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 return NULL;
5102
5103 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5104 (PyObject *)substring);
5105 if (substring == NULL)
5106 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005107
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 if (start < 0)
5109 start += self->length;
5110 if (start < 0)
5111 start = 0;
5112 if (end > self->length)
5113 end = self->length;
5114 if (end < 0)
5115 end += self->length;
5116 if (end < 0)
5117 end = 0;
5118
5119 result = PyInt_FromLong((long) count(self, start, end, substring));
5120
5121 Py_DECREF(substring);
5122 return result;
5123}
5124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005125PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005126"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005128Encodes S using the codec registered for encoding. encoding defaults\n\
5129to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005130handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5132'xmlcharrefreplace' as well as any other name registered with\n\
5133codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134
5135static PyObject *
5136unicode_encode(PyUnicodeObject *self, PyObject *args)
5137{
5138 char *encoding = NULL;
5139 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005140 PyObject *v;
5141
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5143 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005144 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005145 if (v == NULL)
5146 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005147 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5148 PyErr_Format(PyExc_TypeError,
5149 "encoder did not return a string/unicode object "
5150 "(type=%.400s)",
5151 v->ob_type->tp_name);
5152 Py_DECREF(v);
5153 return NULL;
5154 }
5155 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005156
5157 onError:
5158 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005159}
5160
5161PyDoc_STRVAR(decode__doc__,
5162"S.decode([encoding[,errors]]) -> string or unicode\n\
5163\n\
5164Decodes S using the codec registered for encoding. encoding defaults\n\
5165to the default encoding. errors may be given to set a different error\n\
5166handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5167a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5168as well as any other name registerd with codecs.register_error that is\n\
5169able to handle UnicodeDecodeErrors.");
5170
5171static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005172unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005173{
5174 char *encoding = NULL;
5175 char *errors = NULL;
5176 PyObject *v;
5177
5178 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5179 return NULL;
5180 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005181 if (v == NULL)
5182 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005183 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5184 PyErr_Format(PyExc_TypeError,
5185 "decoder did not return a string/unicode object "
5186 "(type=%.400s)",
5187 v->ob_type->tp_name);
5188 Py_DECREF(v);
5189 return NULL;
5190 }
5191 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005192
5193 onError:
5194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195}
5196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005197PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198"S.expandtabs([tabsize]) -> unicode\n\
5199\n\
5200Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005201If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202
5203static PyObject*
5204unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5205{
5206 Py_UNICODE *e;
5207 Py_UNICODE *p;
5208 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005209 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 PyUnicodeObject *u;
5211 int tabsize = 8;
5212
5213 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5214 return NULL;
5215
Thomas Wouters7e474022000-07-16 12:04:32 +00005216 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 i = j = 0;
5218 e = self->str + self->length;
5219 for (p = self->str; p < e; p++)
5220 if (*p == '\t') {
5221 if (tabsize > 0)
5222 j += tabsize - (j % tabsize);
5223 }
5224 else {
5225 j++;
5226 if (*p == '\n' || *p == '\r') {
5227 i += j;
5228 j = 0;
5229 }
5230 }
5231
5232 /* Second pass: create output string and fill it */
5233 u = _PyUnicode_New(i + j);
5234 if (!u)
5235 return NULL;
5236
5237 j = 0;
5238 q = u->str;
5239
5240 for (p = self->str; p < e; p++)
5241 if (*p == '\t') {
5242 if (tabsize > 0) {
5243 i = tabsize - (j % tabsize);
5244 j += i;
5245 while (i--)
5246 *q++ = ' ';
5247 }
5248 }
5249 else {
5250 j++;
5251 *q++ = *p;
5252 if (*p == '\n' || *p == '\r')
5253 j = 0;
5254 }
5255
5256 return (PyObject*) u;
5257}
5258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005259PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260"S.find(sub [,start [,end]]) -> int\n\
5261\n\
5262Return the lowest index in S where substring sub is found,\n\
5263such that sub is contained within s[start,end]. Optional\n\
5264arguments start and end are interpreted as in slice notation.\n\
5265\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005266Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267
5268static PyObject *
5269unicode_find(PyUnicodeObject *self, PyObject *args)
5270{
5271 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005272 Py_ssize_t start = 0;
5273 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 PyObject *result;
5275
Guido van Rossumb8872e62000-05-09 14:14:27 +00005276 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5277 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 return NULL;
5279 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5280 (PyObject *)substring);
5281 if (substring == NULL)
5282 return NULL;
5283
Martin v. Löwis18e16552006-02-15 17:27:45 +00005284 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285
5286 Py_DECREF(substring);
5287 return result;
5288}
5289
5290static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005291unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292{
5293 if (index < 0 || index >= self->length) {
5294 PyErr_SetString(PyExc_IndexError, "string index out of range");
5295 return NULL;
5296 }
5297
5298 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5299}
5300
5301static long
5302unicode_hash(PyUnicodeObject *self)
5303{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005304 /* Since Unicode objects compare equal to their ASCII string
5305 counterparts, they should use the individual character values
5306 as basis for their hash value. This is needed to assure that
5307 strings and Unicode objects behave in the same way as
5308 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309
Martin v. Löwis18e16552006-02-15 17:27:45 +00005310 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005311 register Py_UNICODE *p;
5312 register long x;
5313
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 if (self->hash != -1)
5315 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005316 len = PyUnicode_GET_SIZE(self);
5317 p = PyUnicode_AS_UNICODE(self);
5318 x = *p << 7;
5319 while (--len >= 0)
5320 x = (1000003*x) ^ *p++;
5321 x ^= PyUnicode_GET_SIZE(self);
5322 if (x == -1)
5323 x = -2;
5324 self->hash = x;
5325 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326}
5327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005328PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329"S.index(sub [,start [,end]]) -> int\n\
5330\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005331Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332
5333static PyObject *
5334unicode_index(PyUnicodeObject *self, PyObject *args)
5335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 Py_ssize_t start = 0;
5339 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340
Guido van Rossumb8872e62000-05-09 14:14:27 +00005341 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5342 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005344
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5346 (PyObject *)substring);
5347 if (substring == NULL)
5348 return NULL;
5349
5350 result = findstring(self, substring, start, end, 1);
5351
5352 Py_DECREF(substring);
5353 if (result < 0) {
5354 PyErr_SetString(PyExc_ValueError, "substring not found");
5355 return NULL;
5356 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005357 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358}
5359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005360PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005361"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005363Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005364at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365
5366static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005367unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
5369 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5370 register const Py_UNICODE *e;
5371 int cased;
5372
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 /* Shortcut for single character strings */
5374 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005375 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005377 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005378 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005379 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005380
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 e = p + PyUnicode_GET_SIZE(self);
5382 cased = 0;
5383 for (; p < e; p++) {
5384 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005385
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005387 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 else if (!cased && Py_UNICODE_ISLOWER(ch))
5389 cased = 1;
5390 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005391 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392}
5393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005394PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005395"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005397Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005398at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399
5400static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005401unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
5403 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5404 register const Py_UNICODE *e;
5405 int cased;
5406
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 /* Shortcut for single character strings */
5408 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005409 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005411 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005412 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005413 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005414
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 e = p + PyUnicode_GET_SIZE(self);
5416 cased = 0;
5417 for (; p < e; p++) {
5418 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005419
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005421 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 else if (!cased && Py_UNICODE_ISUPPER(ch))
5423 cased = 1;
5424 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005425 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426}
5427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005428PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005429"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005431Return True if S is a titlecased string and there is at least one\n\
5432character in S, i.e. upper- and titlecase characters may only\n\
5433follow uncased characters and lowercase characters only cased ones.\n\
5434Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
5436static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005437unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438{
5439 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5440 register const Py_UNICODE *e;
5441 int cased, previous_is_cased;
5442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 /* Shortcut for single character strings */
5444 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005445 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5446 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005448 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005449 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005450 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005451
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 e = p + PyUnicode_GET_SIZE(self);
5453 cased = 0;
5454 previous_is_cased = 0;
5455 for (; p < e; p++) {
5456 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005457
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5459 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005460 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 previous_is_cased = 1;
5462 cased = 1;
5463 }
5464 else if (Py_UNICODE_ISLOWER(ch)) {
5465 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005466 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 previous_is_cased = 1;
5468 cased = 1;
5469 }
5470 else
5471 previous_is_cased = 0;
5472 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005473 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474}
5475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005476PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005477"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005479Return True if all characters in S are whitespace\n\
5480and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481
5482static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005483unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484{
5485 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5486 register const Py_UNICODE *e;
5487
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 /* Shortcut for single character strings */
5489 if (PyUnicode_GET_SIZE(self) == 1 &&
5490 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005491 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005493 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005494 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005495 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005496
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 e = p + PyUnicode_GET_SIZE(self);
5498 for (; p < e; p++) {
5499 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005500 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005502 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503}
5504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005505PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005506"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005507\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005508Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005509and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005510
5511static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005512unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005513{
5514 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5515 register const Py_UNICODE *e;
5516
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005517 /* Shortcut for single character strings */
5518 if (PyUnicode_GET_SIZE(self) == 1 &&
5519 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005520 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005521
5522 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005523 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005524 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005525
5526 e = p + PyUnicode_GET_SIZE(self);
5527 for (; p < e; p++) {
5528 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005529 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005530 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005531 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005532}
5533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005534PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005535"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005536\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005537Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005538and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005539
5540static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005541unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005542{
5543 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5544 register const Py_UNICODE *e;
5545
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005546 /* Shortcut for single character strings */
5547 if (PyUnicode_GET_SIZE(self) == 1 &&
5548 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005549 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005550
5551 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005552 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005554
5555 e = p + PyUnicode_GET_SIZE(self);
5556 for (; p < e; p++) {
5557 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005558 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005559 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005560 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005561}
5562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005563PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005564"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005566Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005567False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568
5569static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005570unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571{
5572 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5573 register const Py_UNICODE *e;
5574
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 /* Shortcut for single character strings */
5576 if (PyUnicode_GET_SIZE(self) == 1 &&
5577 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005578 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005580 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005581 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005582 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005583
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 e = p + PyUnicode_GET_SIZE(self);
5585 for (; p < e; p++) {
5586 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005587 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005589 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590}
5591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005592PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005593"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005595Return True if all characters in S are digits\n\
5596and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597
5598static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005599unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600{
5601 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5602 register const Py_UNICODE *e;
5603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 /* Shortcut for single character strings */
5605 if (PyUnicode_GET_SIZE(self) == 1 &&
5606 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005607 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005609 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005610 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005611 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005612
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 e = p + PyUnicode_GET_SIZE(self);
5614 for (; p < e; p++) {
5615 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005616 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005618 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619}
5620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005621PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005622"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005624Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005625False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
5627static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005628unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629{
5630 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5631 register const Py_UNICODE *e;
5632
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 /* Shortcut for single character strings */
5634 if (PyUnicode_GET_SIZE(self) == 1 &&
5635 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005636 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005638 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005639 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005640 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005641
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 e = p + PyUnicode_GET_SIZE(self);
5643 for (; p < e; p++) {
5644 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005645 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005647 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648}
5649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005650PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651"S.join(sequence) -> unicode\n\
5652\n\
5653Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005654sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655
5656static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005657unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005659 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660}
5661
Martin v. Löwis18e16552006-02-15 17:27:45 +00005662static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663unicode_length(PyUnicodeObject *self)
5664{
5665 return self->length;
5666}
5667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005668PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005669"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670\n\
5671Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005672done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
5674static PyObject *
5675unicode_ljust(PyUnicodeObject *self, PyObject *args)
5676{
5677 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005678 Py_UNICODE fillchar = ' ';
5679
5680 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 return NULL;
5682
Tim Peters7a29bd52001-09-12 03:03:31 +00005683 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 Py_INCREF(self);
5685 return (PyObject*) self;
5686 }
5687
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005688 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689}
5690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005691PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692"S.lower() -> unicode\n\
5693\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005694Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695
5696static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005697unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 return fixup(self, fixlower);
5700}
5701
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
5710
5711static const Py_UNICODE *
5712unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5713{
Tim Peters030a5ce2002-04-22 19:00:10 +00005714 size_t i;
5715 for (i = 0; i < n; ++i)
5716 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005717 return s+i;
5718 return NULL;
5719}
5720
5721/* externally visible for str.strip(unicode) */
5722PyObject *
5723_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5724{
5725 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005726 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005727 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005728 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5729 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005730
5731 i = 0;
5732 if (striptype != RIGHTSTRIP) {
5733 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5734 i++;
5735 }
5736 }
5737
5738 j = len;
5739 if (striptype != LEFTSTRIP) {
5740 do {
5741 j--;
5742 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5743 j++;
5744 }
5745
5746 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5747 Py_INCREF(self);
5748 return (PyObject*)self;
5749 }
5750 else
5751 return PyUnicode_FromUnicode(s+i, j-i);
5752}
5753
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
5755static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005756do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005758 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005759 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005760
5761 i = 0;
5762 if (striptype != RIGHTSTRIP) {
5763 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5764 i++;
5765 }
5766 }
5767
5768 j = len;
5769 if (striptype != LEFTSTRIP) {
5770 do {
5771 j--;
5772 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5773 j++;
5774 }
5775
5776 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5777 Py_INCREF(self);
5778 return (PyObject*)self;
5779 }
5780 else
5781 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782}
5783
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005784
5785static PyObject *
5786do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5787{
5788 PyObject *sep = NULL;
5789
5790 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5791 return NULL;
5792
5793 if (sep != NULL && sep != Py_None) {
5794 if (PyUnicode_Check(sep))
5795 return _PyUnicode_XStrip(self, striptype, sep);
5796 else if (PyString_Check(sep)) {
5797 PyObject *res;
5798 sep = PyUnicode_FromObject(sep);
5799 if (sep==NULL)
5800 return NULL;
5801 res = _PyUnicode_XStrip(self, striptype, sep);
5802 Py_DECREF(sep);
5803 return res;
5804 }
5805 else {
5806 PyErr_Format(PyExc_TypeError,
5807 "%s arg must be None, unicode or str",
5808 STRIPNAME(striptype));
5809 return NULL;
5810 }
5811 }
5812
5813 return do_strip(self, striptype);
5814}
5815
5816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005817PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005818"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005819\n\
5820Return a copy of the string S with leading and trailing\n\
5821whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005822If chars is given and not None, remove characters in chars instead.\n\
5823If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005824
5825static PyObject *
5826unicode_strip(PyUnicodeObject *self, PyObject *args)
5827{
5828 if (PyTuple_GET_SIZE(args) == 0)
5829 return do_strip(self, BOTHSTRIP); /* Common case */
5830 else
5831 return do_argstrip(self, BOTHSTRIP, args);
5832}
5833
5834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005835PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005836"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005837\n\
5838Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005839If chars is given and not None, remove characters in chars instead.\n\
5840If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005841
5842static PyObject *
5843unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5844{
5845 if (PyTuple_GET_SIZE(args) == 0)
5846 return do_strip(self, LEFTSTRIP); /* Common case */
5847 else
5848 return do_argstrip(self, LEFTSTRIP, args);
5849}
5850
5851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005852PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005853"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005854\n\
5855Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005856If chars is given and not None, remove characters in chars instead.\n\
5857If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005858
5859static PyObject *
5860unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5861{
5862 if (PyTuple_GET_SIZE(args) == 0)
5863 return do_strip(self, RIGHTSTRIP); /* Common case */
5864 else
5865 return do_argstrip(self, RIGHTSTRIP, args);
5866}
5867
5868
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871{
5872 PyUnicodeObject *u;
5873 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005874 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005875 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876
5877 if (len < 0)
5878 len = 0;
5879
Tim Peters7a29bd52001-09-12 03:03:31 +00005880 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 /* no repeat, return original string */
5882 Py_INCREF(str);
5883 return (PyObject*) str;
5884 }
Tim Peters8f422462000-09-09 06:13:41 +00005885
5886 /* ensure # of chars needed doesn't overflow int and # of bytes
5887 * needed doesn't overflow size_t
5888 */
5889 nchars = len * str->length;
5890 if (len && nchars / len != str->length) {
5891 PyErr_SetString(PyExc_OverflowError,
5892 "repeated string is too long");
5893 return NULL;
5894 }
5895 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5896 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5897 PyErr_SetString(PyExc_OverflowError,
5898 "repeated string is too long");
5899 return NULL;
5900 }
5901 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 if (!u)
5903 return NULL;
5904
5905 p = u->str;
5906
5907 while (len-- > 0) {
5908 Py_UNICODE_COPY(p, str->str, str->length);
5909 p += str->length;
5910 }
5911
5912 return (PyObject*) u;
5913}
5914
5915PyObject *PyUnicode_Replace(PyObject *obj,
5916 PyObject *subobj,
5917 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005918 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919{
5920 PyObject *self;
5921 PyObject *str1;
5922 PyObject *str2;
5923 PyObject *result;
5924
5925 self = PyUnicode_FromObject(obj);
5926 if (self == NULL)
5927 return NULL;
5928 str1 = PyUnicode_FromObject(subobj);
5929 if (str1 == NULL) {
5930 Py_DECREF(self);
5931 return NULL;
5932 }
5933 str2 = PyUnicode_FromObject(replobj);
5934 if (str2 == NULL) {
5935 Py_DECREF(self);
5936 Py_DECREF(str1);
5937 return NULL;
5938 }
Tim Petersced69f82003-09-16 20:30:58 +00005939 result = replace((PyUnicodeObject *)self,
5940 (PyUnicodeObject *)str1,
5941 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 maxcount);
5943 Py_DECREF(self);
5944 Py_DECREF(str1);
5945 Py_DECREF(str2);
5946 return result;
5947}
5948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005949PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950"S.replace (old, new[, maxsplit]) -> unicode\n\
5951\n\
5952Return a copy of S with all occurrences of substring\n\
5953old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005954given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
5956static PyObject*
5957unicode_replace(PyUnicodeObject *self, PyObject *args)
5958{
5959 PyUnicodeObject *str1;
5960 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005961 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 PyObject *result;
5963
Martin v. Löwis18e16552006-02-15 17:27:45 +00005964 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 return NULL;
5966 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5967 if (str1 == NULL)
5968 return NULL;
5969 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005970 if (str2 == NULL) {
5971 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974
5975 result = replace(self, str1, str2, maxcount);
5976
5977 Py_DECREF(str1);
5978 Py_DECREF(str2);
5979 return result;
5980}
5981
5982static
5983PyObject *unicode_repr(PyObject *unicode)
5984{
5985 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5986 PyUnicode_GET_SIZE(unicode),
5987 1);
5988}
5989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005990PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991"S.rfind(sub [,start [,end]]) -> int\n\
5992\n\
5993Return the highest index in S where substring sub is found,\n\
5994such that sub is contained within s[start,end]. Optional\n\
5995arguments start and end are interpreted as in slice notation.\n\
5996\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005997Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998
5999static PyObject *
6000unicode_rfind(PyUnicodeObject *self, PyObject *args)
6001{
6002 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006003 Py_ssize_t start = 0;
6004 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 PyObject *result;
6006
Guido van Rossumb8872e62000-05-09 14:14:27 +00006007 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6008 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 return NULL;
6010 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6011 (PyObject *)substring);
6012 if (substring == NULL)
6013 return NULL;
6014
Martin v. Löwis18e16552006-02-15 17:27:45 +00006015 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016
6017 Py_DECREF(substring);
6018 return result;
6019}
6020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006021PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022"S.rindex(sub [,start [,end]]) -> int\n\
6023\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006024Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
6026static PyObject *
6027unicode_rindex(PyUnicodeObject *self, PyObject *args)
6028{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006029 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006031 Py_ssize_t start = 0;
6032 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033
Guido van Rossumb8872e62000-05-09 14:14:27 +00006034 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6035 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 return NULL;
6037 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6038 (PyObject *)substring);
6039 if (substring == NULL)
6040 return NULL;
6041
6042 result = findstring(self, substring, start, end, -1);
6043
6044 Py_DECREF(substring);
6045 if (result < 0) {
6046 PyErr_SetString(PyExc_ValueError, "substring not found");
6047 return NULL;
6048 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006049 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050}
6051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006052PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006053"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054\n\
6055Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006056done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057
6058static PyObject *
6059unicode_rjust(PyUnicodeObject *self, PyObject *args)
6060{
6061 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006062 Py_UNICODE fillchar = ' ';
6063
6064 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 return NULL;
6066
Tim Peters7a29bd52001-09-12 03:03:31 +00006067 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 Py_INCREF(self);
6069 return (PyObject*) self;
6070 }
6071
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006072 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073}
6074
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006076unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077{
6078 /* standard clamping */
6079 if (start < 0)
6080 start = 0;
6081 if (end < 0)
6082 end = 0;
6083 if (end > self->length)
6084 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006085 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 /* full slice, return original string */
6087 Py_INCREF(self);
6088 return (PyObject*) self;
6089 }
6090 if (start > end)
6091 start = end;
6092 /* copy slice */
6093 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6094 end - start);
6095}
6096
6097PyObject *PyUnicode_Split(PyObject *s,
6098 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006099 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100{
6101 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006102
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 s = PyUnicode_FromObject(s);
6104 if (s == NULL)
6105 return NULL;
6106 if (sep != NULL) {
6107 sep = PyUnicode_FromObject(sep);
6108 if (sep == NULL) {
6109 Py_DECREF(s);
6110 return NULL;
6111 }
6112 }
6113
6114 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6115
6116 Py_DECREF(s);
6117 Py_XDECREF(sep);
6118 return result;
6119}
6120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006121PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122"S.split([sep [,maxsplit]]) -> list of strings\n\
6123\n\
6124Return a list of the words in S, using sep as the\n\
6125delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006126splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006127any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
6129static PyObject*
6130unicode_split(PyUnicodeObject *self, PyObject *args)
6131{
6132 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006133 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
Martin v. Löwis18e16552006-02-15 17:27:45 +00006135 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 return NULL;
6137
6138 if (substring == Py_None)
6139 return split(self, NULL, maxcount);
6140 else if (PyUnicode_Check(substring))
6141 return split(self, (PyUnicodeObject *)substring, maxcount);
6142 else
6143 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6144}
6145
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006146PyObject *PyUnicode_RSplit(PyObject *s,
6147 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006148 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006149{
6150 PyObject *result;
6151
6152 s = PyUnicode_FromObject(s);
6153 if (s == NULL)
6154 return NULL;
6155 if (sep != NULL) {
6156 sep = PyUnicode_FromObject(sep);
6157 if (sep == NULL) {
6158 Py_DECREF(s);
6159 return NULL;
6160 }
6161 }
6162
6163 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6164
6165 Py_DECREF(s);
6166 Py_XDECREF(sep);
6167 return result;
6168}
6169
6170PyDoc_STRVAR(rsplit__doc__,
6171"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6172\n\
6173Return a list of the words in S, using sep as the\n\
6174delimiter string, starting at the end of the string and\n\
6175working to the front. If maxsplit is given, at most maxsplit\n\
6176splits are done. If sep is not specified, any whitespace string\n\
6177is a separator.");
6178
6179static PyObject*
6180unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6181{
6182 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006183 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006184
Martin v. Löwis18e16552006-02-15 17:27:45 +00006185 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006186 return NULL;
6187
6188 if (substring == Py_None)
6189 return rsplit(self, NULL, maxcount);
6190 else if (PyUnicode_Check(substring))
6191 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6192 else
6193 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6194}
6195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006196PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006197"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198\n\
6199Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006200Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006201is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
6203static PyObject*
6204unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6205{
Guido van Rossum86662912000-04-11 15:38:46 +00006206 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207
Guido van Rossum86662912000-04-11 15:38:46 +00006208 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 return NULL;
6210
Guido van Rossum86662912000-04-11 15:38:46 +00006211 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212}
6213
6214static
6215PyObject *unicode_str(PyUnicodeObject *self)
6216{
Fred Drakee4315f52000-05-09 19:53:39 +00006217 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006220PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221"S.swapcase() -> unicode\n\
6222\n\
6223Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006224and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
6226static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006227unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 return fixup(self, fixswapcase);
6230}
6231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006232PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233"S.translate(table) -> unicode\n\
6234\n\
6235Return a copy of the string S, where all characters have been mapped\n\
6236through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006237Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6238Unmapped characters are left untouched. Characters mapped to None\n\
6239are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240
6241static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006242unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243{
Tim Petersced69f82003-09-16 20:30:58 +00006244 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006246 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 "ignore");
6248}
6249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006250PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251"S.upper() -> unicode\n\
6252\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006253Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254
6255static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006256unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 return fixup(self, fixupper);
6259}
6260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006261PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262"S.zfill(width) -> unicode\n\
6263\n\
6264Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006265of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
6267static PyObject *
6268unicode_zfill(PyUnicodeObject *self, PyObject *args)
6269{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006270 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 PyUnicodeObject *u;
6272
Martin v. Löwis18e16552006-02-15 17:27:45 +00006273 Py_ssize_t width;
6274 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 return NULL;
6276
6277 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006278 if (PyUnicode_CheckExact(self)) {
6279 Py_INCREF(self);
6280 return (PyObject*) self;
6281 }
6282 else
6283 return PyUnicode_FromUnicode(
6284 PyUnicode_AS_UNICODE(self),
6285 PyUnicode_GET_SIZE(self)
6286 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 }
6288
6289 fill = width - self->length;
6290
6291 u = pad(self, fill, 0, '0');
6292
Walter Dörwald068325e2002-04-15 13:36:47 +00006293 if (u == NULL)
6294 return NULL;
6295
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 if (u->str[fill] == '+' || u->str[fill] == '-') {
6297 /* move sign to beginning of string */
6298 u->str[0] = u->str[fill];
6299 u->str[fill] = '0';
6300 }
6301
6302 return (PyObject*) u;
6303}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304
#if 0
/* Compiled out.  Wraps the value of unicode_freelist_size in a Python
   int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long nfree = unicode_freelist_size;

    return PyInt_FromLong(nfree);
}
#endif
6312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006313PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006314"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006316Return True if S starts with the specified prefix, False otherwise.\n\
6317With optional start, test S beginning at that position.\n\
6318With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319
6320static PyObject *
6321unicode_startswith(PyUnicodeObject *self,
6322 PyObject *args)
6323{
6324 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006325 Py_ssize_t start = 0;
6326 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 PyObject *result;
6328
Guido van Rossumb8872e62000-05-09 14:14:27 +00006329 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6330 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 return NULL;
6332 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6333 (PyObject *)substring);
6334 if (substring == NULL)
6335 return NULL;
6336
Guido van Rossum77f6a652002-04-03 22:41:51 +00006337 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338
6339 Py_DECREF(substring);
6340 return result;
6341}
6342
6343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006344PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006345"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006347Return True if S ends with the specified suffix, False otherwise.\n\
6348With optional start, test S beginning at that position.\n\
6349With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350
6351static PyObject *
6352unicode_endswith(PyUnicodeObject *self,
6353 PyObject *args)
6354{
6355 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006356 Py_ssize_t start = 0;
6357 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 PyObject *result;
6359
Guido van Rossumb8872e62000-05-09 14:14:27 +00006360 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6361 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 return NULL;
6363 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6364 (PyObject *)substring);
6365 if (substring == NULL)
6366 return NULL;
6367
Guido van Rossum77f6a652002-04-03 22:41:51 +00006368 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369
6370 Py_DECREF(substring);
6371 return result;
6372}
6373
6374
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006375
6376static PyObject *
6377unicode_getnewargs(PyUnicodeObject *v)
6378{
6379 return Py_BuildValue("(u#)", v->str, v->length);
6380}
6381
6382
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383static PyMethodDef unicode_methods[] = {
6384
6385 /* Order is according to common usage: often used methods should
6386 appear first, since lookup is done sequentially. */
6387
Georg Brandlecdc0a92006-03-30 12:19:07 +00006388 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006389 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6390 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006391 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006392 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6393 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6394 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6395 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6396 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6397 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6398 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6399 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6400 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6401 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006402 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006403 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006404/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6405 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6406 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6407 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006408 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006409 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006410 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006411 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6412 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6413 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6414 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6415 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6416 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6417 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6418 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6419 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6420 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6421 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6422 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6423 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6424 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006425 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006426#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006427 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428#endif
6429
6430#if 0
6431 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006432 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433#endif
6434
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006435 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 {NULL, NULL}
6437};
6438
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006439static PyObject *
6440unicode_mod(PyObject *v, PyObject *w)
6441{
6442 if (!PyUnicode_Check(v)) {
6443 Py_INCREF(Py_NotImplemented);
6444 return Py_NotImplemented;
6445 }
6446 return PyUnicode_Format(v, w);
6447}
6448
6449static PyNumberMethods unicode_as_number = {
6450 0, /*nb_add*/
6451 0, /*nb_subtract*/
6452 0, /*nb_multiply*/
6453 0, /*nb_divide*/
6454 unicode_mod, /*nb_remainder*/
6455};
6456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006458 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006459 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006460 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6461 (ssizeargfunc) unicode_getitem, /* sq_item */
6462 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 0, /* sq_ass_item */
6464 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006465 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466};
6467
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006468#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6469
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006470static PyObject*
6471unicode_subscript(PyUnicodeObject* self, PyObject* item)
6472{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006473 PyNumberMethods *nb = item->ob_type->tp_as_number;
6474 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6475 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006476 if (i == -1 && PyErr_Occurred())
6477 return NULL;
6478 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006479 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006480 return unicode_getitem(self, i);
6481 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006482 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006483 Py_UNICODE* source_buf;
6484 Py_UNICODE* result_buf;
6485 PyObject* result;
6486
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006487 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006488 &start, &stop, &step, &slicelength) < 0) {
6489 return NULL;
6490 }
6491
6492 if (slicelength <= 0) {
6493 return PyUnicode_FromUnicode(NULL, 0);
6494 } else {
6495 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006496 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6497 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006498
6499 if (result_buf == NULL)
6500 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006501
6502 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6503 result_buf[i] = source_buf[cur];
6504 }
Tim Petersced69f82003-09-16 20:30:58 +00006505
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006506 result = PyUnicode_FromUnicode(result_buf, slicelength);
6507 PyMem_FREE(result_buf);
6508 return result;
6509 }
6510 } else {
6511 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6512 return NULL;
6513 }
6514}
6515
6516static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006517 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006518 (binaryfunc)unicode_subscript, /* mp_subscript */
6519 (objobjargproc)0, /* mp_ass_subscript */
6520};
6521
Martin v. Löwis18e16552006-02-15 17:27:45 +00006522static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006524 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 const void **ptr)
6526{
6527 if (index != 0) {
6528 PyErr_SetString(PyExc_SystemError,
6529 "accessing non-existent unicode segment");
6530 return -1;
6531 }
6532 *ptr = (void *) self->str;
6533 return PyUnicode_GET_DATA_SIZE(self);
6534}
6535
Martin v. Löwis18e16552006-02-15 17:27:45 +00006536static Py_ssize_t
6537unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 const void **ptr)
6539{
6540 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006541 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 return -1;
6543}
6544
6545static int
6546unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006547 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548{
6549 if (lenp)
6550 *lenp = PyUnicode_GET_DATA_SIZE(self);
6551 return 1;
6552}
6553
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006554static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006556 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 const void **ptr)
6558{
6559 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006560
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 if (index != 0) {
6562 PyErr_SetString(PyExc_SystemError,
6563 "accessing non-existent unicode segment");
6564 return -1;
6565 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006566 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 if (str == NULL)
6568 return -1;
6569 *ptr = (void *) PyString_AS_STRING(str);
6570 return PyString_GET_SIZE(str);
6571}
6572
6573/* Helpers for PyUnicode_Format() */
6574
6575static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006576getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006578 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 if (argidx < arglen) {
6580 (*p_argidx)++;
6581 if (arglen < 0)
6582 return args;
6583 else
6584 return PyTuple_GetItem(args, argidx);
6585 }
6586 PyErr_SetString(PyExc_TypeError,
6587 "not enough arguments for format string");
6588 return NULL;
6589}
6590
/* Bit flags, one bit each, so they can be OR-ed together and tested
   with `&` (as in `flags & F_ALT`). */
#define F_LJUST  (1 << 0)
#define F_SIGN   (1 << 1)
#define F_BLANK  (1 << 2)
#define F_ALT    (1 << 3)
#define F_ZERO   (1 << 4)
6596
Martin v. Löwis18e16552006-02-15 17:27:45 +00006597static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006598strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006600 register Py_ssize_t i;
6601 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 for (i = len - 1; i >= 0; i--)
6603 buffer[i] = (Py_UNICODE) charbuffer[i];
6604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 return len;
6606}
6607
Neal Norwitzfc76d632006-01-10 06:03:13 +00006608static int
6609doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6610{
Tim Peters15231542006-02-16 01:08:01 +00006611 Py_ssize_t result;
6612
Neal Norwitzfc76d632006-01-10 06:03:13 +00006613 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006614 result = strtounicode(buffer, (char *)buffer);
6615 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006616}
6617
6618static int
6619longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6620{
Tim Peters15231542006-02-16 01:08:01 +00006621 Py_ssize_t result;
6622
Neal Norwitzfc76d632006-01-10 06:03:13 +00006623 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006624 result = strtounicode(buffer, (char *)buffer);
6625 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006626}
6627
Guido van Rossum078151d2002-08-11 04:24:12 +00006628/* XXX To save some code duplication, formatfloat/long/int could have been
6629 shared with stringobject.c, converting from 8-bit to Unicode after the
6630 formatting is done. */
6631
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632static int
6633formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006634 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 int flags,
6636 int prec,
6637 int type,
6638 PyObject *v)
6639{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006640 /* fmt = '%#.' + `prec` + `type`
6641 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 char fmt[20];
6643 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006644
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 x = PyFloat_AsDouble(v);
6646 if (x == -1.0 && PyErr_Occurred())
6647 return -1;
6648 if (prec < 0)
6649 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6651 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006652 /* Worst case length calc to ensure no buffer overrun:
6653
6654 'g' formats:
6655 fmt = %#.<prec>g
6656 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6657 for any double rep.)
6658 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6659
6660 'f' formats:
6661 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6662 len = 1 + 50 + 1 + prec = 52 + prec
6663
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006664 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006665 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006666
6667 */
6668 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6669 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006670 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006671 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006672 return -1;
6673 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006674 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6675 (flags&F_ALT) ? "#" : "",
6676 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006677 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678}
6679
Tim Peters38fd5b62000-09-21 05:43:11 +00006680static PyObject*
6681formatlong(PyObject *val, int flags, int prec, int type)
6682{
6683 char *buf;
6684 int i, len;
6685 PyObject *str; /* temporary string object. */
6686 PyUnicodeObject *result;
6687
6688 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6689 if (!str)
6690 return NULL;
6691 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006692 if (!result) {
6693 Py_DECREF(str);
6694 return NULL;
6695 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006696 for (i = 0; i < len; i++)
6697 result->str[i] = buf[i];
6698 result->str[len] = 0;
6699 Py_DECREF(str);
6700 return (PyObject*)result;
6701}
6702
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703static int
6704formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006705 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 int flags,
6707 int prec,
6708 int type,
6709 PyObject *v)
6710{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006711 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006712 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6713 * + 1 + 1
6714 * = 24
6715 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006716 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006717 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 long x;
6719
6720 x = PyInt_AsLong(v);
6721 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006722 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006723 if (x < 0 && type == 'u') {
6724 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006725 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006726 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6727 sign = "-";
6728 else
6729 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006731 prec = 1;
6732
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006733 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6734 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006735 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006736 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006737 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006738 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006739 return -1;
6740 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006741
6742 if ((flags & F_ALT) &&
6743 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006744 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006745 * of issues that cause pain:
6746 * - when 0 is being converted, the C standard leaves off
6747 * the '0x' or '0X', which is inconsistent with other
6748 * %#x/%#X conversions and inconsistent with Python's
6749 * hex() function
6750 * - there are platforms that violate the standard and
6751 * convert 0 with the '0x' or '0X'
6752 * (Metrowerks, Compaq Tru64)
6753 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006754 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006755 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006756 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006757 * We can achieve the desired consistency by inserting our
6758 * own '0x' or '0X' prefix, and substituting %x/%X in place
6759 * of %#x/%#X.
6760 *
6761 * Note that this is the same approach as used in
6762 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006763 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006764 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6765 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006766 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006767 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006768 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6769 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006770 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006771 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006772 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006773 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006774 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006775 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776}
6777
6778static int
6779formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006780 size_t buflen,
6781 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006783 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006784 if (PyUnicode_Check(v)) {
6785 if (PyUnicode_GET_SIZE(v) != 1)
6786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006790 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006791 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006792 goto onError;
6793 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795
6796 else {
6797 /* Integer input truncated to a character */
6798 long x;
6799 x = PyInt_AsLong(v);
6800 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006801 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006802#ifdef Py_UNICODE_WIDE
6803 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006804 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006805 "%c arg not in range(0x110000) "
6806 "(wide Python build)");
6807 return -1;
6808 }
6809#else
6810 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006811 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006812 "%c arg not in range(0x10000) "
6813 "(narrow Python build)");
6814 return -1;
6815 }
6816#endif
6817 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 }
6819 buf[1] = '\0';
6820 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006821
6822 onError:
6823 PyErr_SetString(PyExc_TypeError,
6824 "%c requires int or char");
6825 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the scratch buffer in which floats,
   ints and chars are formatted.

   XXX This is a magic number.  Every formatting routine bounds-checks
   against it so that nothing overflows, but allocating a buffer of the
   appropriate size for each format might be the better solution.  For
   now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
6837
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838PyObject *PyUnicode_Format(PyObject *format,
6839 PyObject *args)
6840{
6841 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006842 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 int args_owned = 0;
6844 PyUnicodeObject *result = NULL;
6845 PyObject *dict = NULL;
6846 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006847
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 if (format == NULL || args == NULL) {
6849 PyErr_BadInternalCall();
6850 return NULL;
6851 }
6852 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006853 if (uformat == NULL)
6854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 fmt = PyUnicode_AS_UNICODE(uformat);
6856 fmtcnt = PyUnicode_GET_SIZE(uformat);
6857
6858 reslen = rescnt = fmtcnt + 100;
6859 result = _PyUnicode_New(reslen);
6860 if (result == NULL)
6861 goto onError;
6862 res = PyUnicode_AS_UNICODE(result);
6863
6864 if (PyTuple_Check(args)) {
6865 arglen = PyTuple_Size(args);
6866 argidx = 0;
6867 }
6868 else {
6869 arglen = -1;
6870 argidx = -2;
6871 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006872 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6873 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 dict = args;
6875
6876 while (--fmtcnt >= 0) {
6877 if (*fmt != '%') {
6878 if (--rescnt < 0) {
6879 rescnt = fmtcnt + 100;
6880 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006881 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006882 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6884 --rescnt;
6885 }
6886 *res++ = *fmt++;
6887 }
6888 else {
6889 /* Got a format specifier */
6890 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006891 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 Py_UNICODE c = '\0';
6894 Py_UNICODE fill;
6895 PyObject *v = NULL;
6896 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006897 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006900 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
6902 fmt++;
6903 if (*fmt == '(') {
6904 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006905 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 PyObject *key;
6907 int pcount = 1;
6908
6909 if (dict == NULL) {
6910 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006911 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 goto onError;
6913 }
6914 ++fmt;
6915 --fmtcnt;
6916 keystart = fmt;
6917 /* Skip over balanced parentheses */
6918 while (pcount > 0 && --fmtcnt >= 0) {
6919 if (*fmt == ')')
6920 --pcount;
6921 else if (*fmt == '(')
6922 ++pcount;
6923 fmt++;
6924 }
6925 keylen = fmt - keystart - 1;
6926 if (fmtcnt < 0 || pcount > 0) {
6927 PyErr_SetString(PyExc_ValueError,
6928 "incomplete format key");
6929 goto onError;
6930 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006931#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006932 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 then looked up since Python uses strings to hold
6934 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006935 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 key = PyUnicode_EncodeUTF8(keystart,
6937 keylen,
6938 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006939#else
6940 key = PyUnicode_FromUnicode(keystart, keylen);
6941#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 if (key == NULL)
6943 goto onError;
6944 if (args_owned) {
6945 Py_DECREF(args);
6946 args_owned = 0;
6947 }
6948 args = PyObject_GetItem(dict, key);
6949 Py_DECREF(key);
6950 if (args == NULL) {
6951 goto onError;
6952 }
6953 args_owned = 1;
6954 arglen = -1;
6955 argidx = -2;
6956 }
6957 while (--fmtcnt >= 0) {
6958 switch (c = *fmt++) {
6959 case '-': flags |= F_LJUST; continue;
6960 case '+': flags |= F_SIGN; continue;
6961 case ' ': flags |= F_BLANK; continue;
6962 case '#': flags |= F_ALT; continue;
6963 case '0': flags |= F_ZERO; continue;
6964 }
6965 break;
6966 }
6967 if (c == '*') {
6968 v = getnextarg(args, arglen, &argidx);
6969 if (v == NULL)
6970 goto onError;
6971 if (!PyInt_Check(v)) {
6972 PyErr_SetString(PyExc_TypeError,
6973 "* wants int");
6974 goto onError;
6975 }
6976 width = PyInt_AsLong(v);
6977 if (width < 0) {
6978 flags |= F_LJUST;
6979 width = -width;
6980 }
6981 if (--fmtcnt >= 0)
6982 c = *fmt++;
6983 }
6984 else if (c >= '0' && c <= '9') {
6985 width = c - '0';
6986 while (--fmtcnt >= 0) {
6987 c = *fmt++;
6988 if (c < '0' || c > '9')
6989 break;
6990 if ((width*10) / 10 != width) {
6991 PyErr_SetString(PyExc_ValueError,
6992 "width too big");
6993 goto onError;
6994 }
6995 width = width*10 + (c - '0');
6996 }
6997 }
6998 if (c == '.') {
6999 prec = 0;
7000 if (--fmtcnt >= 0)
7001 c = *fmt++;
7002 if (c == '*') {
7003 v = getnextarg(args, arglen, &argidx);
7004 if (v == NULL)
7005 goto onError;
7006 if (!PyInt_Check(v)) {
7007 PyErr_SetString(PyExc_TypeError,
7008 "* wants int");
7009 goto onError;
7010 }
7011 prec = PyInt_AsLong(v);
7012 if (prec < 0)
7013 prec = 0;
7014 if (--fmtcnt >= 0)
7015 c = *fmt++;
7016 }
7017 else if (c >= '0' && c <= '9') {
7018 prec = c - '0';
7019 while (--fmtcnt >= 0) {
7020 c = Py_CHARMASK(*fmt++);
7021 if (c < '0' || c > '9')
7022 break;
7023 if ((prec*10) / 10 != prec) {
7024 PyErr_SetString(PyExc_ValueError,
7025 "prec too big");
7026 goto onError;
7027 }
7028 prec = prec*10 + (c - '0');
7029 }
7030 }
7031 } /* prec */
7032 if (fmtcnt >= 0) {
7033 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 if (--fmtcnt >= 0)
7035 c = *fmt++;
7036 }
7037 }
7038 if (fmtcnt < 0) {
7039 PyErr_SetString(PyExc_ValueError,
7040 "incomplete format");
7041 goto onError;
7042 }
7043 if (c != '%') {
7044 v = getnextarg(args, arglen, &argidx);
7045 if (v == NULL)
7046 goto onError;
7047 }
7048 sign = 0;
7049 fill = ' ';
7050 switch (c) {
7051
7052 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007053 pbuf = formatbuf;
7054 /* presume that buffer length is at least 1 */
7055 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 len = 1;
7057 break;
7058
7059 case 's':
7060 case 'r':
7061 if (PyUnicode_Check(v) && c == 's') {
7062 temp = v;
7063 Py_INCREF(temp);
7064 }
7065 else {
7066 PyObject *unicode;
7067 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007068 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 else
7070 temp = PyObject_Repr(v);
7071 if (temp == NULL)
7072 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007073 if (PyUnicode_Check(temp))
7074 /* nothing to do */;
7075 else if (PyString_Check(temp)) {
7076 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007077 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007079 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007081 Py_DECREF(temp);
7082 temp = unicode;
7083 if (temp == NULL)
7084 goto onError;
7085 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007086 else {
7087 Py_DECREF(temp);
7088 PyErr_SetString(PyExc_TypeError,
7089 "%s argument has non-string str()");
7090 goto onError;
7091 }
7092 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007093 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 len = PyUnicode_GET_SIZE(temp);
7095 if (prec >= 0 && len > prec)
7096 len = prec;
7097 break;
7098
7099 case 'i':
7100 case 'd':
7101 case 'u':
7102 case 'o':
7103 case 'x':
7104 case 'X':
7105 if (c == 'i')
7106 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007107 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007108 temp = formatlong(v, flags, prec, c);
7109 if (!temp)
7110 goto onError;
7111 pbuf = PyUnicode_AS_UNICODE(temp);
7112 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007113 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007115 else {
7116 pbuf = formatbuf;
7117 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7118 flags, prec, c, v);
7119 if (len < 0)
7120 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007121 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007122 }
7123 if (flags & F_ZERO)
7124 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 break;
7126
7127 case 'e':
7128 case 'E':
7129 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007130 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 case 'g':
7132 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007133 if (c == 'F')
7134 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007135 pbuf = formatbuf;
7136 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7137 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 if (len < 0)
7139 goto onError;
7140 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007141 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 fill = '0';
7143 break;
7144
7145 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007146 pbuf = formatbuf;
7147 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 if (len < 0)
7149 goto onError;
7150 break;
7151
7152 default:
7153 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007154 "unsupported format character '%c' (0x%x) "
7155 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007156 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007157 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007158 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 goto onError;
7160 }
7161 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007162 if (*pbuf == '-' || *pbuf == '+') {
7163 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 len--;
7165 }
7166 else if (flags & F_SIGN)
7167 sign = '+';
7168 else if (flags & F_BLANK)
7169 sign = ' ';
7170 else
7171 sign = 0;
7172 }
7173 if (width < len)
7174 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007175 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 reslen -= rescnt;
7177 rescnt = width + fmtcnt + 100;
7178 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007179 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007180 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007181 PyErr_NoMemory();
7182 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007183 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007184 if (_PyUnicode_Resize(&result, reslen) < 0) {
7185 Py_XDECREF(temp);
7186 goto onError;
7187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 res = PyUnicode_AS_UNICODE(result)
7189 + reslen - rescnt;
7190 }
7191 if (sign) {
7192 if (fill != ' ')
7193 *res++ = sign;
7194 rescnt--;
7195 if (width > len)
7196 width--;
7197 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007198 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7199 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007200 assert(pbuf[1] == c);
7201 if (fill != ' ') {
7202 *res++ = *pbuf++;
7203 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007204 }
Tim Petersfff53252001-04-12 18:38:48 +00007205 rescnt -= 2;
7206 width -= 2;
7207 if (width < 0)
7208 width = 0;
7209 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 if (width > len && !(flags & F_LJUST)) {
7212 do {
7213 --rescnt;
7214 *res++ = fill;
7215 } while (--width > len);
7216 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007217 if (fill == ' ') {
7218 if (sign)
7219 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007220 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007221 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007222 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007223 *res++ = *pbuf++;
7224 *res++ = *pbuf++;
7225 }
7226 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007227 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228 res += len;
7229 rescnt -= len;
7230 while (--width >= len) {
7231 --rescnt;
7232 *res++ = ' ';
7233 }
7234 if (dict && (argidx < arglen) && c != '%') {
7235 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007236 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007237 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 goto onError;
7239 }
7240 Py_XDECREF(temp);
7241 } /* '%' */
7242 } /* until end */
7243 if (argidx < arglen && !dict) {
7244 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007245 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 goto onError;
7247 }
7248
Thomas Woutersa96affe2006-03-12 00:29:36 +00007249 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7250 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 if (args_owned) {
7252 Py_DECREF(args);
7253 }
7254 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 return (PyObject *)result;
7256
7257 onError:
7258 Py_XDECREF(result);
7259 Py_DECREF(uformat);
7260 if (args_owned) {
7261 Py_DECREF(args);
7262 }
7263 return NULL;
7264}
7265
7266static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007267 (readbufferproc) unicode_buffer_getreadbuf,
7268 (writebufferproc) unicode_buffer_getwritebuf,
7269 (segcountproc) unicode_buffer_getsegcount,
7270 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271};
7272
Jeremy Hylton938ace62002-07-17 16:30:39 +00007273static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007274unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7275
Tim Peters6d6c1a32001-08-02 04:15:00 +00007276static PyObject *
7277unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7278{
7279 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007280 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007281 char *encoding = NULL;
7282 char *errors = NULL;
7283
Guido van Rossume023fe02001-08-30 03:12:59 +00007284 if (type != &PyUnicode_Type)
7285 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007286 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7287 kwlist, &x, &encoding, &errors))
7288 return NULL;
7289 if (x == NULL)
7290 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007291 if (encoding == NULL && errors == NULL)
7292 return PyObject_Unicode(x);
7293 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007294 return PyUnicode_FromEncodedObject(x, encoding, errors);
7295}
7296
Guido van Rossume023fe02001-08-30 03:12:59 +00007297static PyObject *
7298unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7299{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007300 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007301 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007302
7303 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7304 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7305 if (tmp == NULL)
7306 return NULL;
7307 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007308 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007309 if (pnew == NULL) {
7310 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007311 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007312 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007313 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7314 if (pnew->str == NULL) {
7315 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007316 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007317 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007318 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007319 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007320 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7321 pnew->length = n;
7322 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007323 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007324 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007325}
7326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007327PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007328"unicode(string [, encoding[, errors]]) -> object\n\
7329\n\
7330Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007331encoding defaults to the current default string encoding.\n\
7332errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007333
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334PyTypeObject PyUnicode_Type = {
7335 PyObject_HEAD_INIT(&PyType_Type)
7336 0, /* ob_size */
7337 "unicode", /* tp_name */
7338 sizeof(PyUnicodeObject), /* tp_size */
7339 0, /* tp_itemsize */
7340 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007341 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007343 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 0, /* tp_setattr */
7345 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007346 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007347 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007349 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 (hashfunc) unicode_hash, /* tp_hash*/
7351 0, /* tp_call*/
7352 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007353 PyObject_GenericGetAttr, /* tp_getattro */
7354 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007356 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7357 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007358 unicode_doc, /* tp_doc */
7359 0, /* tp_traverse */
7360 0, /* tp_clear */
7361 0, /* tp_richcompare */
7362 0, /* tp_weaklistoffset */
7363 0, /* tp_iter */
7364 0, /* tp_iternext */
7365 unicode_methods, /* tp_methods */
7366 0, /* tp_members */
7367 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007368 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007369 0, /* tp_dict */
7370 0, /* tp_descr_get */
7371 0, /* tp_descr_set */
7372 0, /* tp_dictoffset */
7373 0, /* tp_init */
7374 0, /* tp_alloc */
7375 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007376 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377};
7378
7379/* Initialize the Unicode implementation */
7380
Thomas Wouters78890102000-07-22 19:25:51 +00007381void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007383 int i;
7384
Fred Drakee4315f52000-05-09 19:53:39 +00007385 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007386 unicode_freelist = NULL;
7387 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007389 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007390 for (i = 0; i < 256; i++)
7391 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007392 if (PyType_Ready(&PyUnicode_Type) < 0)
7393 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394}
7395
7396/* Finalize the Unicode implementation */
7397
7398void
Thomas Wouters78890102000-07-22 19:25:51 +00007399_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007401 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007402 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007404 Py_XDECREF(unicode_empty);
7405 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007406
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007407 for (i = 0; i < 256; i++) {
7408 if (unicode_latin1[i]) {
7409 Py_DECREF(unicode_latin1[i]);
7410 unicode_latin1[i] = NULL;
7411 }
7412 }
7413
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007414 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 PyUnicodeObject *v = u;
7416 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007417 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007418 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007419 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007420 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007422 unicode_freelist = NULL;
7423 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007425
Anthony Baxterac6bd462006-04-13 02:06:09 +00007426#ifdef __cplusplus
7427}
7428#endif
7429
7430
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007431/*
7432Local variables:
7433c-basic-offset: 4
7434indent-tabs-mode: nil
7435End:
7436*/