blob: 85cbed285d7b459711c51e67c5e0cc987d59aefc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
Martin v. Löwis5cb69362006-04-14 09:08:42 +000039#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000040#include "Python.h"
41
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
51#define MAX_UNICODE_FREELIST_SIZE 1024
52
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Barry Warsaw51ac5802000-03-20 16:36:48 +000059 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Anthony Baxterac6bd462006-04-13 02:06:09 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
Guido van Rossumd57fd912000-03-10 22:53:23 +000092/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000093static PyUnicodeObject *unicode_freelist;
94static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000095
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000096/* The empty Unicode object is shared to improve performance. */
97static PyUnicodeObject *unicode_empty;
98
99/* Single character Unicode strings in the Latin-1 range are being
100 shared as well. */
101static PyUnicodeObject *unicode_latin1[256];
102
Fred Drakee4315f52000-05-09 19:53:39 +0000103/* Default encoding to use and assume when NULL is passed as encoding
104 parameter; it is initialized by _PyUnicode_Init().
105
106 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000107 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000108
109*/
Fred Drakee4315f52000-05-09 19:53:39 +0000110static char unicode_default_encoding[100];
111
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000112Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000113PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000114{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000115#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000116 return 0x10FFFF;
117#else
118 /* This is actually an illegal character, so it should
119 not be passed to unichr. */
120 return 0xFFFF;
121#endif
122}
123
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124/* --- Unicode Object ----------------------------------------------------- */
125
126static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000127int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000128 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129{
130 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000131
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000132 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000133 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000136 /* Resizing shared object (unicode_empty or single character
137 objects) in-place is not allowed. Use PyUnicode_Resize()
138 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000139 if (unicode == unicode_empty ||
140 (unicode->length == 1 &&
141 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000143 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000144 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 return -1;
146 }
147
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000153 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000154 PyErr_NoMemory();
155 return -1;
156 }
157 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000158 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 return 0;
169}
170
171/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000172 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173
174 XXX This allocator could further be enhanced by assuring that the
175 free list never reduces its size below 1.
176
177*/
178
179static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000180PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181{
182 register PyUnicodeObject *unicode;
183
Tim Petersced69f82003-09-16 20:30:58 +0000184 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 if (length == 0 && unicode_empty != NULL) {
186 Py_INCREF(unicode_empty);
187 return unicode_empty;
188 }
189
190 /* Unicode freelist & memory allocation */
191 if (unicode_freelist) {
192 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000193 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000196 /* Keep-Alive optimization: we only upsize the buffer,
197 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000198 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000199 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000200 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000204 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000206 }
207 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 }
209 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000210 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 if (unicode == NULL)
212 return NULL;
213 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
214 }
215
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000216 if (!unicode->str) {
217 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000218 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000219 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000220 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000221 * the caller fails before initializing str -- unicode_resize()
222 * reads str[0], and the Keep-Alive optimization can keep memory
223 * allocated for str alive across a call to unicode_dealloc(unicode).
224 * We don't want unicode_resize to read uninitialized memory in
225 * that case.
226 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000227 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000229 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000231 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000233
234 onError:
235 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000236 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238}
239
240static
Guido van Rossum9475a232001-10-05 20:51:39 +0000241void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000243 if (PyUnicode_CheckExact(unicode) &&
244 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000245 /* Keep-Alive optimization */
246 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 unicode->str = NULL;
249 unicode->length = 0;
250 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000251 if (unicode->defenc) {
252 Py_DECREF(unicode->defenc);
253 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 }
255 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 *(PyUnicodeObject **)unicode = unicode_freelist;
257 unicode_freelist = unicode;
258 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 }
260 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000262 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000263 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 }
265}
266
Martin v. Löwis18e16552006-02-15 17:27:45 +0000267int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268{
269 register PyUnicodeObject *v;
270
271 /* Argument checks */
272 if (unicode == NULL) {
273 PyErr_BadInternalCall();
274 return -1;
275 }
276 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000277 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 PyErr_BadInternalCall();
279 return -1;
280 }
281
282 /* Resizing unicode_empty and single character objects is not
283 possible since these are being shared. We simply return a fresh
284 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000285 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000286 (v == unicode_empty || v->length == 1)) {
287 PyUnicodeObject *w = _PyUnicode_New(length);
288 if (w == NULL)
289 return -1;
290 Py_UNICODE_COPY(w->str, v->str,
291 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000292 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000293 *unicode = (PyObject *)w;
294 return 0;
295 }
296
297 /* Note that we don't have to modify *unicode for unshared Unicode
298 objects, since we can modify them in-place. */
299 return unicode_resize(v, length);
300}
301
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject* variable to the PyObject** it expects. */
#define _PyUnicode_Resize(objvar, newlength) \
    PyUnicode_Resize(((PyObject **)(objvar)), (newlength))
305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000307 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308{
309 PyUnicodeObject *unicode;
310
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000311 /* If the Unicode data is known at construction time, we can apply
312 some optimizations which share commonly used objects. */
313 if (u != NULL) {
314
315 /* Optimization for empty strings */
316 if (size == 0 && unicode_empty != NULL) {
317 Py_INCREF(unicode_empty);
318 return (PyObject *)unicode_empty;
319 }
320
321 /* Single character Unicode objects in the Latin-1 range are
322 shared when using this constructor */
323 if (size == 1 && *u < 256) {
324 unicode = unicode_latin1[*u];
325 if (!unicode) {
326 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000327 if (!unicode)
328 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000329 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 unicode_latin1[*u] = unicode;
331 }
332 Py_INCREF(unicode);
333 return (PyObject *)unicode;
334 }
335 }
Tim Petersced69f82003-09-16 20:30:58 +0000336
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 unicode = _PyUnicode_New(size);
338 if (!unicode)
339 return NULL;
340
341 /* Copy the Unicode data into the new object */
342 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000343 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
345 return (PyObject *)unicode;
346}
347
348#ifdef HAVE_WCHAR_H
349
350PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000351 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352{
353 PyUnicodeObject *unicode;
354
355 if (w == NULL) {
356 PyErr_BadInternalCall();
357 return NULL;
358 }
359
360 unicode = _PyUnicode_New(size);
361 if (!unicode)
362 return NULL;
363
364 /* Copy the wchar_t data into the new object */
365#ifdef HAVE_USABLE_WCHAR_T
366 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000367#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368 {
369 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000370 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000372 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 *u++ = *w++;
374 }
375#endif
376
377 return (PyObject *)unicode;
378}
379
Martin v. Löwis18e16552006-02-15 17:27:45 +0000380Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
381 wchar_t *w,
382 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383{
384 if (unicode == NULL) {
385 PyErr_BadInternalCall();
386 return -1;
387 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000388
389 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000391 size = PyUnicode_GET_SIZE(unicode) + 1;
392
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393#ifdef HAVE_USABLE_WCHAR_T
394 memcpy(w, unicode->str, size * sizeof(wchar_t));
395#else
396 {
397 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000398 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000400 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 *w++ = *u++;
402 }
403#endif
404
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000405 if (size > PyUnicode_GET_SIZE(unicode))
406 return PyUnicode_GET_SIZE(unicode);
407 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 return size;
409}
410
411#endif
412
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000413PyObject *PyUnicode_FromOrdinal(int ordinal)
414{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000415 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000416
417#ifdef Py_UNICODE_WIDE
418 if (ordinal < 0 || ordinal > 0x10ffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x110000) "
421 "(wide Python build)");
422 return NULL;
423 }
424#else
425 if (ordinal < 0 || ordinal > 0xffff) {
426 PyErr_SetString(PyExc_ValueError,
427 "unichr() arg not in range(0x10000) "
428 "(narrow Python build)");
429 return NULL;
430 }
431#endif
432
Hye-Shik Chang40574832004-04-06 07:24:51 +0000433 s[0] = (Py_UNICODE)ordinal;
434 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000435}
436
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437PyObject *PyUnicode_FromObject(register PyObject *obj)
438{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000439 /* XXX Perhaps we should make this API an alias of
440 PyObject_Unicode() instead ?! */
441 if (PyUnicode_CheckExact(obj)) {
442 Py_INCREF(obj);
443 return obj;
444 }
445 if (PyUnicode_Check(obj)) {
446 /* For a Unicode subtype that's not a Unicode object,
447 return a true Unicode object with the same data. */
448 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
449 PyUnicode_GET_SIZE(obj));
450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000451 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
452}
453
454PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
455 const char *encoding,
456 const char *errors)
457{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000458 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000459 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000460 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462 if (obj == NULL) {
463 PyErr_BadInternalCall();
464 return NULL;
465 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000467#if 0
468 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000469 that no encodings is given and then redirect to
470 PyObject_Unicode() which then applies the additional logic for
471 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000472
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000473 NOTE: This API should really only be used for object which
474 represent *encoded* Unicode !
475
476 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 if (PyUnicode_Check(obj)) {
478 if (encoding) {
479 PyErr_SetString(PyExc_TypeError,
480 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000482 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000483 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000484 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000485#else
486 if (PyUnicode_Check(obj)) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
489 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491#endif
492
493 /* Coerce object */
494 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 s = PyString_AS_STRING(obj);
496 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000497 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000498 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
499 /* Overwrite the error message with something more useful in
500 case of a TypeError. */
501 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000502 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000503 "coercing to Unicode: need string or buffer, "
504 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 obj->ob_type->tp_name);
506 goto onError;
507 }
Tim Petersced69f82003-09-16 20:30:58 +0000508
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000509 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 if (len == 0) {
511 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000515 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 return v;
518
519 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521}
522
523PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000524 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000525 const char *encoding,
526 const char *errors)
527{
528 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000529
530 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000531 encoding = PyUnicode_GetDefaultEncoding();
532
533 /* Shortcuts for common default encodings */
534 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000536 else if (strcmp(encoding, "latin-1") == 0)
537 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000538#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
539 else if (strcmp(encoding, "mbcs") == 0)
540 return PyUnicode_DecodeMBCS(s, size, errors);
541#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000542 else if (strcmp(encoding, "ascii") == 0)
543 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544
545 /* Decode via the codec registry */
546 buffer = PyBuffer_FromMemory((void *)s, size);
547 if (buffer == NULL)
548 goto onError;
549 unicode = PyCodec_Decode(buffer, encoding, errors);
550 if (unicode == NULL)
551 goto onError;
552 if (!PyUnicode_Check(unicode)) {
553 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000554 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 unicode->ob_type->tp_name);
556 Py_DECREF(unicode);
557 goto onError;
558 }
559 Py_DECREF(buffer);
560 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 onError:
563 Py_XDECREF(buffer);
564 return NULL;
565}
566
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000567PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
568 const char *encoding,
569 const char *errors)
570{
571 PyObject *v;
572
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
576 }
577
578 if (encoding == NULL)
579 encoding = PyUnicode_GetDefaultEncoding();
580
581 /* Decode via the codec registry */
582 v = PyCodec_Decode(unicode, encoding, errors);
583 if (v == NULL)
584 goto onError;
585 return v;
586
587 onError:
588 return NULL;
589}
590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000592 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000593 const char *encoding,
594 const char *errors)
595{
596 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000597
Guido van Rossumd57fd912000-03-10 22:53:23 +0000598 unicode = PyUnicode_FromUnicode(s, size);
599 if (unicode == NULL)
600 return NULL;
601 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
602 Py_DECREF(unicode);
603 return v;
604}
605
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000606PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
607 const char *encoding,
608 const char *errors)
609{
610 PyObject *v;
611
612 if (!PyUnicode_Check(unicode)) {
613 PyErr_BadArgument();
614 goto onError;
615 }
616
617 if (encoding == NULL)
618 encoding = PyUnicode_GetDefaultEncoding();
619
620 /* Encode via the codec registry */
621 v = PyCodec_Encode(unicode, encoding, errors);
622 if (v == NULL)
623 goto onError;
624 return v;
625
626 onError:
627 return NULL;
628}
629
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
631 const char *encoding,
632 const char *errors)
633{
634 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000635
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636 if (!PyUnicode_Check(unicode)) {
637 PyErr_BadArgument();
638 goto onError;
639 }
Fred Drakee4315f52000-05-09 19:53:39 +0000640
Tim Petersced69f82003-09-16 20:30:58 +0000641 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000642 encoding = PyUnicode_GetDefaultEncoding();
643
644 /* Shortcuts for common default encodings */
645 if (errors == NULL) {
646 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000647 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000648 else if (strcmp(encoding, "latin-1") == 0)
649 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000650#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
651 else if (strcmp(encoding, "mbcs") == 0)
652 return PyUnicode_AsMBCSString(unicode);
653#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000654 else if (strcmp(encoding, "ascii") == 0)
655 return PyUnicode_AsASCIIString(unicode);
656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657
658 /* Encode via the codec registry */
659 v = PyCodec_Encode(unicode, encoding, errors);
660 if (v == NULL)
661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 if (!PyString_Check(v)) {
663 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000664 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 v->ob_type->tp_name);
666 Py_DECREF(v);
667 goto onError;
668 }
669 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000670
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 onError:
672 return NULL;
673}
674
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000675PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
676 const char *errors)
677{
678 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
679
680 if (v)
681 return v;
682 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
683 if (v && errors == NULL)
684 ((PyUnicodeObject *)unicode)->defenc = v;
685 return v;
686}
687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
689{
690 if (!PyUnicode_Check(unicode)) {
691 PyErr_BadArgument();
692 goto onError;
693 }
694 return PyUnicode_AS_UNICODE(unicode);
695
696 onError:
697 return NULL;
698}
699
Martin v. Löwis18e16552006-02-15 17:27:45 +0000700Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701{
702 if (!PyUnicode_Check(unicode)) {
703 PyErr_BadArgument();
704 goto onError;
705 }
706 return PyUnicode_GET_SIZE(unicode);
707
708 onError:
709 return -1;
710}
711
Thomas Wouters78890102000-07-22 19:25:51 +0000712const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000713{
714 return unicode_default_encoding;
715}
716
717int PyUnicode_SetDefaultEncoding(const char *encoding)
718{
719 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000720
Fred Drakee4315f52000-05-09 19:53:39 +0000721 /* Make sure the encoding is valid. As side effect, this also
722 loads the encoding into the codec registry cache. */
723 v = _PyCodec_Lookup(encoding);
724 if (v == NULL)
725 goto onError;
726 Py_DECREF(v);
727 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000728 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000729 sizeof(unicode_default_encoding));
730 return 0;
731
732 onError:
733 return -1;
734}
735
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000736/* error handling callback helper:
737 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000738 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000739 and adjust various state variables.
740 return 0 on success, -1 on error
741*/
742
743static
744int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
745 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000746 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
747 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000748{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000749 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750
751 PyObject *restuple = NULL;
752 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000753 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
754 Py_ssize_t requiredsize;
755 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000756 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000757 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000758 int res = -1;
759
760 if (*errorHandler == NULL) {
761 *errorHandler = PyCodec_LookupError(errors);
762 if (*errorHandler == NULL)
763 goto onError;
764 }
765
766 if (*exceptionObject == NULL) {
767 *exceptionObject = PyUnicodeDecodeError_Create(
768 encoding, input, insize, *startinpos, *endinpos, reason);
769 if (*exceptionObject == NULL)
770 goto onError;
771 }
772 else {
773 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
774 goto onError;
775 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
776 goto onError;
777 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
778 goto onError;
779 }
780
781 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
782 if (restuple == NULL)
783 goto onError;
784 if (!PyTuple_Check(restuple)) {
785 PyErr_Format(PyExc_TypeError, &argparse[4]);
786 goto onError;
787 }
788 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
789 goto onError;
790 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000791 newpos = insize+newpos;
792 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000793 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000794 goto onError;
795 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000796
797 /* need more space? (at least enough for what we
798 have+the replacement+the rest of the string (starting
799 at the new input position), so we won't have to check space
800 when there are no errors in the rest of the string) */
801 repptr = PyUnicode_AS_UNICODE(repunicode);
802 repsize = PyUnicode_GET_SIZE(repunicode);
803 requiredsize = *outpos + repsize + insize-newpos;
804 if (requiredsize > outsize) {
805 if (requiredsize<2*outsize)
806 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000807 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 goto onError;
809 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
810 }
811 *endinpos = newpos;
812 *inptr = input + newpos;
813 Py_UNICODE_COPY(*outptr, repptr, repsize);
814 *outptr += repsize;
815 *outpos += repsize;
816 /* we made it! */
817 res = 0;
818
819 onError:
820 Py_XDECREF(restuple);
821 return res;
822}
823
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000824/* --- UTF-7 Codec -------------------------------------------------------- */
825
826/* see RFC2152 for details */
827
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
846
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c cannot be written directly: it is outside 1..127, is
   classified 1 in utf7_special, or is classified 2 / 3 while encodeWS /
   encodeO is set.  Evaluates c more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 alphabet characters.  Written with
   explicit ranges instead of isalnum(): isalnum() is undefined for
   values that do not fit in an unsigned char and is locale dependent,
   while UB64() below assumes the ASCII letters and digits only.
   Evaluates c more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Six-bit value of the base64 character c (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for every complete group of six bits
   held in ch, decrementing bits accordingly.  out and bits are
   modified in place. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit a Py_UNICODE to out for every complete group of sixteen bits
   held in ch, decrementing bits accordingly.  Relies on the caller
   providing a variable named errmsg and a label named utf7Error. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000890PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000891 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 const char *errors)
893{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000894 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000895 Py_ssize_t startinpos;
896 Py_ssize_t endinpos;
897 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 const char *e;
899 PyUnicodeObject *unicode;
900 Py_UNICODE *p;
901 const char *errmsg = "";
902 int inShift = 0;
903 unsigned int bitsleft = 0;
904 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000905 int surrogate = 0;
906 PyObject *errorHandler = NULL;
907 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908
909 unicode = _PyUnicode_New(size);
910 if (!unicode)
911 return NULL;
912 if (size == 0)
913 return (PyObject *)unicode;
914
915 p = unicode->str;
916 e = s + size;
917
918 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 Py_UNICODE ch;
920 restart:
921 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000922
923 if (inShift) {
924 if ((ch == '-') || !B64CHAR(ch)) {
925 inShift = 0;
926 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000927
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000928 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
929 if (bitsleft >= 6) {
930 /* The shift sequence has a partial character in it. If
931 bitsleft < 6 then we could just classify it as padding
932 but that is not the case here */
933
934 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000935 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000936 }
937 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000938 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000939 here so indicate the potential of a misencoded character. */
940
941 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
942 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
943 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000944 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 }
946
947 if (ch == '-') {
948 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000949 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 inShift = 1;
951 }
952 } else if (SPECIAL(ch,0,0)) {
953 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000954 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 } else {
956 *p++ = ch;
957 }
958 } else {
959 charsleft = (charsleft << 6) | UB64(ch);
960 bitsleft += 6;
961 s++;
962 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
963 }
964 }
965 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000967 s++;
968 if (s < e && *s == '-') {
969 s++;
970 *p++ = '+';
971 } else
972 {
973 inShift = 1;
974 bitsleft = 0;
975 }
976 }
977 else if (SPECIAL(ch,0,0)) {
978 errmsg = "unexpected special character";
979 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000980 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 }
982 else {
983 *p++ = ch;
984 s++;
985 }
986 continue;
987 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000988 outpos = p-PyUnicode_AS_UNICODE(unicode);
989 endinpos = s-starts;
990 if (unicode_decode_call_errorhandler(
991 errors, &errorHandler,
992 "utf7", errmsg,
993 starts, size, &startinpos, &endinpos, &exc, &s,
994 (PyObject **)&unicode, &outpos, &p))
995 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 }
997
998 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000999 outpos = p-PyUnicode_AS_UNICODE(unicode);
1000 endinpos = size;
1001 if (unicode_decode_call_errorhandler(
1002 errors, &errorHandler,
1003 "utf7", "unterminated shift sequence",
1004 starts, size, &startinpos, &endinpos, &exc, &s,
1005 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 if (s < e)
1008 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001009 }
1010
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001011 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 goto onError;
1013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001014 Py_XDECREF(errorHandler);
1015 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001016 return (PyObject *)unicode;
1017
1018onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001019 Py_XDECREF(errorHandler);
1020 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021 Py_DECREF(unicode);
1022 return NULL;
1023}
1024
1025
1026PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001027 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001028 int encodeSetO,
1029 int encodeWhiteSpace,
1030 const char *errors)
1031{
1032 PyObject *v;
1033 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001034 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001036 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001037 unsigned int bitsleft = 0;
1038 unsigned long charsleft = 0;
1039 char * out;
1040 char * start;
1041
1042 if (size == 0)
1043 return PyString_FromStringAndSize(NULL, 0);
1044
1045 v = PyString_FromStringAndSize(NULL, cbAllocated);
1046 if (v == NULL)
1047 return NULL;
1048
1049 start = out = PyString_AS_STRING(v);
1050 for (;i < size; ++i) {
1051 Py_UNICODE ch = s[i];
1052
1053 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001054 if (ch == '+') {
1055 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001056 *out++ = '-';
1057 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1058 charsleft = ch;
1059 bitsleft = 16;
1060 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001061 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001063 } else {
1064 *out++ = (char) ch;
1065 }
1066 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001067 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1068 *out++ = B64(charsleft << (6-bitsleft));
1069 charsleft = 0;
1070 bitsleft = 0;
1071 /* Characters not in the BASE64 set implicitly unshift the sequence
1072 so no '-' is required, except if the character is itself a '-' */
1073 if (B64CHAR(ch) || ch == '-') {
1074 *out++ = '-';
1075 }
1076 inShift = 0;
1077 *out++ = (char) ch;
1078 } else {
1079 bitsleft += 16;
1080 charsleft = (charsleft << 16) | ch;
1081 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1082
1083 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001084 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 or '-' then the shift sequence will be terminated implicitly and we
1086 don't have to insert a '-'. */
1087
1088 if (bitsleft == 0) {
1089 if (i + 1 < size) {
1090 Py_UNICODE ch2 = s[i+1];
1091
1092 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001093
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 } else if (B64CHAR(ch2) || ch2 == '-') {
1095 *out++ = '-';
1096 inShift = 0;
1097 } else {
1098 inShift = 0;
1099 }
1100
1101 }
1102 else {
1103 *out++ = '-';
1104 inShift = 0;
1105 }
1106 }
Tim Petersced69f82003-09-16 20:30:58 +00001107 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001109 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001110 if (bitsleft) {
1111 *out++= B64(charsleft << (6-bitsleft) );
1112 *out++ = '-';
1113 }
1114
Tim Peters5de98422002-04-27 18:44:32 +00001115 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001116 return v;
1117}
1118
1119#undef SPECIAL
1120#undef B64
1121#undef B64CHAR
1122#undef UB64
1123#undef ENCODE
1124#undef DECODE
1125
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126/* --- UTF-8 Codec -------------------------------------------------------- */
1127
/* Sequence length implied by each possible UTF-8 lead byte.  The table
   is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1149
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001151 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152 const char *errors)
1153{
Walter Dörwald69652032004-09-07 20:24:22 +00001154 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1155}
1156
1157PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001158 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001159 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001160 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001161{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001162 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001164 Py_ssize_t startinpos;
1165 Py_ssize_t endinpos;
1166 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 const char *e;
1168 PyUnicodeObject *unicode;
1169 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001170 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001171 PyObject *errorHandler = NULL;
1172 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173
1174 /* Note: size will always be longer than the resulting Unicode
1175 character count */
1176 unicode = _PyUnicode_New(size);
1177 if (!unicode)
1178 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001179 if (size == 0) {
1180 if (consumed)
1181 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184
1185 /* Unpack UTF-8 encoded data */
1186 p = unicode->str;
1187 e = s + size;
1188
1189 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001190 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191
1192 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194 s++;
1195 continue;
1196 }
1197
1198 n = utf8_code_length[ch];
1199
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001201 if (consumed)
1202 break;
1203 else {
1204 errmsg = "unexpected end of data";
1205 startinpos = s-starts;
1206 endinpos = size;
1207 goto utf8Error;
1208 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210
1211 switch (n) {
1212
1213 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001215 startinpos = s-starts;
1216 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218
1219 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001220 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001221 startinpos = s-starts;
1222 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001223 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224
1225 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001226 if ((s[1] & 0xc0) != 0x80) {
1227 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 startinpos = s-starts;
1229 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 goto utf8Error;
1231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001234 startinpos = s-starts;
1235 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001236 errmsg = "illegal encoding";
1237 goto utf8Error;
1238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001240 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241 break;
1242
1243 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001244 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001245 (s[2] & 0xc0) != 0x80) {
1246 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001247 startinpos = s-starts;
1248 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001249 goto utf8Error;
1250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001252 if (ch < 0x0800) {
1253 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001254 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001255
1256 XXX For wide builds (UCS-4) we should probably try
1257 to recombine the surrogates into a single code
1258 unit.
1259 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001260 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261 startinpos = s-starts;
1262 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 goto utf8Error;
1264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001266 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001267 break;
1268
1269 case 4:
1270 if ((s[1] & 0xc0) != 0x80 ||
1271 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 (s[3] & 0xc0) != 0x80) {
1273 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 startinpos = s-starts;
1275 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001276 goto utf8Error;
1277 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001278 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1279 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1280 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001281 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001282 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001283 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001284 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001285 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 startinpos = s-starts;
1288 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001289 goto utf8Error;
1290 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001291#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001292 *p++ = (Py_UNICODE)ch;
1293#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001294 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001295
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001296 /* translate from 10000..10FFFF to 0..FFFF */
1297 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001298
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001299 /* high surrogate = top 10 bits added to D800 */
1300 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001301
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001303 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001304#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 break;
1306
1307 default:
1308 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001309 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001310 startinpos = s-starts;
1311 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 }
1314 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001316
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 outpos = p-PyUnicode_AS_UNICODE(unicode);
1319 if (unicode_decode_call_errorhandler(
1320 errors, &errorHandler,
1321 "utf8", errmsg,
1322 starts, size, &startinpos, &endinpos, &exc, &s,
1323 (PyObject **)&unicode, &outpos, &p))
1324 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325 }
Walter Dörwald69652032004-09-07 20:24:22 +00001326 if (consumed)
1327 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328
1329 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001330 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 goto onError;
1332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333 Py_XDECREF(errorHandler);
1334 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335 return (PyObject *)unicode;
1336
1337onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338 Py_XDECREF(errorHandler);
1339 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 Py_DECREF(unicode);
1341 return NULL;
1342}
1343
Tim Peters602f7402002-04-27 18:03:26 +00001344/* Allocation strategy: if the string is short, convert into a stack buffer
1345 and allocate exactly as much space needed at the end. Else allocate the
1346 maximum possible needed (4 result bytes per Unicode character), and return
1347 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001348*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001349PyObject *
1350PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001351 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001352 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353{
Tim Peters602f7402002-04-27 18:03:26 +00001354#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001355
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001357 PyObject *v; /* result string object */
1358 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001359 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001360 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001361 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 assert(s != NULL);
1364 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
Tim Peters602f7402002-04-27 18:03:26 +00001366 if (size <= MAX_SHORT_UNICHARS) {
1367 /* Write into the stack buffer; nallocated can't overflow.
1368 * At the end, we'll allocate exactly as much heap space as it
1369 * turns out we need.
1370 */
1371 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1372 v = NULL; /* will allocate after we're done */
1373 p = stackbuf;
1374 }
1375 else {
1376 /* Overallocate on the heap, and give the excess back at the end. */
1377 nallocated = size * 4;
1378 if (nallocated / 4 != size) /* overflow! */
1379 return PyErr_NoMemory();
1380 v = PyString_FromStringAndSize(NULL, nallocated);
1381 if (v == NULL)
1382 return NULL;
1383 p = PyString_AS_STRING(v);
1384 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001385
Tim Peters602f7402002-04-27 18:03:26 +00001386 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001387 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001388
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001389 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001390 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001392
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001394 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001395 *p++ = (char)(0xc0 | (ch >> 6));
1396 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001397 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001398 else {
Tim Peters602f7402002-04-27 18:03:26 +00001399 /* Encode UCS2 Unicode ordinals */
1400 if (ch < 0x10000) {
1401 /* Special case: check for high surrogate */
1402 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1403 Py_UCS4 ch2 = s[i];
1404 /* Check for low surrogate and combine the two to
1405 form a UCS4 value */
1406 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001407 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001408 i++;
1409 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 }
Tim Peters602f7402002-04-27 18:03:26 +00001411 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001413 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001414 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1415 *p++ = (char)(0x80 | (ch & 0x3f));
1416 continue;
1417 }
1418encodeUCS4:
1419 /* Encode UCS4 Unicode ordinals */
1420 *p++ = (char)(0xf0 | (ch >> 18));
1421 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1422 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1423 *p++ = (char)(0x80 | (ch & 0x3f));
1424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001426
Tim Peters602f7402002-04-27 18:03:26 +00001427 if (v == NULL) {
1428 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001429 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001430 assert(nneeded <= nallocated);
1431 v = PyString_FromStringAndSize(stackbuf, nneeded);
1432 }
1433 else {
1434 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001435 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001436 assert(nneeded <= nallocated);
1437 _PyString_Resize(&v, nneeded);
1438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001440
Tim Peters602f7402002-04-27 18:03:26 +00001441#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442}
1443
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1445{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 if (!PyUnicode_Check(unicode)) {
1447 PyErr_BadArgument();
1448 return NULL;
1449 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001450 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1451 PyUnicode_GET_SIZE(unicode),
1452 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453}
1454
1455/* --- UTF-16 Codec ------------------------------------------------------- */
1456
Tim Peters772747b2001-08-09 22:21:55 +00001457PyObject *
1458PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001459 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001460 const char *errors,
1461 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462{
Walter Dörwald69652032004-09-07 20:24:22 +00001463 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1464}
1465
1466PyObject *
1467PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001468 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001469 const char *errors,
1470 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001472{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001473 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001474 Py_ssize_t startinpos;
1475 Py_ssize_t endinpos;
1476 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 PyUnicodeObject *unicode;
1478 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001479 const unsigned char *q, *e;
1480 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001481 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001482 /* Offsets from q for retrieving byte pairs in the right order. */
1483#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1484 int ihi = 1, ilo = 0;
1485#else
1486 int ihi = 0, ilo = 1;
1487#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001488 PyObject *errorHandler = NULL;
1489 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490
1491 /* Note: size will always be longer than the resulting Unicode
1492 character count */
1493 unicode = _PyUnicode_New(size);
1494 if (!unicode)
1495 return NULL;
1496 if (size == 0)
1497 return (PyObject *)unicode;
1498
1499 /* Unpack UTF-16 encoded data */
1500 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001501 q = (unsigned char *)s;
1502 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503
1504 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001505 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001507 /* Check for BOM marks (U+FEFF) in the input and adjust current
1508 byte order setting accordingly. In native mode, the leading BOM
1509 mark is skipped, in all other modes, it is copied to the output
1510 stream as-is (giving a ZWNBSP character). */
1511 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001512 if (size >= 2) {
1513 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001514#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001515 if (bom == 0xFEFF) {
1516 q += 2;
1517 bo = -1;
1518 }
1519 else if (bom == 0xFFFE) {
1520 q += 2;
1521 bo = 1;
1522 }
Tim Petersced69f82003-09-16 20:30:58 +00001523#else
Walter Dörwald69652032004-09-07 20:24:22 +00001524 if (bom == 0xFEFF) {
1525 q += 2;
1526 bo = 1;
1527 }
1528 else if (bom == 0xFFFE) {
1529 q += 2;
1530 bo = -1;
1531 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001532#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001533 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535
Tim Peters772747b2001-08-09 22:21:55 +00001536 if (bo == -1) {
1537 /* force LE */
1538 ihi = 1;
1539 ilo = 0;
1540 }
1541 else if (bo == 1) {
1542 /* force BE */
1543 ihi = 0;
1544 ilo = 1;
1545 }
1546
1547 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001548 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001549 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001551 if (consumed)
1552 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 errmsg = "truncated data";
1554 startinpos = ((const char *)q)-starts;
1555 endinpos = ((const char *)e)-starts;
1556 goto utf16Error;
1557 /* The remaining input chars are ignored if the callback
1558 chooses to skip the input */
1559 }
1560 ch = (q[ihi] << 8) | q[ilo];
1561
Tim Peters772747b2001-08-09 22:21:55 +00001562 q += 2;
1563
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564 if (ch < 0xD800 || ch > 0xDFFF) {
1565 *p++ = ch;
1566 continue;
1567 }
1568
1569 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001570 if (q >= e) {
1571 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 startinpos = (((const char *)q)-2)-starts;
1573 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001574 goto utf16Error;
1575 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001576 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001577 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1578 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001579 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001580#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001581 *p++ = ch;
1582 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001583#else
1584 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001585#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001586 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001587 }
1588 else {
1589 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 startinpos = (((const char *)q)-4)-starts;
1591 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001592 goto utf16Error;
1593 }
1594
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001596 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001597 startinpos = (((const char *)q)-2)-starts;
1598 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001599 /* Fall through to report the error */
1600
1601 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 outpos = p-PyUnicode_AS_UNICODE(unicode);
1603 if (unicode_decode_call_errorhandler(
1604 errors, &errorHandler,
1605 "utf16", errmsg,
1606 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1607 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 }
1610
1611 if (byteorder)
1612 *byteorder = bo;
1613
Walter Dörwald69652032004-09-07 20:24:22 +00001614 if (consumed)
1615 *consumed = (const char *)q-starts;
1616
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001618 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619 goto onError;
1620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 Py_XDECREF(errorHandler);
1622 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 return (PyObject *)unicode;
1624
1625onError:
1626 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 Py_XDECREF(errorHandler);
1628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 return NULL;
1630}
1631
Tim Peters772747b2001-08-09 22:21:55 +00001632PyObject *
1633PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001634 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001635 const char *errors,
1636 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637{
1638 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001639 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001640#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001641 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001642#else
1643 const int pairs = 0;
1644#endif
Tim Peters772747b2001-08-09 22:21:55 +00001645 /* Offsets from p for storing byte pairs in the right order. */
1646#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1647 int ihi = 1, ilo = 0;
1648#else
1649 int ihi = 0, ilo = 1;
1650#endif
1651
1652#define STORECHAR(CH) \
1653 do { \
1654 p[ihi] = ((CH) >> 8) & 0xff; \
1655 p[ilo] = (CH) & 0xff; \
1656 p += 2; \
1657 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001659#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001660 for (i = pairs = 0; i < size; i++)
1661 if (s[i] >= 0x10000)
1662 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001663#endif
Tim Petersced69f82003-09-16 20:30:58 +00001664 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001665 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 if (v == NULL)
1667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001668
Tim Peters772747b2001-08-09 22:21:55 +00001669 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001671 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001672 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001673 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001674
1675 if (byteorder == -1) {
1676 /* force LE */
1677 ihi = 1;
1678 ilo = 0;
1679 }
1680 else if (byteorder == 1) {
1681 /* force BE */
1682 ihi = 0;
1683 ilo = 1;
1684 }
1685
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001686 while (size-- > 0) {
1687 Py_UNICODE ch = *s++;
1688 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001689#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001690 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001691 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1692 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#endif
Tim Peters772747b2001-08-09 22:21:55 +00001695 STORECHAR(ch);
1696 if (ch2)
1697 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001700#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701}
1702
1703PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1704{
1705 if (!PyUnicode_Check(unicode)) {
1706 PyErr_BadArgument();
1707 return NULL;
1708 }
1709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1710 PyUnicode_GET_SIZE(unicode),
1711 NULL,
1712 0);
1713}
1714
1715/* --- Unicode Escape Codec ----------------------------------------------- */
1716
Fredrik Lundh06d12682001-01-24 07:59:11 +00001717static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001718
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001720 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721 const char *errors)
1722{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001724 Py_ssize_t startinpos;
1725 Py_ssize_t endinpos;
1726 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001727 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001731 char* message;
1732 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733 PyObject *errorHandler = NULL;
1734 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001735
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 /* Escaped strings will always be longer than the resulting
1737 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 length after conversion to the true value.
1739 (but if the error callback returns a long replacement string
1740 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 v = _PyUnicode_New(size);
1742 if (v == NULL)
1743 goto onError;
1744 if (size == 0)
1745 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001746
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001749
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 while (s < end) {
1751 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001752 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 /* Non-escape characters are interpreted as Unicode ordinals */
1756 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 continue;
1759 }
1760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762 /* \ - Escapes */
1763 s++;
1764 switch (*s++) {
1765
1766 /* \x escapes */
1767 case '\n': break;
1768 case '\\': *p++ = '\\'; break;
1769 case '\'': *p++ = '\''; break;
1770 case '\"': *p++ = '\"'; break;
1771 case 'b': *p++ = '\b'; break;
1772 case 'f': *p++ = '\014'; break; /* FF */
1773 case 't': *p++ = '\t'; break;
1774 case 'n': *p++ = '\n'; break;
1775 case 'r': *p++ = '\r'; break;
1776 case 'v': *p++ = '\013'; break; /* VT */
1777 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1778
1779 /* \OOO (octal) escapes */
1780 case '0': case '1': case '2': case '3':
1781 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001782 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001784 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001786 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001788 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 break;
1790
Fredrik Lundhccc74732001-02-18 22:13:49 +00001791 /* hex escapes */
1792 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001794 digits = 2;
1795 message = "truncated \\xXX escape";
1796 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800 digits = 4;
1801 message = "truncated \\uXXXX escape";
1802 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
Fredrik Lundhccc74732001-02-18 22:13:49 +00001804 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001805 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806 digits = 8;
1807 message = "truncated \\UXXXXXXXX escape";
1808 hexescape:
1809 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 outpos = p-PyUnicode_AS_UNICODE(v);
1811 if (s+digits>end) {
1812 endinpos = size;
1813 if (unicode_decode_call_errorhandler(
1814 errors, &errorHandler,
1815 "unicodeescape", "end of string in escape sequence",
1816 starts, size, &startinpos, &endinpos, &exc, &s,
1817 (PyObject **)&v, &outpos, &p))
1818 goto onError;
1819 goto nextByte;
1820 }
1821 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001823 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 endinpos = (s+i+1)-starts;
1825 if (unicode_decode_call_errorhandler(
1826 errors, &errorHandler,
1827 "unicodeescape", message,
1828 starts, size, &startinpos, &endinpos, &exc, &s,
1829 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001830 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001832 }
1833 chr = (chr<<4) & ~0xF;
1834 if (c >= '0' && c <= '9')
1835 chr += c - '0';
1836 else if (c >= 'a' && c <= 'f')
1837 chr += 10 + c - 'a';
1838 else
1839 chr += 10 + c - 'A';
1840 }
1841 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001842 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 /* _decoding_error will have already written into the
1844 target buffer. */
1845 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001847 /* when we get here, chr is a 32-bit unicode character */
1848 if (chr <= 0xffff)
1849 /* UCS-2 character */
1850 *p++ = (Py_UNICODE) chr;
1851 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001852 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001853 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001854#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001855 *p++ = chr;
1856#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 chr -= 0x10000L;
1858 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001859 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001860#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001861 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 endinpos = s-starts;
1863 outpos = p-PyUnicode_AS_UNICODE(v);
1864 if (unicode_decode_call_errorhandler(
1865 errors, &errorHandler,
1866 "unicodeescape", "illegal Unicode character",
1867 starts, size, &startinpos, &endinpos, &exc, &s,
1868 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001869 goto onError;
1870 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001871 break;
1872
1873 /* \N{name} */
1874 case 'N':
1875 message = "malformed \\N character escape";
1876 if (ucnhash_CAPI == NULL) {
1877 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001878 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 m = PyImport_ImportModule("unicodedata");
1880 if (m == NULL)
1881 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001882 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001883 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001884 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001885 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001887 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001888 if (ucnhash_CAPI == NULL)
1889 goto ucnhashError;
1890 }
1891 if (*s == '{') {
1892 const char *start = s+1;
1893 /* look for the closing brace */
1894 while (*s != '}' && s < end)
1895 s++;
1896 if (s > start && s < end && *s == '}') {
1897 /* found a name. look it up in the unicode database */
1898 message = "unknown Unicode character name";
1899 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001900 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001901 goto store;
1902 }
1903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 endinpos = s-starts;
1905 outpos = p-PyUnicode_AS_UNICODE(v);
1906 if (unicode_decode_call_errorhandler(
1907 errors, &errorHandler,
1908 "unicodeescape", message,
1909 starts, size, &startinpos, &endinpos, &exc, &s,
1910 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001912 break;
1913
1914 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001915 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001916 message = "\\ at end of string";
1917 s--;
1918 endinpos = s-starts;
1919 outpos = p-PyUnicode_AS_UNICODE(v);
1920 if (unicode_decode_call_errorhandler(
1921 errors, &errorHandler,
1922 "unicodeescape", message,
1923 starts, size, &startinpos, &endinpos, &exc, &s,
1924 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001925 goto onError;
1926 }
1927 else {
1928 *p++ = '\\';
1929 *p++ = (unsigned char)s[-1];
1930 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001933 nextByte:
1934 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001936 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001937 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001938 Py_XDECREF(errorHandler);
1939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001941
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001943 PyErr_SetString(
1944 PyExc_UnicodeError,
1945 "\\N escapes not supported (can't load unicodedata module)"
1946 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001947 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 Py_XDECREF(errorHandler);
1949 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001950 return NULL;
1951
Fredrik Lundhccc74732001-02-18 22:13:49 +00001952onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 Py_XDECREF(errorHandler);
1955 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 return NULL;
1957}
1958
1959/* Return a Unicode-Escape string version of the Unicode object.
1960
1961 If quotes is true, the string is enclosed in u"" or u'' quotes as
1962 appropriate.
1963
1964*/
1965
Barry Warsaw51ac5802000-03-20 16:36:48 +00001966static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001967 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00001968 Py_UNICODE ch);
1969
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970static
1971PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001972 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 int quotes)
1974{
1975 PyObject *repr;
1976 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001978 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979
1980 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1981 if (repr == NULL)
1982 return NULL;
1983
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001984 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985
1986 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001988 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 !findchar(s, size, '"')) ? '"' : '\'';
1990 }
1991 while (size-- > 0) {
1992 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001993
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001994 /* Escape quotes and backslashes */
1995 if ((quotes &&
1996 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 *p++ = '\\';
1998 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001999 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002000 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002001
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002002#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002003 /* Map 21-bit characters to '\U00xxxxxx' */
2004 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002005 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002006
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002007 /* Resize the string if necessary */
2008 if (offset + 12 > PyString_GET_SIZE(repr)) {
2009 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002010 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002011 p = PyString_AS_STRING(repr) + offset;
2012 }
2013
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002014 *p++ = '\\';
2015 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002016 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2021 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2022 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002023 *p++ = hexdigit[ch & 0x0000000F];
2024 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002025 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002026#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002027 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2028 else if (ch >= 0xD800 && ch < 0xDC00) {
2029 Py_UNICODE ch2;
2030 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002032 ch2 = *s++;
2033 size--;
2034 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2035 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2036 *p++ = '\\';
2037 *p++ = 'U';
2038 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2043 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2044 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2045 *p++ = hexdigit[ucs & 0x0000000F];
2046 continue;
2047 }
2048 /* Fall through: isolated surrogates are copied as-is */
2049 s--;
2050 size++;
2051 }
2052
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002054 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 *p++ = '\\';
2056 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002057 *p++ = hexdigit[(ch >> 12) & 0x000F];
2058 *p++ = hexdigit[(ch >> 8) & 0x000F];
2059 *p++ = hexdigit[(ch >> 4) & 0x000F];
2060 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002062
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002063 /* Map special whitespace to '\t', \n', '\r' */
2064 else if (ch == '\t') {
2065 *p++ = '\\';
2066 *p++ = 't';
2067 }
2068 else if (ch == '\n') {
2069 *p++ = '\\';
2070 *p++ = 'n';
2071 }
2072 else if (ch == '\r') {
2073 *p++ = '\\';
2074 *p++ = 'r';
2075 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002076
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002077 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002078 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002080 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002081 *p++ = hexdigit[(ch >> 4) & 0x000F];
2082 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002083 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002084
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 /* Copy everything else as-is */
2086 else
2087 *p++ = (char) ch;
2088 }
2089 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002090 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091
2092 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002093 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 return repr;
2095}
2096
2097PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002098 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099{
2100 return unicodeescape_string(s, size, 0);
2101}
2102
2103PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2104{
2105 if (!PyUnicode_Check(unicode)) {
2106 PyErr_BadArgument();
2107 return NULL;
2108 }
2109 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2110 PyUnicode_GET_SIZE(unicode));
2111}
2112
2113/* --- Raw Unicode Escape Codec ------------------------------------------- */
2114
2115PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002116 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 const char *errors)
2118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002120 Py_ssize_t startinpos;
2121 Py_ssize_t endinpos;
2122 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 const char *end;
2126 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 PyObject *errorHandler = NULL;
2128 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002129
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 /* Escaped strings will always be longer than the resulting
2131 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 length after conversion to the true value. (But decoding error
2133 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 v = _PyUnicode_New(size);
2135 if (v == NULL)
2136 goto onError;
2137 if (size == 0)
2138 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002139 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 end = s + size;
2141 while (s < end) {
2142 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002143 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002145 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146
2147 /* Non-escape characters are interpreted as Unicode ordinals */
2148 if (*s != '\\') {
2149 *p++ = (unsigned char)*s++;
2150 continue;
2151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153
2154 /* \u-escapes are only interpreted iff the number of leading
2155 backslashes if odd */
2156 bs = s;
2157 for (;s < end;) {
2158 if (*s != '\\')
2159 break;
2160 *p++ = (unsigned char)*s++;
2161 }
2162 if (((s - bs) & 1) == 0 ||
2163 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002164 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 continue;
2166 }
2167 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002168 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 s++;
2170
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002173 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 endinpos = s-starts;
2177 if (unicode_decode_call_errorhandler(
2178 errors, &errorHandler,
2179 "rawunicodeescape", "truncated \\uXXXX",
2180 starts, size, &startinpos, &endinpos, &exc, &s,
2181 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 }
2185 x = (x<<4) & ~0xF;
2186 if (c >= '0' && c <= '9')
2187 x += c - '0';
2188 else if (c >= 'a' && c <= 'f')
2189 x += 10 + c - 'a';
2190 else
2191 x += 10 + c - 'A';
2192 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002193#ifndef Py_UNICODE_WIDE
2194 if (x > 0x10000) {
2195 if (unicode_decode_call_errorhandler(
2196 errors, &errorHandler,
2197 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2198 starts, size, &startinpos, &endinpos, &exc, &s,
2199 (PyObject **)&v, &outpos, &p))
2200 goto onError;
2201 }
2202#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002203 *p++ = x;
2204 nextByte:
2205 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002207 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 Py_XDECREF(errorHandler);
2210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002212
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 onError:
2214 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 Py_XDECREF(errorHandler);
2216 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 return NULL;
2218}
2219
2220PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002221 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222{
2223 PyObject *repr;
2224 char *p;
2225 char *q;
2226
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002227 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002229#ifdef Py_UNICODE_WIDE
2230 repr = PyString_FromStringAndSize(NULL, 10 * size);
2231#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002233#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 if (repr == NULL)
2235 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002236 if (size == 0)
2237 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238
2239 p = q = PyString_AS_STRING(repr);
2240 while (size-- > 0) {
2241 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002242#ifdef Py_UNICODE_WIDE
2243 /* Map 32-bit characters to '\Uxxxxxxxx' */
2244 if (ch >= 0x10000) {
2245 *p++ = '\\';
2246 *p++ = 'U';
2247 *p++ = hexdigit[(ch >> 28) & 0xf];
2248 *p++ = hexdigit[(ch >> 24) & 0xf];
2249 *p++ = hexdigit[(ch >> 20) & 0xf];
2250 *p++ = hexdigit[(ch >> 16) & 0xf];
2251 *p++ = hexdigit[(ch >> 12) & 0xf];
2252 *p++ = hexdigit[(ch >> 8) & 0xf];
2253 *p++ = hexdigit[(ch >> 4) & 0xf];
2254 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 else
2257#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 /* Map 16-bit characters to '\uxxxx' */
2259 if (ch >= 256) {
2260 *p++ = '\\';
2261 *p++ = 'u';
2262 *p++ = hexdigit[(ch >> 12) & 0xf];
2263 *p++ = hexdigit[(ch >> 8) & 0xf];
2264 *p++ = hexdigit[(ch >> 4) & 0xf];
2265 *p++ = hexdigit[ch & 15];
2266 }
2267 /* Copy everything else as-is */
2268 else
2269 *p++ = (char) ch;
2270 }
2271 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002272 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 return repr;
2274}
2275
2276PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2277{
2278 if (!PyUnicode_Check(unicode)) {
2279 PyErr_BadArgument();
2280 return NULL;
2281 }
2282 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2283 PyUnicode_GET_SIZE(unicode));
2284}
2285
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002286/* --- Unicode Internal Codec ------------------------------------------- */
2287
2288PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002289 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002290 const char *errors)
2291{
2292 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002293 Py_ssize_t startinpos;
2294 Py_ssize_t endinpos;
2295 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002296 PyUnicodeObject *v;
2297 Py_UNICODE *p;
2298 const char *end;
2299 const char *reason;
2300 PyObject *errorHandler = NULL;
2301 PyObject *exc = NULL;
2302
Neal Norwitzd43069c2006-01-08 01:12:10 +00002303#ifdef Py_UNICODE_WIDE
2304 Py_UNICODE unimax = PyUnicode_GetMax();
2305#endif
2306
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002307 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2308 if (v == NULL)
2309 goto onError;
2310 if (PyUnicode_GetSize((PyObject *)v) == 0)
2311 return (PyObject *)v;
2312 p = PyUnicode_AS_UNICODE(v);
2313 end = s + size;
2314
2315 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002316 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002317 /* We have to sanity check the raw data, otherwise doom looms for
2318 some malformed UCS-4 data. */
2319 if (
2320 #ifdef Py_UNICODE_WIDE
2321 *p > unimax || *p < 0 ||
2322 #endif
2323 end-s < Py_UNICODE_SIZE
2324 )
2325 {
2326 startinpos = s - starts;
2327 if (end-s < Py_UNICODE_SIZE) {
2328 endinpos = end-starts;
2329 reason = "truncated input";
2330 }
2331 else {
2332 endinpos = s - starts + Py_UNICODE_SIZE;
2333 reason = "illegal code point (> 0x10FFFF)";
2334 }
2335 outpos = p - PyUnicode_AS_UNICODE(v);
2336 if (unicode_decode_call_errorhandler(
2337 errors, &errorHandler,
2338 "unicode_internal", reason,
2339 starts, size, &startinpos, &endinpos, &exc, &s,
2340 (PyObject **)&v, &outpos, &p)) {
2341 goto onError;
2342 }
2343 }
2344 else {
2345 p++;
2346 s += Py_UNICODE_SIZE;
2347 }
2348 }
2349
Martin v. Löwis412fb672006-04-13 06:34:32 +00002350 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002351 goto onError;
2352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
2354 return (PyObject *)v;
2355
2356 onError:
2357 Py_XDECREF(v);
2358 Py_XDECREF(errorHandler);
2359 Py_XDECREF(exc);
2360 return NULL;
2361}
2362
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363/* --- Latin-1 Codec ------------------------------------------------------ */
2364
2365PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002366 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 const char *errors)
2368{
2369 PyUnicodeObject *v;
2370 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002371
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002373 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002374 Py_UNICODE r = *(unsigned char*)s;
2375 return PyUnicode_FromUnicode(&r, 1);
2376 }
2377
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378 v = _PyUnicode_New(size);
2379 if (v == NULL)
2380 goto onError;
2381 if (size == 0)
2382 return (PyObject *)v;
2383 p = PyUnicode_AS_UNICODE(v);
2384 while (size-- > 0)
2385 *p++ = (unsigned char)*s++;
2386 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002387
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388 onError:
2389 Py_XDECREF(v);
2390 return NULL;
2391}
2392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393/* create or adjust a UnicodeEncodeError */
2394static void make_encode_exception(PyObject **exceptionObject,
2395 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002396 const Py_UNICODE *unicode, Py_ssize_t size,
2397 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002400 if (*exceptionObject == NULL) {
2401 *exceptionObject = PyUnicodeEncodeError_Create(
2402 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 }
2404 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002405 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2406 goto onError;
2407 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2408 goto onError;
2409 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2410 goto onError;
2411 return;
2412 onError:
2413 Py_DECREF(*exceptionObject);
2414 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 }
2416}
2417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002418/* raises a UnicodeEncodeError */
2419static void raise_encode_exception(PyObject **exceptionObject,
2420 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002421 const Py_UNICODE *unicode, Py_ssize_t size,
2422 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002423 const char *reason)
2424{
2425 make_encode_exception(exceptionObject,
2426 encoding, unicode, size, startpos, endpos, reason);
2427 if (*exceptionObject != NULL)
2428 PyCodec_StrictErrors(*exceptionObject);
2429}
2430
2431/* error handling callback helper:
2432 build arguments, call the callback and check the arguments,
2433 put the result into newpos and return the replacement string, which
2434 has to be freed by the caller */
2435static PyObject *unicode_encode_call_errorhandler(const char *errors,
2436 PyObject **errorHandler,
2437 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002438 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2439 Py_ssize_t startpos, Py_ssize_t endpos,
2440 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002442 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002443
2444 PyObject *restuple;
2445 PyObject *resunicode;
2446
2447 if (*errorHandler == NULL) {
2448 *errorHandler = PyCodec_LookupError(errors);
2449 if (*errorHandler == NULL)
2450 return NULL;
2451 }
2452
2453 make_encode_exception(exceptionObject,
2454 encoding, unicode, size, startpos, endpos, reason);
2455 if (*exceptionObject == NULL)
2456 return NULL;
2457
2458 restuple = PyObject_CallFunctionObjArgs(
2459 *errorHandler, *exceptionObject, NULL);
2460 if (restuple == NULL)
2461 return NULL;
2462 if (!PyTuple_Check(restuple)) {
2463 PyErr_Format(PyExc_TypeError, &argparse[4]);
2464 Py_DECREF(restuple);
2465 return NULL;
2466 }
2467 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2468 &resunicode, newpos)) {
2469 Py_DECREF(restuple);
2470 return NULL;
2471 }
2472 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002473 *newpos = size+*newpos;
2474 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002475 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002476 Py_DECREF(restuple);
2477 return NULL;
2478 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 Py_INCREF(resunicode);
2480 Py_DECREF(restuple);
2481 return resunicode;
2482}
2483
2484static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002485 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 const char *errors,
2487 int limit)
2488{
2489 /* output object */
2490 PyObject *res;
2491 /* pointers to the beginning and end+1 of input */
2492 const Py_UNICODE *startp = p;
2493 const Py_UNICODE *endp = p + size;
2494 /* pointer to the beginning of the unencodable characters */
2495 /* const Py_UNICODE *badp = NULL; */
2496 /* pointer into the output */
2497 char *str;
2498 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002499 Py_ssize_t respos = 0;
2500 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002501 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2502 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002503 PyObject *errorHandler = NULL;
2504 PyObject *exc = NULL;
2505 /* the following variable is used for caching string comparisons
2506 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2507 int known_errorHandler = -1;
2508
2509 /* allocate enough for a simple encoding without
2510 replacements, if we need more, we'll resize */
2511 res = PyString_FromStringAndSize(NULL, size);
2512 if (res == NULL)
2513 goto onError;
2514 if (size == 0)
2515 return res;
2516 str = PyString_AS_STRING(res);
2517 ressize = size;
2518
2519 while (p<endp) {
2520 Py_UNICODE c = *p;
2521
2522 /* can we encode this? */
2523 if (c<limit) {
2524 /* no overflow check, because we know that the space is enough */
2525 *str++ = (char)c;
2526 ++p;
2527 }
2528 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002529 Py_ssize_t unicodepos = p-startp;
2530 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002532 Py_ssize_t repsize;
2533 Py_ssize_t newpos;
2534 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 Py_UNICODE *uni2;
2536 /* startpos for collecting unencodable chars */
2537 const Py_UNICODE *collstart = p;
2538 const Py_UNICODE *collend = p;
2539 /* find all unecodable characters */
2540 while ((collend < endp) && ((*collend)>=limit))
2541 ++collend;
2542 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2543 if (known_errorHandler==-1) {
2544 if ((errors==NULL) || (!strcmp(errors, "strict")))
2545 known_errorHandler = 1;
2546 else if (!strcmp(errors, "replace"))
2547 known_errorHandler = 2;
2548 else if (!strcmp(errors, "ignore"))
2549 known_errorHandler = 3;
2550 else if (!strcmp(errors, "xmlcharrefreplace"))
2551 known_errorHandler = 4;
2552 else
2553 known_errorHandler = 0;
2554 }
2555 switch (known_errorHandler) {
2556 case 1: /* strict */
2557 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2558 goto onError;
2559 case 2: /* replace */
2560 while (collstart++<collend)
2561 *str++ = '?'; /* fall through */
2562 case 3: /* ignore */
2563 p = collend;
2564 break;
2565 case 4: /* xmlcharrefreplace */
2566 respos = str-PyString_AS_STRING(res);
2567 /* determine replacement size (temporarily (mis)uses p) */
2568 for (p = collstart, repsize = 0; p < collend; ++p) {
2569 if (*p<10)
2570 repsize += 2+1+1;
2571 else if (*p<100)
2572 repsize += 2+2+1;
2573 else if (*p<1000)
2574 repsize += 2+3+1;
2575 else if (*p<10000)
2576 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002577#ifndef Py_UNICODE_WIDE
2578 else
2579 repsize += 2+5+1;
2580#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581 else if (*p<100000)
2582 repsize += 2+5+1;
2583 else if (*p<1000000)
2584 repsize += 2+6+1;
2585 else
2586 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002587#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 }
2589 requiredsize = respos+repsize+(endp-collend);
2590 if (requiredsize > ressize) {
2591 if (requiredsize<2*ressize)
2592 requiredsize = 2*ressize;
2593 if (_PyString_Resize(&res, requiredsize))
2594 goto onError;
2595 str = PyString_AS_STRING(res) + respos;
2596 ressize = requiredsize;
2597 }
2598 /* generate replacement (temporarily (mis)uses p) */
2599 for (p = collstart; p < collend; ++p) {
2600 str += sprintf(str, "&#%d;", (int)*p);
2601 }
2602 p = collend;
2603 break;
2604 default:
2605 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2606 encoding, reason, startp, size, &exc,
2607 collstart-startp, collend-startp, &newpos);
2608 if (repunicode == NULL)
2609 goto onError;
2610 /* need more space? (at least enough for what we
2611 have+the replacement+the rest of the string, so
2612 we won't have to check space for encodable characters) */
2613 respos = str-PyString_AS_STRING(res);
2614 repsize = PyUnicode_GET_SIZE(repunicode);
2615 requiredsize = respos+repsize+(endp-collend);
2616 if (requiredsize > ressize) {
2617 if (requiredsize<2*ressize)
2618 requiredsize = 2*ressize;
2619 if (_PyString_Resize(&res, requiredsize)) {
2620 Py_DECREF(repunicode);
2621 goto onError;
2622 }
2623 str = PyString_AS_STRING(res) + respos;
2624 ressize = requiredsize;
2625 }
2626 /* check if there is anything unencodable in the replacement
2627 and copy it to the output */
2628 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2629 c = *uni2;
2630 if (c >= limit) {
2631 raise_encode_exception(&exc, encoding, startp, size,
2632 unicodepos, unicodepos+1, reason);
2633 Py_DECREF(repunicode);
2634 goto onError;
2635 }
2636 *str = (char)c;
2637 }
2638 p = startp + newpos;
2639 Py_DECREF(repunicode);
2640 }
2641 }
2642 }
2643 /* Resize if we allocated to much */
2644 respos = str-PyString_AS_STRING(res);
2645 if (respos<ressize)
2646 /* If this falls res will be NULL */
2647 _PyString_Resize(&res, respos);
2648 Py_XDECREF(errorHandler);
2649 Py_XDECREF(exc);
2650 return res;
2651
2652 onError:
2653 Py_XDECREF(res);
2654 Py_XDECREF(errorHandler);
2655 Py_XDECREF(exc);
2656 return NULL;
2657}
2658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002660 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 const char *errors)
2662{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664}
2665
2666PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2667{
2668 if (!PyUnicode_Check(unicode)) {
2669 PyErr_BadArgument();
2670 return NULL;
2671 }
2672 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2673 PyUnicode_GET_SIZE(unicode),
2674 NULL);
2675}
2676
2677/* --- 7-bit ASCII Codec -------------------------------------------------- */
2678
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002680 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 const char *errors)
2682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 PyUnicodeObject *v;
2685 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002686 Py_ssize_t startinpos;
2687 Py_ssize_t endinpos;
2688 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 const char *e;
2690 PyObject *errorHandler = NULL;
2691 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002692
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002694 if (size == 1 && *(unsigned char*)s < 128) {
2695 Py_UNICODE r = *(unsigned char*)s;
2696 return PyUnicode_FromUnicode(&r, 1);
2697 }
Tim Petersced69f82003-09-16 20:30:58 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 v = _PyUnicode_New(size);
2700 if (v == NULL)
2701 goto onError;
2702 if (size == 0)
2703 return (PyObject *)v;
2704 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 e = s + size;
2706 while (s < e) {
2707 register unsigned char c = (unsigned char)*s;
2708 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 ++s;
2711 }
2712 else {
2713 startinpos = s-starts;
2714 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002715 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 if (unicode_decode_call_errorhandler(
2717 errors, &errorHandler,
2718 "ascii", "ordinal not in range(128)",
2719 starts, size, &startinpos, &endinpos, &exc, &s,
2720 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002724 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002725 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002726 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 Py_XDECREF(errorHandler);
2728 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002730
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 onError:
2732 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 Py_XDECREF(errorHandler);
2734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 return NULL;
2736}
2737
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002739 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 const char *errors)
2741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743}
2744
2745PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2746{
2747 if (!PyUnicode_Check(unicode)) {
2748 PyErr_BadArgument();
2749 return NULL;
2750 }
2751 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2752 PyUnicode_GET_SIZE(unicode),
2753 NULL);
2754}
2755
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002756#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002757
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002758/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002759
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002760PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002761 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002762 const char *errors)
2763{
2764 PyUnicodeObject *v;
2765 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002767
2768 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002769 assert(size < INT_MAX);
2770 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002771 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002772 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2773
2774 v = _PyUnicode_New(usize);
2775 if (v == NULL)
2776 return NULL;
2777 if (usize == 0)
2778 return (PyObject *)v;
2779 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002780 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002781 Py_DECREF(v);
2782 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2783 }
2784
2785 return (PyObject *)v;
2786}
2787
2788PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002789 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002790 const char *errors)
2791{
2792 PyObject *repr;
2793 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002794 DWORD mbcssize;
2795
2796 /* If there are no characters, bail now! */
2797 if (size==0)
2798 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002799
2800 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002801 assert(size<INT_MAX);
2802 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002803 if (mbcssize==0)
2804 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2805
2806 repr = PyString_FromStringAndSize(NULL, mbcssize);
2807 if (repr == NULL)
2808 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002809 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002810 return repr;
2811
2812 /* Do the conversion */
2813 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002814 assert(size < INT_MAX);
2815 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002816 Py_DECREF(repr);
2817 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2818 }
2819 return repr;
2820}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002821
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002822PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2823{
2824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
2826 return NULL;
2827 }
2828 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2829 PyUnicode_GET_SIZE(unicode),
2830 NULL);
2831}
2832
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002833#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002834
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835/* --- Character Mapping Codec -------------------------------------------- */
2836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002838 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 PyObject *mapping,
2840 const char *errors)
2841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002843 Py_ssize_t startinpos;
2844 Py_ssize_t endinpos;
2845 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 PyUnicodeObject *v;
2848 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 PyObject *errorHandler = NULL;
2851 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002852 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002854
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 /* Default to Latin-1 */
2856 if (mapping == NULL)
2857 return PyUnicode_DecodeLatin1(s, size, errors);
2858
2859 v = _PyUnicode_New(size);
2860 if (v == NULL)
2861 goto onError;
2862 if (size == 0)
2863 return (PyObject *)v;
2864 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002866 if (PyUnicode_CheckExact(mapping)) {
2867 mapstring = PyUnicode_AS_UNICODE(mapping);
2868 maplen = PyUnicode_GET_SIZE(mapping);
2869 while (s < e) {
2870 unsigned char ch = *s;
2871 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002873 if (ch < maplen)
2874 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002876 if (x == 0xfffe) {
2877 /* undefined mapping */
2878 outpos = p-PyUnicode_AS_UNICODE(v);
2879 startinpos = s-starts;
2880 endinpos = startinpos+1;
2881 if (unicode_decode_call_errorhandler(
2882 errors, &errorHandler,
2883 "charmap", "character maps to <undefined>",
2884 starts, size, &startinpos, &endinpos, &exc, &s,
2885 (PyObject **)&v, &outpos, &p)) {
2886 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002887 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002888 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002889 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002890 *p++ = x;
2891 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002893 }
2894 else {
2895 while (s < e) {
2896 unsigned char ch = *s;
2897 PyObject *w, *x;
2898
2899 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2900 w = PyInt_FromLong((long)ch);
2901 if (w == NULL)
2902 goto onError;
2903 x = PyObject_GetItem(mapping, w);
2904 Py_DECREF(w);
2905 if (x == NULL) {
2906 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2907 /* No mapping found means: mapping is undefined. */
2908 PyErr_Clear();
2909 x = Py_None;
2910 Py_INCREF(x);
2911 } else
2912 goto onError;
2913 }
2914
2915 /* Apply mapping */
2916 if (PyInt_Check(x)) {
2917 long value = PyInt_AS_LONG(x);
2918 if (value < 0 || value > 65535) {
2919 PyErr_SetString(PyExc_TypeError,
2920 "character mapping must be in range(65536)");
2921 Py_DECREF(x);
2922 goto onError;
2923 }
2924 *p++ = (Py_UNICODE)value;
2925 }
2926 else if (x == Py_None) {
2927 /* undefined mapping */
2928 outpos = p-PyUnicode_AS_UNICODE(v);
2929 startinpos = s-starts;
2930 endinpos = startinpos+1;
2931 if (unicode_decode_call_errorhandler(
2932 errors, &errorHandler,
2933 "charmap", "character maps to <undefined>",
2934 starts, size, &startinpos, &endinpos, &exc, &s,
2935 (PyObject **)&v, &outpos, &p)) {
2936 Py_DECREF(x);
2937 goto onError;
2938 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002939 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002940 continue;
2941 }
2942 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002943 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002944
2945 if (targetsize == 1)
2946 /* 1-1 mapping */
2947 *p++ = *PyUnicode_AS_UNICODE(x);
2948
2949 else if (targetsize > 1) {
2950 /* 1-n mapping */
2951 if (targetsize > extrachars) {
2952 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
2954 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002955 (targetsize << 2);
2956 extrachars += needed;
2957 if (_PyUnicode_Resize(&v,
2958 PyUnicode_GET_SIZE(v) + needed) < 0) {
2959 Py_DECREF(x);
2960 goto onError;
2961 }
2962 p = PyUnicode_AS_UNICODE(v) + oldpos;
2963 }
2964 Py_UNICODE_COPY(p,
2965 PyUnicode_AS_UNICODE(x),
2966 targetsize);
2967 p += targetsize;
2968 extrachars -= targetsize;
2969 }
2970 /* 1-0 mapping: skip the character */
2971 }
2972 else {
2973 /* wrong return value */
2974 PyErr_SetString(PyExc_TypeError,
2975 "character mapping must return integer, None or unicode");
2976 Py_DECREF(x);
2977 goto onError;
2978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002980 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
2983 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 Py_XDECREF(errorHandler);
2987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002991 Py_XDECREF(errorHandler);
2992 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 Py_XDECREF(v);
2994 return NULL;
2995}
2996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997/* Lookup the character ch in the mapping. If the character
2998 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00002999 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002 PyObject *w = PyInt_FromLong((long)c);
3003 PyObject *x;
3004
3005 if (w == NULL)
3006 return NULL;
3007 x = PyObject_GetItem(mapping, w);
3008 Py_DECREF(w);
3009 if (x == NULL) {
3010 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3011 /* No mapping found means: mapping is undefined. */
3012 PyErr_Clear();
3013 x = Py_None;
3014 Py_INCREF(x);
3015 return x;
3016 } else
3017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003019 else if (x == Py_None)
3020 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 else if (PyInt_Check(x)) {
3022 long value = PyInt_AS_LONG(x);
3023 if (value < 0 || value > 255) {
3024 PyErr_SetString(PyExc_TypeError,
3025 "character mapping must be in range(256)");
3026 Py_DECREF(x);
3027 return NULL;
3028 }
3029 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 else if (PyString_Check(x))
3032 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 /* wrong return value */
3035 PyErr_SetString(PyExc_TypeError,
3036 "character mapping must return integer, None or str");
3037 Py_DECREF(x);
3038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
3040}
3041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042/* lookup the character, put the result in the output string and adjust
3043 various state variables. Reallocate the output string if not enough
3044 space is available. Return a new reference to the object that
3045 was put in the output buffer, or Py_None, if the mapping was undefined
3046 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003047 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048static
3049PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003050 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051{
3052 PyObject *rep = charmapencode_lookup(c, mapping);
3053
3054 if (rep==NULL)
3055 return NULL;
3056 else if (rep==Py_None)
3057 return rep;
3058 else {
3059 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 if (outsize<requiredsize) {
3064 /* exponentially overallocate to minimize reallocations */
3065 if (requiredsize < 2*outsize)
3066 requiredsize = 2*outsize;
3067 if (_PyString_Resize(outobj, requiredsize)) {
3068 Py_DECREF(rep);
3069 return NULL;
3070 }
3071 outstart = PyString_AS_STRING(*outobj);
3072 }
3073 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3074 }
3075 else {
3076 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003077 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3078 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 if (outsize<requiredsize) {
3080 /* exponentially overallocate to minimize reallocations */
3081 if (requiredsize < 2*outsize)
3082 requiredsize = 2*outsize;
3083 if (_PyString_Resize(outobj, requiredsize)) {
3084 Py_DECREF(rep);
3085 return NULL;
3086 }
3087 outstart = PyString_AS_STRING(*outobj);
3088 }
3089 memcpy(outstart + *outpos, repchars, repsize);
3090 *outpos += repsize;
3091 }
3092 }
3093 return rep;
3094}
3095
3096/* handle an error in PyUnicode_EncodeCharmap
3097 Return 0 on success, -1 on error */
3098static
3099int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003100 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003102 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003103 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104{
3105 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003106 Py_ssize_t repsize;
3107 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 Py_UNICODE *uni2;
3109 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003110 Py_ssize_t collstartpos = *inpos;
3111 Py_ssize_t collendpos = *inpos+1;
3112 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 char *encoding = "charmap";
3114 char *reason = "character maps to <undefined>";
3115
3116 PyObject *x;
3117 /* find all unencodable characters */
3118 while (collendpos < size) {
3119 x = charmapencode_lookup(p[collendpos], mapping);
3120 if (x==NULL)
3121 return -1;
3122 else if (x!=Py_None) {
3123 Py_DECREF(x);
3124 break;
3125 }
3126 Py_DECREF(x);
3127 ++collendpos;
3128 }
3129 /* cache callback name lookup
3130 * (if not done yet, i.e. it's the first error) */
3131 if (*known_errorHandler==-1) {
3132 if ((errors==NULL) || (!strcmp(errors, "strict")))
3133 *known_errorHandler = 1;
3134 else if (!strcmp(errors, "replace"))
3135 *known_errorHandler = 2;
3136 else if (!strcmp(errors, "ignore"))
3137 *known_errorHandler = 3;
3138 else if (!strcmp(errors, "xmlcharrefreplace"))
3139 *known_errorHandler = 4;
3140 else
3141 *known_errorHandler = 0;
3142 }
3143 switch (*known_errorHandler) {
3144 case 1: /* strict */
3145 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3146 return -1;
3147 case 2: /* replace */
3148 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3149 x = charmapencode_output('?', mapping, res, respos);
3150 if (x==NULL) {
3151 return -1;
3152 }
3153 else if (x==Py_None) {
3154 Py_DECREF(x);
3155 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3156 return -1;
3157 }
3158 Py_DECREF(x);
3159 }
3160 /* fall through */
3161 case 3: /* ignore */
3162 *inpos = collendpos;
3163 break;
3164 case 4: /* xmlcharrefreplace */
3165 /* generate replacement (temporarily (mis)uses p) */
3166 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3167 char buffer[2+29+1+1];
3168 char *cp;
3169 sprintf(buffer, "&#%d;", (int)p[collpos]);
3170 for (cp = buffer; *cp; ++cp) {
3171 x = charmapencode_output(*cp, mapping, res, respos);
3172 if (x==NULL)
3173 return -1;
3174 else if (x==Py_None) {
3175 Py_DECREF(x);
3176 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3177 return -1;
3178 }
3179 Py_DECREF(x);
3180 }
3181 }
3182 *inpos = collendpos;
3183 break;
3184 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003185 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 encoding, reason, p, size, exceptionObject,
3187 collstartpos, collendpos, &newpos);
3188 if (repunicode == NULL)
3189 return -1;
3190 /* generate replacement */
3191 repsize = PyUnicode_GET_SIZE(repunicode);
3192 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3193 x = charmapencode_output(*uni2, mapping, res, respos);
3194 if (x==NULL) {
3195 Py_DECREF(repunicode);
3196 return -1;
3197 }
3198 else if (x==Py_None) {
3199 Py_DECREF(repunicode);
3200 Py_DECREF(x);
3201 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3202 return -1;
3203 }
3204 Py_DECREF(x);
3205 }
3206 *inpos = newpos;
3207 Py_DECREF(repunicode);
3208 }
3209 return 0;
3210}
3211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003213 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 PyObject *mapping,
3215 const char *errors)
3216{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 /* output object */
3218 PyObject *res = NULL;
3219 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003220 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003222 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 PyObject *errorHandler = NULL;
3224 PyObject *exc = NULL;
3225 /* the following variable is used for caching string comparisons
3226 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3227 * 3=ignore, 4=xmlcharrefreplace */
3228 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229
3230 /* Default to Latin-1 */
3231 if (mapping == NULL)
3232 return PyUnicode_EncodeLatin1(p, size, errors);
3233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 /* allocate enough for a simple encoding without
3235 replacements, if we need more, we'll resize */
3236 res = PyString_FromStringAndSize(NULL, size);
3237 if (res == NULL)
3238 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003239 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 while (inpos<size) {
3243 /* try to encode it */
3244 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3245 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 if (x==Py_None) { /* unencodable character */
3248 if (charmap_encoding_error(p, size, &inpos, mapping,
3249 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003250 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003251 &res, &respos)) {
3252 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003253 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 else
3257 /* done with this character => adjust input position */
3258 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 Py_DECREF(x);
3260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 /* Resize if we allocated to much */
3263 if (respos<PyString_GET_SIZE(res)) {
3264 if (_PyString_Resize(&res, respos))
3265 goto onError;
3266 }
3267 Py_XDECREF(exc);
3268 Py_XDECREF(errorHandler);
3269 return res;
3270
3271 onError:
3272 Py_XDECREF(res);
3273 Py_XDECREF(exc);
3274 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 return NULL;
3276}
3277
3278PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3279 PyObject *mapping)
3280{
3281 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3282 PyErr_BadArgument();
3283 return NULL;
3284 }
3285 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3286 PyUnicode_GET_SIZE(unicode),
3287 mapping,
3288 NULL);
3289}
3290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291/* create or adjust a UnicodeTranslateError */
3292static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003293 const Py_UNICODE *unicode, Py_ssize_t size,
3294 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 if (*exceptionObject == NULL) {
3298 *exceptionObject = PyUnicodeTranslateError_Create(
3299 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
3301 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3303 goto onError;
3304 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3305 goto onError;
3306 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3307 goto onError;
3308 return;
3309 onError:
3310 Py_DECREF(*exceptionObject);
3311 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 }
3313}
3314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315/* raises a UnicodeTranslateError */
3316static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 const Py_UNICODE *unicode, Py_ssize_t size,
3318 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 const char *reason)
3320{
3321 make_translate_exception(exceptionObject,
3322 unicode, size, startpos, endpos, reason);
3323 if (*exceptionObject != NULL)
3324 PyCodec_StrictErrors(*exceptionObject);
3325}
3326
3327/* error handling callback helper:
3328 build arguments, call the callback and check the arguments,
3329 put the result into newpos and return the replacement string, which
3330 has to be freed by the caller */
3331static PyObject *unicode_translate_call_errorhandler(const char *errors,
3332 PyObject **errorHandler,
3333 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003334 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3335 Py_ssize_t startpos, Py_ssize_t endpos,
3336 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003338 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339
Martin v. Löwis412fb672006-04-13 06:34:32 +00003340 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 PyObject *restuple;
3342 PyObject *resunicode;
3343
3344 if (*errorHandler == NULL) {
3345 *errorHandler = PyCodec_LookupError(errors);
3346 if (*errorHandler == NULL)
3347 return NULL;
3348 }
3349
3350 make_translate_exception(exceptionObject,
3351 unicode, size, startpos, endpos, reason);
3352 if (*exceptionObject == NULL)
3353 return NULL;
3354
3355 restuple = PyObject_CallFunctionObjArgs(
3356 *errorHandler, *exceptionObject, NULL);
3357 if (restuple == NULL)
3358 return NULL;
3359 if (!PyTuple_Check(restuple)) {
3360 PyErr_Format(PyExc_TypeError, &argparse[4]);
3361 Py_DECREF(restuple);
3362 return NULL;
3363 }
3364 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003365 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 Py_DECREF(restuple);
3367 return NULL;
3368 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003369 if (i_newpos<0)
3370 *newpos = size+i_newpos;
3371 else
3372 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003373 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003374 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003375 Py_DECREF(restuple);
3376 return NULL;
3377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_INCREF(resunicode);
3379 Py_DECREF(restuple);
3380 return resunicode;
3381}
3382
3383/* Lookup the character ch in the mapping and put the result in result,
3384 which must be decrefed by the caller.
3385 Return 0 on success, -1 on error */
3386static
3387int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3388{
3389 PyObject *w = PyInt_FromLong((long)c);
3390 PyObject *x;
3391
3392 if (w == NULL)
3393 return -1;
3394 x = PyObject_GetItem(mapping, w);
3395 Py_DECREF(w);
3396 if (x == NULL) {
3397 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3398 /* No mapping found means: use 1:1 mapping. */
3399 PyErr_Clear();
3400 *result = NULL;
3401 return 0;
3402 } else
3403 return -1;
3404 }
3405 else if (x == Py_None) {
3406 *result = x;
3407 return 0;
3408 }
3409 else if (PyInt_Check(x)) {
3410 long value = PyInt_AS_LONG(x);
3411 long max = PyUnicode_GetMax();
3412 if (value < 0 || value > max) {
3413 PyErr_Format(PyExc_TypeError,
3414 "character mapping must be in range(0x%lx)", max+1);
3415 Py_DECREF(x);
3416 return -1;
3417 }
3418 *result = x;
3419 return 0;
3420 }
3421 else if (PyUnicode_Check(x)) {
3422 *result = x;
3423 return 0;
3424 }
3425 else {
3426 /* wrong return value */
3427 PyErr_SetString(PyExc_TypeError,
3428 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003429 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 return -1;
3431 }
3432}
3433/* ensure that *outobj is at least requiredsize characters long,
3434if not reallocate and adjust various state variables.
3435Return 0 on success, -1 on error */
3436static
Walter Dörwald4894c302003-10-24 14:25:28 +00003437int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003438 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003440 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003441 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003443 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003445 if (requiredsize < 2 * oldsize)
3446 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003447 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 return -1;
3449 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 }
3451 return 0;
3452}
3453/* lookup the character, put the result in the output string and adjust
3454 various state variables. Return a new reference to the object that
3455 was put in the output buffer in *result, or Py_None, if the mapping was
3456 undefined (in which case no character was written).
3457 The called must decref result.
3458 Return 0 on success, -1 on error. */
3459static
Walter Dörwald4894c302003-10-24 14:25:28 +00003460int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003461 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003462 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463{
Walter Dörwald4894c302003-10-24 14:25:28 +00003464 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 return -1;
3466 if (*res==NULL) {
3467 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003468 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 }
3470 else if (*res==Py_None)
3471 ;
3472 else if (PyInt_Check(*res)) {
3473 /* no overflow check, because we know that the space is enough */
3474 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3475 }
3476 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003477 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 if (repsize==1) {
3479 /* no overflow check, because we know that the space is enough */
3480 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3481 }
3482 else if (repsize!=0) {
3483 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003484 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003485 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003486 repsize - 1;
3487 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 return -1;
3489 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3490 *outp += repsize;
3491 }
3492 }
3493 else
3494 return -1;
3495 return 0;
3496}
3497
3498PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003499 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 PyObject *mapping,
3501 const char *errors)
3502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 /* output object */
3504 PyObject *res = NULL;
3505 /* pointers to the beginning and end+1 of input */
3506 const Py_UNICODE *startp = p;
3507 const Py_UNICODE *endp = p + size;
3508 /* pointer into the output */
3509 Py_UNICODE *str;
3510 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 char *reason = "character maps to <undefined>";
3513 PyObject *errorHandler = NULL;
3514 PyObject *exc = NULL;
3515 /* the following variable is used for caching string comparisons
3516 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3517 * 3=ignore, 4=xmlcharrefreplace */
3518 int known_errorHandler = -1;
3519
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 if (mapping == NULL) {
3521 PyErr_BadArgument();
3522 return NULL;
3523 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524
3525 /* allocate enough for a simple 1:1 translation without
3526 replacements, if we need more, we'll resize */
3527 res = PyUnicode_FromUnicode(NULL, size);
3528 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003529 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 return res;
3532 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 while (p<endp) {
3535 /* try to encode it */
3536 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003537 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 goto onError;
3540 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003541 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (x!=Py_None) /* it worked => adjust input pointer */
3543 ++p;
3544 else { /* untranslatable character */
3545 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t repsize;
3547 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 Py_UNICODE *uni2;
3549 /* startpos for collecting untranslatable chars */
3550 const Py_UNICODE *collstart = p;
3551 const Py_UNICODE *collend = p+1;
3552 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 /* find all untranslatable characters */
3555 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003556 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 goto onError;
3558 Py_XDECREF(x);
3559 if (x!=Py_None)
3560 break;
3561 ++collend;
3562 }
3563 /* cache callback name lookup
3564 * (if not done yet, i.e. it's the first error) */
3565 if (known_errorHandler==-1) {
3566 if ((errors==NULL) || (!strcmp(errors, "strict")))
3567 known_errorHandler = 1;
3568 else if (!strcmp(errors, "replace"))
3569 known_errorHandler = 2;
3570 else if (!strcmp(errors, "ignore"))
3571 known_errorHandler = 3;
3572 else if (!strcmp(errors, "xmlcharrefreplace"))
3573 known_errorHandler = 4;
3574 else
3575 known_errorHandler = 0;
3576 }
3577 switch (known_errorHandler) {
3578 case 1: /* strict */
3579 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3580 goto onError;
3581 case 2: /* replace */
3582 /* No need to check for space, this is a 1:1 replacement */
3583 for (coll = collstart; coll<collend; ++coll)
3584 *str++ = '?';
3585 /* fall through */
3586 case 3: /* ignore */
3587 p = collend;
3588 break;
3589 case 4: /* xmlcharrefreplace */
3590 /* generate replacement (temporarily (mis)uses p) */
3591 for (p = collstart; p < collend; ++p) {
3592 char buffer[2+29+1+1];
3593 char *cp;
3594 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003595 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3597 goto onError;
3598 for (cp = buffer; *cp; ++cp)
3599 *str++ = *cp;
3600 }
3601 p = collend;
3602 break;
3603 default:
3604 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3605 reason, startp, size, &exc,
3606 collstart-startp, collend-startp, &newpos);
3607 if (repunicode == NULL)
3608 goto onError;
3609 /* generate replacement */
3610 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003611 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3613 Py_DECREF(repunicode);
3614 goto onError;
3615 }
3616 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3617 *str++ = *uni2;
3618 p = startp + newpos;
3619 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 }
3621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 /* Resize if we allocated to much */
3624 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003625 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003626 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003627 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 }
3629 Py_XDECREF(exc);
3630 Py_XDECREF(errorHandler);
3631 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 onError:
3634 Py_XDECREF(res);
3635 Py_XDECREF(exc);
3636 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 return NULL;
3638}
3639
3640PyObject *PyUnicode_Translate(PyObject *str,
3641 PyObject *mapping,
3642 const char *errors)
3643{
3644 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 str = PyUnicode_FromObject(str);
3647 if (str == NULL)
3648 goto onError;
3649 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3650 PyUnicode_GET_SIZE(str),
3651 mapping,
3652 errors);
3653 Py_DECREF(str);
3654 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 onError:
3657 Py_XDECREF(str);
3658 return NULL;
3659}
Tim Petersced69f82003-09-16 20:30:58 +00003660
Guido van Rossum9e896b32000-04-05 20:11:21 +00003661/* --- Decimal Encoder ---------------------------------------------------- */
3662
3663int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003664 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003665 char *output,
3666 const char *errors)
3667{
3668 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 PyObject *errorHandler = NULL;
3670 PyObject *exc = NULL;
3671 const char *encoding = "decimal";
3672 const char *reason = "invalid decimal Unicode string";
3673 /* the following variable is used for caching string comparisons
3674 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3675 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003676
3677 if (output == NULL) {
3678 PyErr_BadArgument();
3679 return -1;
3680 }
3681
3682 p = s;
3683 end = s + length;
3684 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003686 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003688 Py_ssize_t repsize;
3689 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_UNICODE *uni2;
3691 Py_UNICODE *collstart;
3692 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003693
Guido van Rossum9e896b32000-04-05 20:11:21 +00003694 if (Py_UNICODE_ISSPACE(ch)) {
3695 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003697 continue;
3698 }
3699 decimal = Py_UNICODE_TODECIMAL(ch);
3700 if (decimal >= 0) {
3701 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003703 continue;
3704 }
Guido van Rossumba477042000-04-06 18:18:10 +00003705 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003706 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003708 continue;
3709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 /* All other characters are considered unencodable */
3711 collstart = p;
3712 collend = p+1;
3713 while (collend < end) {
3714 if ((0 < *collend && *collend < 256) ||
3715 !Py_UNICODE_ISSPACE(*collend) ||
3716 Py_UNICODE_TODECIMAL(*collend))
3717 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* cache callback name lookup
3720 * (if not done yet, i.e. it's the first error) */
3721 if (known_errorHandler==-1) {
3722 if ((errors==NULL) || (!strcmp(errors, "strict")))
3723 known_errorHandler = 1;
3724 else if (!strcmp(errors, "replace"))
3725 known_errorHandler = 2;
3726 else if (!strcmp(errors, "ignore"))
3727 known_errorHandler = 3;
3728 else if (!strcmp(errors, "xmlcharrefreplace"))
3729 known_errorHandler = 4;
3730 else
3731 known_errorHandler = 0;
3732 }
3733 switch (known_errorHandler) {
3734 case 1: /* strict */
3735 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3736 goto onError;
3737 case 2: /* replace */
3738 for (p = collstart; p < collend; ++p)
3739 *output++ = '?';
3740 /* fall through */
3741 case 3: /* ignore */
3742 p = collend;
3743 break;
3744 case 4: /* xmlcharrefreplace */
3745 /* generate replacement (temporarily (mis)uses p) */
3746 for (p = collstart; p < collend; ++p)
3747 output += sprintf(output, "&#%d;", (int)*p);
3748 p = collend;
3749 break;
3750 default:
3751 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3752 encoding, reason, s, length, &exc,
3753 collstart-s, collend-s, &newpos);
3754 if (repunicode == NULL)
3755 goto onError;
3756 /* generate replacement */
3757 repsize = PyUnicode_GET_SIZE(repunicode);
3758 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3759 Py_UNICODE ch = *uni2;
3760 if (Py_UNICODE_ISSPACE(ch))
3761 *output++ = ' ';
3762 else {
3763 decimal = Py_UNICODE_TODECIMAL(ch);
3764 if (decimal >= 0)
3765 *output++ = '0' + decimal;
3766 else if (0 < ch && ch < 256)
3767 *output++ = (char)ch;
3768 else {
3769 Py_DECREF(repunicode);
3770 raise_encode_exception(&exc, encoding,
3771 s, length, collstart-s, collend-s, reason);
3772 goto onError;
3773 }
3774 }
3775 }
3776 p = s + newpos;
3777 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003778 }
3779 }
3780 /* 0-terminate the output string */
3781 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 Py_XDECREF(exc);
3783 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003784 return 0;
3785
3786 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 Py_XDECREF(exc);
3788 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003789 return -1;
3790}
3791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792/* --- Helpers ------------------------------------------------------------ */
3793
Tim Petersced69f82003-09-16 20:30:58 +00003794static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003795Py_ssize_t count(PyUnicodeObject *self,
3796 Py_ssize_t start,
3797 Py_ssize_t end,
3798 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003800 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003802 if (start < 0)
3803 start += self->length;
3804 if (start < 0)
3805 start = 0;
3806 if (end > self->length)
3807 end = self->length;
3808 if (end < 0)
3809 end += self->length;
3810 if (end < 0)
3811 end = 0;
3812
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003813 if (substring->length == 0)
3814 return (end - start + 1);
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 end -= substring->length;
3817
3818 while (start <= end)
3819 if (Py_UNICODE_MATCH(self, start, substring)) {
3820 count++;
3821 start += substring->length;
3822 } else
3823 start++;
3824
3825 return count;
3826}
3827
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 Py_ssize_t start,
3831 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003833 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 str = PyUnicode_FromObject(str);
3836 if (str == NULL)
3837 return -1;
3838 substr = PyUnicode_FromObject(substr);
3839 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003840 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 return -1;
3842 }
Tim Petersced69f82003-09-16 20:30:58 +00003843
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 result = count((PyUnicodeObject *)str,
3845 start, end,
3846 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003847
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848 Py_DECREF(str);
3849 Py_DECREF(substr);
3850 return result;
3851}
3852
Tim Petersced69f82003-09-16 20:30:58 +00003853static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003856 Py_ssize_t start,
3857 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 int direction)
3859{
3860 if (start < 0)
3861 start += self->length;
3862 if (start < 0)
3863 start = 0;
3864
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 if (end > self->length)
3866 end = self->length;
3867 if (end < 0)
3868 end += self->length;
3869 if (end < 0)
3870 end = 0;
3871
Guido van Rossum76afbd92002-08-20 17:29:29 +00003872 if (substring->length == 0)
3873 return (direction > 0) ? start : end;
3874
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875 end -= substring->length;
3876
3877 if (direction < 0) {
3878 for (; end >= start; end--)
3879 if (Py_UNICODE_MATCH(self, end, substring))
3880 return end;
3881 } else {
3882 for (; start <= end; start++)
3883 if (Py_UNICODE_MATCH(self, start, substring))
3884 return start;
3885 }
3886
3887 return -1;
3888}
3889
Martin v. Löwis18e16552006-02-15 17:27:45 +00003890Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003892 Py_ssize_t start,
3893 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 int direction)
3895{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003897
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 str = PyUnicode_FromObject(str);
3899 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003900 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 substr = PyUnicode_FromObject(substr);
3902 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003903 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003904 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 }
Tim Petersced69f82003-09-16 20:30:58 +00003906
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 result = findstring((PyUnicodeObject *)str,
3908 (PyUnicodeObject *)substr,
3909 start, end, direction);
3910 Py_DECREF(str);
3911 Py_DECREF(substr);
3912 return result;
3913}
3914
Tim Petersced69f82003-09-16 20:30:58 +00003915static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916int tailmatch(PyUnicodeObject *self,
3917 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003918 Py_ssize_t start,
3919 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 int direction)
3921{
3922 if (start < 0)
3923 start += self->length;
3924 if (start < 0)
3925 start = 0;
3926
3927 if (substring->length == 0)
3928 return 1;
3929
3930 if (end > self->length)
3931 end = self->length;
3932 if (end < 0)
3933 end += self->length;
3934 if (end < 0)
3935 end = 0;
3936
3937 end -= substring->length;
3938 if (end < start)
3939 return 0;
3940
3941 if (direction > 0) {
3942 if (Py_UNICODE_MATCH(self, end, substring))
3943 return 1;
3944 } else {
3945 if (Py_UNICODE_MATCH(self, start, substring))
3946 return 1;
3947 }
3948
3949 return 0;
3950}
3951
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t start,
3955 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 int direction)
3957{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003958 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003959
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 str = PyUnicode_FromObject(str);
3961 if (str == NULL)
3962 return -1;
3963 substr = PyUnicode_FromObject(substr);
3964 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003965 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 return -1;
3967 }
Tim Petersced69f82003-09-16 20:30:58 +00003968
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 result = tailmatch((PyUnicodeObject *)str,
3970 (PyUnicodeObject *)substr,
3971 start, end, direction);
3972 Py_DECREF(str);
3973 Py_DECREF(substr);
3974 return result;
3975}
3976
Tim Petersced69f82003-09-16 20:30:58 +00003977static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003979 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 Py_UNICODE ch)
3981{
3982 /* like wcschr, but doesn't stop at NULL characters */
3983
3984 while (size-- > 0) {
3985 if (*s == ch)
3986 return s;
3987 s++;
3988 }
3989
3990 return NULL;
3991}
3992
3993/* Apply fixfct filter to the Unicode object self and return a
3994 reference to the modified object */
3995
Tim Petersced69f82003-09-16 20:30:58 +00003996static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997PyObject *fixup(PyUnicodeObject *self,
3998 int (*fixfct)(PyUnicodeObject *s))
3999{
4000
4001 PyUnicodeObject *u;
4002
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004003 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 if (u == NULL)
4005 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004006
4007 Py_UNICODE_COPY(u->str, self->str, self->length);
4008
Tim Peters7a29bd52001-09-12 03:03:31 +00004009 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 /* fixfct should return TRUE if it modified the buffer. If
4011 FALSE, return a reference to the original buffer instead
4012 (to save space, not time) */
4013 Py_INCREF(self);
4014 Py_DECREF(u);
4015 return (PyObject*) self;
4016 }
4017 return (PyObject*) u;
4018}
4019
Tim Petersced69f82003-09-16 20:30:58 +00004020static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021int fixupper(PyUnicodeObject *self)
4022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004023 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 Py_UNICODE *s = self->str;
4025 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004026
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 while (len-- > 0) {
4028 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004029
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 ch = Py_UNICODE_TOUPPER(*s);
4031 if (ch != *s) {
4032 status = 1;
4033 *s = ch;
4034 }
4035 s++;
4036 }
4037
4038 return status;
4039}
4040
Tim Petersced69f82003-09-16 20:30:58 +00004041static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042int fixlower(PyUnicodeObject *self)
4043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 Py_UNICODE *s = self->str;
4046 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004047
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 while (len-- > 0) {
4049 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 ch = Py_UNICODE_TOLOWER(*s);
4052 if (ch != *s) {
4053 status = 1;
4054 *s = ch;
4055 }
4056 s++;
4057 }
4058
4059 return status;
4060}
4061
Tim Petersced69f82003-09-16 20:30:58 +00004062static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063int fixswapcase(PyUnicodeObject *self)
4064{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 Py_UNICODE *s = self->str;
4067 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004068
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 while (len-- > 0) {
4070 if (Py_UNICODE_ISUPPER(*s)) {
4071 *s = Py_UNICODE_TOLOWER(*s);
4072 status = 1;
4073 } else if (Py_UNICODE_ISLOWER(*s)) {
4074 *s = Py_UNICODE_TOUPPER(*s);
4075 status = 1;
4076 }
4077 s++;
4078 }
4079
4080 return status;
4081}
4082
Tim Petersced69f82003-09-16 20:30:58 +00004083static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084int fixcapitalize(PyUnicodeObject *self)
4085{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004086 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004087 Py_UNICODE *s = self->str;
4088 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004089
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004090 if (len == 0)
4091 return 0;
4092 if (Py_UNICODE_ISLOWER(*s)) {
4093 *s = Py_UNICODE_TOUPPER(*s);
4094 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004096 s++;
4097 while (--len > 0) {
4098 if (Py_UNICODE_ISUPPER(*s)) {
4099 *s = Py_UNICODE_TOLOWER(*s);
4100 status = 1;
4101 }
4102 s++;
4103 }
4104 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105}
4106
4107static
4108int fixtitle(PyUnicodeObject *self)
4109{
4110 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4111 register Py_UNICODE *e;
4112 int previous_is_cased;
4113
4114 /* Shortcut for single character strings */
4115 if (PyUnicode_GET_SIZE(self) == 1) {
4116 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4117 if (*p != ch) {
4118 *p = ch;
4119 return 1;
4120 }
4121 else
4122 return 0;
4123 }
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 e = p + PyUnicode_GET_SIZE(self);
4126 previous_is_cased = 0;
4127 for (; p < e; p++) {
4128 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 if (previous_is_cased)
4131 *p = Py_UNICODE_TOLOWER(ch);
4132 else
4133 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004134
4135 if (Py_UNICODE_ISLOWER(ch) ||
4136 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 Py_UNICODE_ISTITLE(ch))
4138 previous_is_cased = 1;
4139 else
4140 previous_is_cased = 0;
4141 }
4142 return 1;
4143}
4144
Tim Peters8ce9f162004-08-27 01:49:32 +00004145PyObject *
4146PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147{
Tim Peters8ce9f162004-08-27 01:49:32 +00004148 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004149 const Py_UNICODE blank = ' ';
4150 const Py_UNICODE *sep = &blank;
4151 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004152 PyUnicodeObject *res = NULL; /* the result */
4153 size_t res_alloc = 100; /* # allocated bytes for string in res */
4154 size_t res_used; /* # used bytes */
4155 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4156 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004158 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004159 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160
Tim Peters05eba1f2004-08-27 21:32:02 +00004161 fseq = PySequence_Fast(seq, "");
4162 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004163 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004164 }
4165
Tim Peters91879ab2004-08-27 22:35:44 +00004166 /* Grrrr. A codec may be invoked to convert str objects to
4167 * Unicode, and so it's possible to call back into Python code
4168 * during PyUnicode_FromObject(), and so it's possible for a sick
4169 * codec to change the size of fseq (if seq is a list). Therefore
4170 * we have to keep refetching the size -- can't assume seqlen
4171 * is invariant.
4172 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004173 seqlen = PySequence_Fast_GET_SIZE(fseq);
4174 /* If empty sequence, return u"". */
4175 if (seqlen == 0) {
4176 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4177 goto Done;
4178 }
4179 /* If singleton sequence with an exact Unicode, return that. */
4180 if (seqlen == 1) {
4181 item = PySequence_Fast_GET_ITEM(fseq, 0);
4182 if (PyUnicode_CheckExact(item)) {
4183 Py_INCREF(item);
4184 res = (PyUnicodeObject *)item;
4185 goto Done;
4186 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004187 }
4188
Tim Peters05eba1f2004-08-27 21:32:02 +00004189 /* At least two items to join, or one that isn't exact Unicode. */
4190 if (seqlen > 1) {
4191 /* Set up sep and seplen -- they're needed. */
4192 if (separator == NULL) {
4193 sep = &blank;
4194 seplen = 1;
4195 }
4196 else {
4197 internal_separator = PyUnicode_FromObject(separator);
4198 if (internal_separator == NULL)
4199 goto onError;
4200 sep = PyUnicode_AS_UNICODE(internal_separator);
4201 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004202 /* In case PyUnicode_FromObject() mutated seq. */
4203 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004204 }
4205 }
4206
4207 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004208 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004209 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004210 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004211 res_p = PyUnicode_AS_UNICODE(res);
4212 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004213
Tim Peters05eba1f2004-08-27 21:32:02 +00004214 for (i = 0; i < seqlen; ++i) {
4215 size_t itemlen;
4216 size_t new_res_used;
4217
4218 item = PySequence_Fast_GET_ITEM(fseq, i);
4219 /* Convert item to Unicode. */
4220 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4221 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004222 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004223 " %.80s found",
4224 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004225 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004226 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004227 item = PyUnicode_FromObject(item);
4228 if (item == NULL)
4229 goto onError;
4230 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004231
Tim Peters91879ab2004-08-27 22:35:44 +00004232 /* In case PyUnicode_FromObject() mutated seq. */
4233 seqlen = PySequence_Fast_GET_SIZE(fseq);
4234
Tim Peters8ce9f162004-08-27 01:49:32 +00004235 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004237 new_res_used = res_used + itemlen;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004238 if (new_res_used < res_used || new_res_used > PY_SSIZE_T_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004239 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004240 if (i < seqlen - 1) {
4241 new_res_used += seplen;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004242 if (new_res_used < res_used || new_res_used > PY_SSIZE_T_MAX)
Tim Peters05eba1f2004-08-27 21:32:02 +00004243 goto Overflow;
4244 }
4245 if (new_res_used > res_alloc) {
4246 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004247 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004248 size_t oldsize = res_alloc;
4249 res_alloc += res_alloc;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004250 if (res_alloc < oldsize || res_alloc > PY_SSIZE_T_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004251 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004252 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004253 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004254 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004256 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004257 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004259
4260 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004261 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004262 res_p += itemlen;
4263 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004264 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004265 res_p += seplen;
4266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004268 res_used = new_res_used;
4269 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004270
Tim Peters05eba1f2004-08-27 21:32:02 +00004271 /* Shrink res to match the used area; this probably can't fail,
4272 * but it's cheap to check.
4273 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004274 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004275 goto onError;
4276
4277 Done:
4278 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004279 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 return (PyObject *)res;
4281
Tim Peters8ce9f162004-08-27 01:49:32 +00004282 Overflow:
4283 PyErr_SetString(PyExc_OverflowError,
4284 "join() is too long for a Python string");
4285 Py_DECREF(item);
4286 /* fall through */
4287
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004289 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004290 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004291 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 return NULL;
4293}
4294
Tim Petersced69f82003-09-16 20:30:58 +00004295static
4296PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004297 Py_ssize_t left,
4298 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 Py_UNICODE fill)
4300{
4301 PyUnicodeObject *u;
4302
4303 if (left < 0)
4304 left = 0;
4305 if (right < 0)
4306 right = 0;
4307
Tim Peters7a29bd52001-09-12 03:03:31 +00004308 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 Py_INCREF(self);
4310 return self;
4311 }
4312
4313 u = _PyUnicode_New(left + self->length + right);
4314 if (u) {
4315 if (left)
4316 Py_UNICODE_FILL(u->str, fill, left);
4317 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4318 if (right)
4319 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4320 }
4321
4322 return u;
4323}
4324
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion uses three names from the enclosing function: a
   `PyObject *str` scratch variable, the `list` being built, and an
   `onError` label that is jumped to when creating or appending the
   object fails.  The temporary reference held in str is released in
   every case.

   Wrapped in do { } while (0) so that the macro behaves as a single
   statement (safe under an unbraced if/else); invoke it with a
   trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4335
/* Insert the slice data[left:right] at the front (index 0) of `list`
   as a new Unicode object.

   The expansion uses three names from the enclosing function: a
   `PyObject *str` scratch variable, the `list` being built, and an
   `onError` label that is jumped to when creating or inserting the
   object fails.  The temporary reference held in str is released in
   every case.

   Wrapped in do { } while (0) so that the macro behaves as a single
   statement (safe under an unbraced if/else); invoke it with a
   trailing semicolon. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4346
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347static
4348PyObject *split_whitespace(PyUnicodeObject *self,
4349 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004352 register Py_ssize_t i;
4353 register Py_ssize_t j;
4354 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 PyObject *str;
4356
4357 for (i = j = 0; i < len; ) {
4358 /* find a token */
4359 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4360 i++;
4361 j = i;
4362 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4363 i++;
4364 if (j < i) {
4365 if (maxcount-- <= 0)
4366 break;
4367 SPLIT_APPEND(self->str, j, i);
4368 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4369 i++;
4370 j = i;
4371 }
4372 }
4373 if (j < len) {
4374 SPLIT_APPEND(self->str, j, len);
4375 }
4376 return list;
4377
4378 onError:
4379 Py_DECREF(list);
4380 return NULL;
4381}
4382
4383PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004384 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004386 register Py_ssize_t i;
4387 register Py_ssize_t j;
4388 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389 PyObject *list;
4390 PyObject *str;
4391 Py_UNICODE *data;
4392
4393 string = PyUnicode_FromObject(string);
4394 if (string == NULL)
4395 return NULL;
4396 data = PyUnicode_AS_UNICODE(string);
4397 len = PyUnicode_GET_SIZE(string);
4398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 list = PyList_New(0);
4400 if (!list)
4401 goto onError;
4402
4403 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004404 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004405
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406 /* Find a line and append it */
4407 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4408 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409
4410 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004411 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 if (i < len) {
4413 if (data[i] == '\r' && i + 1 < len &&
4414 data[i+1] == '\n')
4415 i += 2;
4416 else
4417 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004418 if (keepends)
4419 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 }
Guido van Rossum86662912000-04-11 15:38:46 +00004421 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 j = i;
4423 }
4424 if (j < len) {
4425 SPLIT_APPEND(data, j, len);
4426 }
4427
4428 Py_DECREF(string);
4429 return list;
4430
4431 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004432 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 Py_DECREF(string);
4434 return NULL;
4435}
4436
Tim Petersced69f82003-09-16 20:30:58 +00004437static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438PyObject *split_char(PyUnicodeObject *self,
4439 PyObject *list,
4440 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004441 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004443 register Py_ssize_t i;
4444 register Py_ssize_t j;
4445 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 PyObject *str;
4447
4448 for (i = j = 0; i < len; ) {
4449 if (self->str[i] == ch) {
4450 if (maxcount-- <= 0)
4451 break;
4452 SPLIT_APPEND(self->str, j, i);
4453 i = j = i + 1;
4454 } else
4455 i++;
4456 }
4457 if (j <= len) {
4458 SPLIT_APPEND(self->str, j, len);
4459 }
4460 return list;
4461
4462 onError:
4463 Py_DECREF(list);
4464 return NULL;
4465}
4466
Tim Petersced69f82003-09-16 20:30:58 +00004467static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468PyObject *split_substring(PyUnicodeObject *self,
4469 PyObject *list,
4470 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004471 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004473 register Py_ssize_t i;
4474 register Py_ssize_t j;
4475 Py_ssize_t len = self->length;
4476 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 PyObject *str;
4478
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004479 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 if (Py_UNICODE_MATCH(self, i, substring)) {
4481 if (maxcount-- <= 0)
4482 break;
4483 SPLIT_APPEND(self->str, j, i);
4484 i = j = i + sublen;
4485 } else
4486 i++;
4487 }
4488 if (j <= len) {
4489 SPLIT_APPEND(self->str, j, len);
4490 }
4491 return list;
4492
4493 onError:
4494 Py_DECREF(list);
4495 return NULL;
4496}
4497
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004498static
4499PyObject *rsplit_whitespace(PyUnicodeObject *self,
4500 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004501 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 register Py_ssize_t i;
4504 register Py_ssize_t j;
4505 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004506 PyObject *str;
4507
4508 for (i = j = len - 1; i >= 0; ) {
4509 /* find a token */
4510 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4511 i--;
4512 j = i;
4513 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4514 i--;
4515 if (j > i) {
4516 if (maxcount-- <= 0)
4517 break;
4518 SPLIT_INSERT(self->str, i + 1, j + 1);
4519 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4520 i--;
4521 j = i;
4522 }
4523 }
4524 if (j >= 0) {
4525 SPLIT_INSERT(self->str, 0, j + 1);
4526 }
4527 return list;
4528
4529 onError:
4530 Py_DECREF(list);
4531 return NULL;
4532}
4533
4534static
4535PyObject *rsplit_char(PyUnicodeObject *self,
4536 PyObject *list,
4537 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004538 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004539{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004540 register Py_ssize_t i;
4541 register Py_ssize_t j;
4542 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004543 PyObject *str;
4544
4545 for (i = j = len - 1; i >= 0; ) {
4546 if (self->str[i] == ch) {
4547 if (maxcount-- <= 0)
4548 break;
4549 SPLIT_INSERT(self->str, i + 1, j + 1);
4550 j = i = i - 1;
4551 } else
4552 i--;
4553 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004554 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004555 SPLIT_INSERT(self->str, 0, j + 1);
4556 }
4557 return list;
4558
4559 onError:
4560 Py_DECREF(list);
4561 return NULL;
4562}
4563
4564static
4565PyObject *rsplit_substring(PyUnicodeObject *self,
4566 PyObject *list,
4567 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004568 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004569{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004570 register Py_ssize_t i;
4571 register Py_ssize_t j;
4572 Py_ssize_t len = self->length;
4573 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004574 PyObject *str;
4575
4576 for (i = len - sublen, j = len; i >= 0; ) {
4577 if (Py_UNICODE_MATCH(self, i, substring)) {
4578 if (maxcount-- <= 0)
4579 break;
4580 SPLIT_INSERT(self->str, i + sublen, j);
4581 j = i;
4582 i -= sublen;
4583 } else
4584 i--;
4585 }
4586 if (j >= 0) {
4587 SPLIT_INSERT(self->str, 0, j);
4588 }
4589 return list;
4590
4591 onError:
4592 Py_DECREF(list);
4593 return NULL;
4594}
4595
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004597#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598
4599static
4600PyObject *split(PyUnicodeObject *self,
4601 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004602 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603{
4604 PyObject *list;
4605
4606 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004607 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608
4609 list = PyList_New(0);
4610 if (!list)
4611 return NULL;
4612
4613 if (substring == NULL)
4614 return split_whitespace(self,list,maxcount);
4615
4616 else if (substring->length == 1)
4617 return split_char(self,list,substring->str[0],maxcount);
4618
4619 else if (substring->length == 0) {
4620 Py_DECREF(list);
4621 PyErr_SetString(PyExc_ValueError, "empty separator");
4622 return NULL;
4623 }
4624 else
4625 return split_substring(self,list,substring,maxcount);
4626}
4627
Tim Petersced69f82003-09-16 20:30:58 +00004628static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004629PyObject *rsplit(PyUnicodeObject *self,
4630 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004631 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004632{
4633 PyObject *list;
4634
4635 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004636 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004637
4638 list = PyList_New(0);
4639 if (!list)
4640 return NULL;
4641
4642 if (substring == NULL)
4643 return rsplit_whitespace(self,list,maxcount);
4644
4645 else if (substring->length == 1)
4646 return rsplit_char(self,list,substring->str[0],maxcount);
4647
4648 else if (substring->length == 0) {
4649 Py_DECREF(list);
4650 PyErr_SetString(PyExc_ValueError, "empty separator");
4651 return NULL;
4652 }
4653 else
4654 return rsplit_substring(self,list,substring,maxcount);
4655}
4656
4657static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658PyObject *replace(PyUnicodeObject *self,
4659 PyUnicodeObject *str1,
4660 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004661 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662{
4663 PyUnicodeObject *u;
4664
4665 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004666 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667
4668 if (str1->length == 1 && str2->length == 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004669 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670
4671 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004672 if (!findchar(self->str, self->length, str1->str[0]) &&
4673 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 /* nothing to replace, return original string */
4675 Py_INCREF(self);
4676 u = self;
4677 } else {
4678 Py_UNICODE u1 = str1->str[0];
4679 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004680
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004682 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 self->length
4684 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004685 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004686 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004687 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 for (i = 0; i < u->length; i++)
4689 if (u->str[i] == u1) {
4690 if (--maxcount < 0)
4691 break;
4692 u->str[i] = u2;
4693 }
4694 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696
4697 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004698 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699 Py_UNICODE *p;
4700
4701 /* replace strings */
4702 n = count(self, 0, self->length, str1);
4703 if (n > maxcount)
4704 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004705 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004707 if (PyUnicode_CheckExact(self)) {
4708 Py_INCREF(self);
4709 u = self;
4710 }
4711 else {
4712 u = (PyUnicodeObject *)
4713 PyUnicode_FromUnicode(self->str, self->length);
4714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 } else {
4716 u = _PyUnicode_New(
4717 self->length + n * (str2->length - str1->length));
4718 if (u) {
4719 i = 0;
4720 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004721 if (str1->length > 0) {
4722 while (i <= self->length - str1->length)
4723 if (Py_UNICODE_MATCH(self, i, str1)) {
4724 /* replace string segment */
4725 Py_UNICODE_COPY(p, str2->str, str2->length);
4726 p += str2->length;
4727 i += str1->length;
4728 if (--n <= 0) {
4729 /* copy remaining part */
4730 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4731 break;
4732 }
4733 } else
4734 *p++ = self->str[i++];
4735 } else {
4736 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 Py_UNICODE_COPY(p, str2->str, str2->length);
4738 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004739 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004742 }
4743 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 }
4746 }
4747 }
Tim Petersced69f82003-09-16 20:30:58 +00004748
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 return (PyObject *) u;
4750}
4751
4752/* --- Unicode Object Methods --------------------------------------------- */
4753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004754PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755"S.title() -> unicode\n\
4756\n\
4757Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004758characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759
4760static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004761unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 return fixup(self, fixtitle);
4764}
4765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004766PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767"S.capitalize() -> unicode\n\
4768\n\
4769Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004770have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771
4772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004773unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 return fixup(self, fixcapitalize);
4776}
4777
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every piece in place in
   the list, then join the pieces again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL)
            goto done;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4815
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004816/* Argument converter. Coerces to a single unicode character */
4817
4818static int
4819convert_uc(PyObject *obj, void *addr)
4820{
4821 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4822 PyObject *uniobj;
4823 Py_UNICODE *unistr;
4824
4825 uniobj = PyUnicode_FromObject(obj);
4826 if (uniobj == NULL) {
4827 PyErr_SetString(PyExc_TypeError,
4828 "The fill character cannot be converted to Unicode");
4829 return 0;
4830 }
4831 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4832 PyErr_SetString(PyExc_TypeError,
4833 "The fill character must be exactly one character long");
4834 Py_DECREF(uniobj);
4835 return 0;
4836 }
4837 unistr = PyUnicode_AS_UNICODE(uniobj);
4838 *fillcharloc = unistr[0];
4839 Py_DECREF(uniobj);
4840 return 1;
4841}
4842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004843PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004844"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004846Return S centered in a Unicode string of length width. Padding is\n\
4847done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848
4849static PyObject *
4850unicode_center(PyUnicodeObject *self, PyObject *args)
4851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t marg, left;
4853 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004854 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855
Thomas Woutersde017742006-02-16 19:34:37 +00004856 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 return NULL;
4858
Tim Peters7a29bd52001-09-12 03:03:31 +00004859 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 Py_INCREF(self);
4861 return (PyObject*) self;
4862 }
4863
4864 marg = width - self->length;
4865 left = marg / 2 + (marg & width & 1);
4866
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004867 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868}
4869
Marc-André Lemburge5034372000-08-08 08:04:29 +00004870#if 0
4871
4872/* This code should go into some future Unicode collation support
4873 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004874 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004875
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004876/* speedy UTF-16 code point order comparison */
4877/* gleaned from: */
4878/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4879
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004880static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004881{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004882 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004883 0, 0, 0, 0, 0, 0, 0, 0,
4884 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004885 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004886};
4887
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888static int
4889unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4890{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004891 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004892
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 Py_UNICODE *s1 = str1->str;
4894 Py_UNICODE *s2 = str2->str;
4895
4896 len1 = str1->length;
4897 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004898
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004900 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004901
4902 c1 = *s1++;
4903 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004904
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004905 if (c1 > (1<<11) * 26)
4906 c1 += utf16Fixup[c1>>11];
4907 if (c2 > (1<<11) * 26)
4908 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004909 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004910
4911 if (c1 != c2)
4912 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004913
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004914 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 }
4916
4917 return (len1 < len2) ? -1 : (len1 != len2);
4918}
4919
Marc-André Lemburge5034372000-08-08 08:04:29 +00004920#else
4921
4922static int
4923unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4924{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004925 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004926
4927 Py_UNICODE *s1 = str1->str;
4928 Py_UNICODE *s2 = str2->str;
4929
4930 len1 = str1->length;
4931 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004932
Marc-André Lemburge5034372000-08-08 08:04:29 +00004933 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004934 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004935
Fredrik Lundh45714e92001-06-26 16:39:36 +00004936 c1 = *s1++;
4937 c2 = *s2++;
4938
4939 if (c1 != c2)
4940 return (c1 < c2) ? -1 : 1;
4941
Marc-André Lemburge5034372000-08-08 08:04:29 +00004942 len1--; len2--;
4943 }
4944
4945 return (len1 < len2) ? -1 : (len1 != len2);
4946}
4947
4948#endif
4949
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950int PyUnicode_Compare(PyObject *left,
4951 PyObject *right)
4952{
4953 PyUnicodeObject *u = NULL, *v = NULL;
4954 int result;
4955
4956 /* Coerce the two arguments */
4957 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4958 if (u == NULL)
4959 goto onError;
4960 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4961 if (v == NULL)
4962 goto onError;
4963
Thomas Wouters7e474022000-07-16 12:04:32 +00004964 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 if (v == u) {
4966 Py_DECREF(u);
4967 Py_DECREF(v);
4968 return 0;
4969 }
4970
4971 result = unicode_compare(u, v);
4972
4973 Py_DECREF(u);
4974 Py_DECREF(v);
4975 return result;
4976
4977onError:
4978 Py_XDECREF(u);
4979 Py_XDECREF(v);
4980 return -1;
4981}
4982
Guido van Rossum403d68b2000-03-13 15:55:09 +00004983int PyUnicode_Contains(PyObject *container,
4984 PyObject *element)
4985{
4986 PyUnicodeObject *u = NULL, *v = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004987 int result;
4988 Py_ssize_t size;
Barry Warsaw817918c2002-08-06 16:58:21 +00004989 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004990
4991 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004992 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004993 if (v == NULL) {
4994 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004995 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004996 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004997 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004999 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005000 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005001
Barry Warsaw817918c2002-08-06 16:58:21 +00005002 size = PyUnicode_GET_SIZE(v);
5003 rhs = PyUnicode_AS_UNICODE(v);
5004 lhs = PyUnicode_AS_UNICODE(u);
5005
Guido van Rossum403d68b2000-03-13 15:55:09 +00005006 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00005007 if (size == 1) {
5008 end = lhs + PyUnicode_GET_SIZE(u);
5009 while (lhs < end) {
5010 if (*lhs++ == *rhs) {
5011 result = 1;
5012 break;
5013 }
5014 }
5015 }
5016 else {
5017 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5018 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005019 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005020 result = 1;
5021 break;
5022 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005023 }
5024 }
5025
5026 Py_DECREF(u);
5027 Py_DECREF(v);
5028 return result;
5029
5030onError:
5031 Py_XDECREF(u);
5032 Py_XDECREF(v);
5033 return -1;
5034}
5035
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036/* Concat to string or Unicode object giving a new Unicode object. */
5037
5038PyObject *PyUnicode_Concat(PyObject *left,
5039 PyObject *right)
5040{
5041 PyUnicodeObject *u = NULL, *v = NULL, *w;
5042
5043 /* Coerce the two arguments */
5044 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5045 if (u == NULL)
5046 goto onError;
5047 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5048 if (v == NULL)
5049 goto onError;
5050
5051 /* Shortcuts */
5052 if (v == unicode_empty) {
5053 Py_DECREF(v);
5054 return (PyObject *)u;
5055 }
5056 if (u == unicode_empty) {
5057 Py_DECREF(u);
5058 return (PyObject *)v;
5059 }
5060
5061 /* Concat the two Unicode strings */
5062 w = _PyUnicode_New(u->length + v->length);
5063 if (w == NULL)
5064 goto onError;
5065 Py_UNICODE_COPY(w->str, u->str, u->length);
5066 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5067
5068 Py_DECREF(u);
5069 Py_DECREF(v);
5070 return (PyObject *)w;
5071
5072onError:
5073 Py_XDECREF(u);
5074 Py_XDECREF(v);
5075 return NULL;
5076}
5077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005078PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079"S.count(sub[, start[, end]]) -> int\n\
5080\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005081Return the number of non-overlapping occurrences of substring sub in\n\
5082Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005083interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084
5085static PyObject *
5086unicode_count(PyUnicodeObject *self, PyObject *args)
5087{
5088 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005089 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005090 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 PyObject *result;
5092
Guido van Rossumb8872e62000-05-09 14:14:27 +00005093 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5094 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 return NULL;
5096
5097 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5098 (PyObject *)substring);
5099 if (substring == NULL)
5100 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005101
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 if (start < 0)
5103 start += self->length;
5104 if (start < 0)
5105 start = 0;
5106 if (end > self->length)
5107 end = self->length;
5108 if (end < 0)
5109 end += self->length;
5110 if (end < 0)
5111 end = 0;
5112
5113 result = PyInt_FromLong((long) count(self, start, end, substring));
5114
5115 Py_DECREF(substring);
5116 return result;
5117}
5118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005119PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005120"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005122Encodes S using the codec registered for encoding. encoding defaults\n\
5123to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005124handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5126'xmlcharrefreplace' as well as any other name registered with\n\
5127codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128
5129static PyObject *
5130unicode_encode(PyUnicodeObject *self, PyObject *args)
5131{
5132 char *encoding = NULL;
5133 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005134 PyObject *v;
5135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5137 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005138 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005139 if (v == NULL)
5140 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005141 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5142 PyErr_Format(PyExc_TypeError,
5143 "encoder did not return a string/unicode object "
5144 "(type=%.400s)",
5145 v->ob_type->tp_name);
5146 Py_DECREF(v);
5147 return NULL;
5148 }
5149 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005150
5151 onError:
5152 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005153}
5154
5155PyDoc_STRVAR(decode__doc__,
5156"S.decode([encoding[,errors]]) -> string or unicode\n\
5157\n\
5158Decodes S using the codec registered for encoding. encoding defaults\n\
5159to the default encoding. errors may be given to set a different error\n\
5160handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5161a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5162as well as any other name registerd with codecs.register_error that is\n\
5163able to handle UnicodeDecodeErrors.");
5164
5165static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005166unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005167{
5168 char *encoding = NULL;
5169 char *errors = NULL;
5170 PyObject *v;
5171
5172 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5173 return NULL;
5174 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005175 if (v == NULL)
5176 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005177 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5178 PyErr_Format(PyExc_TypeError,
5179 "decoder did not return a string/unicode object "
5180 "(type=%.400s)",
5181 v->ob_type->tp_name);
5182 Py_DECREF(v);
5183 return NULL;
5184 }
5185 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005186
5187 onError:
5188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189}
5190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005191PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192"S.expandtabs([tabsize]) -> unicode\n\
5193\n\
5194Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005195If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
5197static PyObject*
5198unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5199{
5200 Py_UNICODE *e;
5201 Py_UNICODE *p;
5202 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 PyUnicodeObject *u;
5205 int tabsize = 8;
5206
5207 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5208 return NULL;
5209
Thomas Wouters7e474022000-07-16 12:04:32 +00005210 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 i = j = 0;
5212 e = self->str + self->length;
5213 for (p = self->str; p < e; p++)
5214 if (*p == '\t') {
5215 if (tabsize > 0)
5216 j += tabsize - (j % tabsize);
5217 }
5218 else {
5219 j++;
5220 if (*p == '\n' || *p == '\r') {
5221 i += j;
5222 j = 0;
5223 }
5224 }
5225
5226 /* Second pass: create output string and fill it */
5227 u = _PyUnicode_New(i + j);
5228 if (!u)
5229 return NULL;
5230
5231 j = 0;
5232 q = u->str;
5233
5234 for (p = self->str; p < e; p++)
5235 if (*p == '\t') {
5236 if (tabsize > 0) {
5237 i = tabsize - (j % tabsize);
5238 j += i;
5239 while (i--)
5240 *q++ = ' ';
5241 }
5242 }
5243 else {
5244 j++;
5245 *q++ = *p;
5246 if (*p == '\n' || *p == '\r')
5247 j = 0;
5248 }
5249
5250 return (PyObject*) u;
5251}
5252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005253PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254"S.find(sub [,start [,end]]) -> int\n\
5255\n\
5256Return the lowest index in S where substring sub is found,\n\
5257such that sub is contained within s[start,end]. Optional\n\
5258arguments start and end are interpreted as in slice notation.\n\
5259\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005260Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
5262static PyObject *
5263unicode_find(PyUnicodeObject *self, PyObject *args)
5264{
5265 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005266 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005267 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 PyObject *result;
5269
Guido van Rossumb8872e62000-05-09 14:14:27 +00005270 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5271 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 return NULL;
5273 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5274 (PyObject *)substring);
5275 if (substring == NULL)
5276 return NULL;
5277
Martin v. Löwis18e16552006-02-15 17:27:45 +00005278 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279
5280 Py_DECREF(substring);
5281 return result;
5282}
5283
5284static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286{
5287 if (index < 0 || index >= self->length) {
5288 PyErr_SetString(PyExc_IndexError, "string index out of range");
5289 return NULL;
5290 }
5291
5292 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5293}
5294
5295static long
5296unicode_hash(PyUnicodeObject *self)
5297{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005298 /* Since Unicode objects compare equal to their ASCII string
5299 counterparts, they should use the individual character values
5300 as basis for their hash value. This is needed to assure that
5301 strings and Unicode objects behave in the same way as
5302 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303
Martin v. Löwis18e16552006-02-15 17:27:45 +00005304 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005305 register Py_UNICODE *p;
5306 register long x;
5307
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 if (self->hash != -1)
5309 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005310 len = PyUnicode_GET_SIZE(self);
5311 p = PyUnicode_AS_UNICODE(self);
5312 x = *p << 7;
5313 while (--len >= 0)
5314 x = (1000003*x) ^ *p++;
5315 x ^= PyUnicode_GET_SIZE(self);
5316 if (x == -1)
5317 x = -2;
5318 self->hash = x;
5319 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320}
5321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005322PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323"S.index(sub [,start [,end]]) -> int\n\
5324\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005325Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326
5327static PyObject *
5328unicode_index(PyUnicodeObject *self, PyObject *args)
5329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005332 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005333 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334
Guido van Rossumb8872e62000-05-09 14:14:27 +00005335 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5336 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005338
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5340 (PyObject *)substring);
5341 if (substring == NULL)
5342 return NULL;
5343
5344 result = findstring(self, substring, start, end, 1);
5345
5346 Py_DECREF(substring);
5347 if (result < 0) {
5348 PyErr_SetString(PyExc_ValueError, "substring not found");
5349 return NULL;
5350 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005351 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352}
5353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005354PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005355"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005357Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005358at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
5360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005361unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362{
5363 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5364 register const Py_UNICODE *e;
5365 int cased;
5366
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 /* Shortcut for single character strings */
5368 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005369 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005371 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005372 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005373 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 e = p + PyUnicode_GET_SIZE(self);
5376 cased = 0;
5377 for (; p < e; p++) {
5378 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005379
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005381 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 else if (!cased && Py_UNICODE_ISLOWER(ch))
5383 cased = 1;
5384 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005385 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386}
5387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005388PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005389"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005391Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005392at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393
5394static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005395unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396{
5397 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5398 register const Py_UNICODE *e;
5399 int cased;
5400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 /* Shortcut for single character strings */
5402 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005403 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005405 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005406 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005407 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005408
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 e = p + PyUnicode_GET_SIZE(self);
5410 cased = 0;
5411 for (; p < e; p++) {
5412 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005413
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005415 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 else if (!cased && Py_UNICODE_ISUPPER(ch))
5417 cased = 1;
5418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005419 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420}
5421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005422PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005423"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005425Return True if S is a titlecased string and there is at least one\n\
5426character in S, i.e. upper- and titlecase characters may only\n\
5427follow uncased characters and lowercase characters only cased ones.\n\
5428Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
5430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005431unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432{
5433 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5434 register const Py_UNICODE *e;
5435 int cased, previous_is_cased;
5436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 /* Shortcut for single character strings */
5438 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005439 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5440 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005442 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005443 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005444 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005445
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 e = p + PyUnicode_GET_SIZE(self);
5447 cased = 0;
5448 previous_is_cased = 0;
5449 for (; p < e; p++) {
5450 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005451
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5453 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005454 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 previous_is_cased = 1;
5456 cased = 1;
5457 }
5458 else if (Py_UNICODE_ISLOWER(ch)) {
5459 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005460 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 previous_is_cased = 1;
5462 cased = 1;
5463 }
5464 else
5465 previous_is_cased = 0;
5466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005467 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468}
5469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005470PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005471"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005473Return True if all characters in S are whitespace\n\
5474and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
5476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005477unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478{
5479 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5480 register const Py_UNICODE *e;
5481
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 /* Shortcut for single character strings */
5483 if (PyUnicode_GET_SIZE(self) == 1 &&
5484 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005485 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005487 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005488 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005489 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005490
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 e = p + PyUnicode_GET_SIZE(self);
5492 for (; p < e; p++) {
5493 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005494 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005496 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497}
5498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005499PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005500"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005501\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005502Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005503and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005504
5505static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005506unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005507{
5508 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5509 register const Py_UNICODE *e;
5510
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005511 /* Shortcut for single character strings */
5512 if (PyUnicode_GET_SIZE(self) == 1 &&
5513 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005514 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005515
5516 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005517 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005518 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005519
5520 e = p + PyUnicode_GET_SIZE(self);
5521 for (; p < e; p++) {
5522 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005523 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005525 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005526}
5527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005528PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005529"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005530\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005531Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005532and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005533
5534static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005535unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005536{
5537 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5538 register const Py_UNICODE *e;
5539
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005540 /* Shortcut for single character strings */
5541 if (PyUnicode_GET_SIZE(self) == 1 &&
5542 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005543 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005544
5545 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005546 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005547 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005548
5549 e = p + PyUnicode_GET_SIZE(self);
5550 for (; p < e; p++) {
5551 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005552 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005553 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005554 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005555}
5556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005557PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005558"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005560Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005561False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
5563static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005564unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565{
5566 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5567 register const Py_UNICODE *e;
5568
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 /* Shortcut for single character strings */
5570 if (PyUnicode_GET_SIZE(self) == 1 &&
5571 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005572 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005574 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005575 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005576 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005577
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 e = p + PyUnicode_GET_SIZE(self);
5579 for (; p < e; p++) {
5580 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005581 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005583 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584}
5585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005586PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005587"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005589Return True if all characters in S are digits\n\
5590and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591
5592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005593unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594{
5595 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5596 register const Py_UNICODE *e;
5597
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 /* Shortcut for single character strings */
5599 if (PyUnicode_GET_SIZE(self) == 1 &&
5600 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005601 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005603 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005604 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005605 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 e = p + PyUnicode_GET_SIZE(self);
5608 for (; p < e; p++) {
5609 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005610 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005612 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613}
5614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005615PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005616"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005618Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005619False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
5621static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005622unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623{
5624 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5625 register const Py_UNICODE *e;
5626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 /* Shortcut for single character strings */
5628 if (PyUnicode_GET_SIZE(self) == 1 &&
5629 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005630 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005632 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005633 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005634 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005635
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 e = p + PyUnicode_GET_SIZE(self);
5637 for (; p < e; p++) {
5638 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005639 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005641 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642}
5643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005644PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645"S.join(sequence) -> unicode\n\
5646\n\
5647Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005648sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
5650static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005651unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005653 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654}
5655
Martin v. Löwis18e16552006-02-15 17:27:45 +00005656static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657unicode_length(PyUnicodeObject *self)
5658{
5659 return self->length;
5660}
5661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005662PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005663"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664\n\
5665Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005666done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667
5668static PyObject *
5669unicode_ljust(PyUnicodeObject *self, PyObject *args)
5670{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005671 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005672 Py_UNICODE fillchar = ' ';
5673
Martin v. Löwis412fb672006-04-13 06:34:32 +00005674 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 return NULL;
5676
Tim Peters7a29bd52001-09-12 03:03:31 +00005677 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 Py_INCREF(self);
5679 return (PyObject*) self;
5680 }
5681
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005682 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683}
5684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005685PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686"S.lower() -> unicode\n\
5687\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005688Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005691unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 return fixup(self, fixlower);
5694}
5695
/* Selector values for the strip family.  They double as indexes into
   stripformat[] below, so the numbering must match the array order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings for the strip methods, indexed by the
   selectors above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its 3-character
   "|O:" prefix skipped */
#define STRIPNAME(i) (&stripformat[i][3])
5704
5705static const Py_UNICODE *
5706unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5707{
Tim Peters030a5ce2002-04-22 19:00:10 +00005708 size_t i;
5709 for (i = 0; i < n; ++i)
5710 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005711 return s+i;
5712 return NULL;
5713}
5714
5715/* externally visible for str.strip(unicode) */
5716PyObject *
5717_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5718{
5719 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005720 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005721 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005722 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5723 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005724
5725 i = 0;
5726 if (striptype != RIGHTSTRIP) {
5727 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5728 i++;
5729 }
5730 }
5731
5732 j = len;
5733 if (striptype != LEFTSTRIP) {
5734 do {
5735 j--;
5736 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5737 j++;
5738 }
5739
5740 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5741 Py_INCREF(self);
5742 return (PyObject*)self;
5743 }
5744 else
5745 return PyUnicode_FromUnicode(s+i, j-i);
5746}
5747
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
5749static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005750do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005752 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005753 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005754
5755 i = 0;
5756 if (striptype != RIGHTSTRIP) {
5757 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5758 i++;
5759 }
5760 }
5761
5762 j = len;
5763 if (striptype != LEFTSTRIP) {
5764 do {
5765 j--;
5766 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5767 j++;
5768 }
5769
5770 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5771 Py_INCREF(self);
5772 return (PyObject*)self;
5773 }
5774 else
5775 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776}
5777
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005778
5779static PyObject *
5780do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5781{
5782 PyObject *sep = NULL;
5783
5784 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5785 return NULL;
5786
5787 if (sep != NULL && sep != Py_None) {
5788 if (PyUnicode_Check(sep))
5789 return _PyUnicode_XStrip(self, striptype, sep);
5790 else if (PyString_Check(sep)) {
5791 PyObject *res;
5792 sep = PyUnicode_FromObject(sep);
5793 if (sep==NULL)
5794 return NULL;
5795 res = _PyUnicode_XStrip(self, striptype, sep);
5796 Py_DECREF(sep);
5797 return res;
5798 }
5799 else {
5800 PyErr_Format(PyExc_TypeError,
5801 "%s arg must be None, unicode or str",
5802 STRIPNAME(striptype));
5803 return NULL;
5804 }
5805 }
5806
5807 return do_strip(self, striptype);
5808}
5809
5810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005811PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005812"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005813\n\
5814Return a copy of the string S with leading and trailing\n\
5815whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005816If chars is given and not None, remove characters in chars instead.\n\
5817If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005818
5819static PyObject *
5820unicode_strip(PyUnicodeObject *self, PyObject *args)
5821{
5822 if (PyTuple_GET_SIZE(args) == 0)
5823 return do_strip(self, BOTHSTRIP); /* Common case */
5824 else
5825 return do_argstrip(self, BOTHSTRIP, args);
5826}
5827
5828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005829PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005830"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005831\n\
5832Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005833If chars is given and not None, remove characters in chars instead.\n\
5834If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005835
5836static PyObject *
5837unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5838{
5839 if (PyTuple_GET_SIZE(args) == 0)
5840 return do_strip(self, LEFTSTRIP); /* Common case */
5841 else
5842 return do_argstrip(self, LEFTSTRIP, args);
5843}
5844
5845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005846PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005847"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005848\n\
5849Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005850If chars is given and not None, remove characters in chars instead.\n\
5851If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005852
5853static PyObject *
5854unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5855{
5856 if (PyTuple_GET_SIZE(args) == 0)
5857 return do_strip(self, RIGHTSTRIP); /* Common case */
5858 else
5859 return do_argstrip(self, RIGHTSTRIP, args);
5860}
5861
5862
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
5866 PyUnicodeObject *u;
5867 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005869 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
5871 if (len < 0)
5872 len = 0;
5873
Tim Peters7a29bd52001-09-12 03:03:31 +00005874 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 /* no repeat, return original string */
5876 Py_INCREF(str);
5877 return (PyObject*) str;
5878 }
Tim Peters8f422462000-09-09 06:13:41 +00005879
5880 /* ensure # of chars needed doesn't overflow int and # of bytes
5881 * needed doesn't overflow size_t
5882 */
5883 nchars = len * str->length;
5884 if (len && nchars / len != str->length) {
5885 PyErr_SetString(PyExc_OverflowError,
5886 "repeated string is too long");
5887 return NULL;
5888 }
5889 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5890 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5891 PyErr_SetString(PyExc_OverflowError,
5892 "repeated string is too long");
5893 return NULL;
5894 }
5895 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 if (!u)
5897 return NULL;
5898
5899 p = u->str;
5900
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005901 if (str->length == 1 && len > 0) {
5902 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005903 } else {
5904 int done = 0; /* number of characters copied this far */
5905 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005906 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005907 done = str->length;
5908 }
5909 while (done < nchars) {
5910 int n = (done <= nchars-done) ? done : nchars-done;
5911 Py_UNICODE_COPY(p+done, p, n);
5912 done += n;
5913 }
5914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915
5916 return (PyObject*) u;
5917}
5918
5919PyObject *PyUnicode_Replace(PyObject *obj,
5920 PyObject *subobj,
5921 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005922 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923{
5924 PyObject *self;
5925 PyObject *str1;
5926 PyObject *str2;
5927 PyObject *result;
5928
5929 self = PyUnicode_FromObject(obj);
5930 if (self == NULL)
5931 return NULL;
5932 str1 = PyUnicode_FromObject(subobj);
5933 if (str1 == NULL) {
5934 Py_DECREF(self);
5935 return NULL;
5936 }
5937 str2 = PyUnicode_FromObject(replobj);
5938 if (str2 == NULL) {
5939 Py_DECREF(self);
5940 Py_DECREF(str1);
5941 return NULL;
5942 }
Tim Petersced69f82003-09-16 20:30:58 +00005943 result = replace((PyUnicodeObject *)self,
5944 (PyUnicodeObject *)str1,
5945 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 maxcount);
5947 Py_DECREF(self);
5948 Py_DECREF(str1);
5949 Py_DECREF(str2);
5950 return result;
5951}
5952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005953PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954"S.replace (old, new[, maxsplit]) -> unicode\n\
5955\n\
5956Return a copy of S with all occurrences of substring\n\
5957old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005958given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959
5960static PyObject*
5961unicode_replace(PyUnicodeObject *self, PyObject *args)
5962{
5963 PyUnicodeObject *str1;
5964 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005965 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 PyObject *result;
5967
Martin v. Löwis18e16552006-02-15 17:27:45 +00005968 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 return NULL;
5970 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5971 if (str1 == NULL)
5972 return NULL;
5973 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005974 if (str2 == NULL) {
5975 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978
5979 result = replace(self, str1, str2, maxcount);
5980
5981 Py_DECREF(str1);
5982 Py_DECREF(str2);
5983 return result;
5984}
5985
5986static
5987PyObject *unicode_repr(PyObject *unicode)
5988{
5989 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5990 PyUnicode_GET_SIZE(unicode),
5991 1);
5992}
5993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005994PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995"S.rfind(sub [,start [,end]]) -> int\n\
5996\n\
5997Return the highest index in S where substring sub is found,\n\
5998such that sub is contained within s[start,end]. Optional\n\
5999arguments start and end are interpreted as in slice notation.\n\
6000\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006001Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002
6003static PyObject *
6004unicode_rfind(PyUnicodeObject *self, PyObject *args)
6005{
6006 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006007 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006008 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 PyObject *result;
6010
Guido van Rossumb8872e62000-05-09 14:14:27 +00006011 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6012 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 return NULL;
6014 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6015 (PyObject *)substring);
6016 if (substring == NULL)
6017 return NULL;
6018
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021 Py_DECREF(substring);
6022 return result;
6023}
6024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006025PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026"S.rindex(sub [,start [,end]]) -> int\n\
6027\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006028Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
6030static PyObject *
6031unicode_rindex(PyUnicodeObject *self, PyObject *args)
6032{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006033 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006035 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006036 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Guido van Rossumb8872e62000-05-09 14:14:27 +00006038 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6039 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 return NULL;
6041 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6042 (PyObject *)substring);
6043 if (substring == NULL)
6044 return NULL;
6045
6046 result = findstring(self, substring, start, end, -1);
6047
6048 Py_DECREF(substring);
6049 if (result < 0) {
6050 PyErr_SetString(PyExc_ValueError, "substring not found");
6051 return NULL;
6052 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006053 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054}
6055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006056PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006057"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058\n\
6059Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006060done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
6062static PyObject *
6063unicode_rjust(PyUnicodeObject *self, PyObject *args)
6064{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006065 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006066 Py_UNICODE fillchar = ' ';
6067
Martin v. Löwis412fb672006-04-13 06:34:32 +00006068 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 return NULL;
6070
Tim Peters7a29bd52001-09-12 03:03:31 +00006071 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 Py_INCREF(self);
6073 return (PyObject*) self;
6074 }
6075
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006076 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077}
6078
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081{
6082 /* standard clamping */
6083 if (start < 0)
6084 start = 0;
6085 if (end < 0)
6086 end = 0;
6087 if (end > self->length)
6088 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006089 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 /* full slice, return original string */
6091 Py_INCREF(self);
6092 return (PyObject*) self;
6093 }
6094 if (start > end)
6095 start = end;
6096 /* copy slice */
6097 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6098 end - start);
6099}
6100
6101PyObject *PyUnicode_Split(PyObject *s,
6102 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006103 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104{
6105 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006106
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 s = PyUnicode_FromObject(s);
6108 if (s == NULL)
6109 return NULL;
6110 if (sep != NULL) {
6111 sep = PyUnicode_FromObject(sep);
6112 if (sep == NULL) {
6113 Py_DECREF(s);
6114 return NULL;
6115 }
6116 }
6117
6118 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6119
6120 Py_DECREF(s);
6121 Py_XDECREF(sep);
6122 return result;
6123}
6124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006125PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126"S.split([sep [,maxsplit]]) -> list of strings\n\
6127\n\
6128Return a list of the words in S, using sep as the\n\
6129delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006130splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006131any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
6133static PyObject*
6134unicode_split(PyUnicodeObject *self, PyObject *args)
6135{
6136 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006137 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Martin v. Löwis18e16552006-02-15 17:27:45 +00006139 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 return NULL;
6141
6142 if (substring == Py_None)
6143 return split(self, NULL, maxcount);
6144 else if (PyUnicode_Check(substring))
6145 return split(self, (PyUnicodeObject *)substring, maxcount);
6146 else
6147 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6148}
6149
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006150PyObject *PyUnicode_RSplit(PyObject *s,
6151 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006152 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006153{
6154 PyObject *result;
6155
6156 s = PyUnicode_FromObject(s);
6157 if (s == NULL)
6158 return NULL;
6159 if (sep != NULL) {
6160 sep = PyUnicode_FromObject(sep);
6161 if (sep == NULL) {
6162 Py_DECREF(s);
6163 return NULL;
6164 }
6165 }
6166
6167 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6168
6169 Py_DECREF(s);
6170 Py_XDECREF(sep);
6171 return result;
6172}
6173
6174PyDoc_STRVAR(rsplit__doc__,
6175"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6176\n\
6177Return a list of the words in S, using sep as the\n\
6178delimiter string, starting at the end of the string and\n\
6179working to the front. If maxsplit is given, at most maxsplit\n\
6180splits are done. If sep is not specified, any whitespace string\n\
6181is a separator.");
6182
6183static PyObject*
6184unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6185{
6186 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006187 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006188
Martin v. Löwis18e16552006-02-15 17:27:45 +00006189 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006190 return NULL;
6191
6192 if (substring == Py_None)
6193 return rsplit(self, NULL, maxcount);
6194 else if (PyUnicode_Check(substring))
6195 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6196 else
6197 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6198}
6199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006200PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006201"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202\n\
6203Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006204Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006205is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206
6207static PyObject*
6208unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6209{
Guido van Rossum86662912000-04-11 15:38:46 +00006210 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
Guido van Rossum86662912000-04-11 15:38:46 +00006212 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 return NULL;
6214
Guido van Rossum86662912000-04-11 15:38:46 +00006215 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216}
6217
6218static
6219PyObject *unicode_str(PyUnicodeObject *self)
6220{
Fred Drakee4315f52000-05-09 19:53:39 +00006221 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222}
6223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006224PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225"S.swapcase() -> unicode\n\
6226\n\
6227Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006228and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229
6230static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006231unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 return fixup(self, fixswapcase);
6234}
6235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006236PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237"S.translate(table) -> unicode\n\
6238\n\
6239Return a copy of the string S, where all characters have been mapped\n\
6240through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006241Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6242Unmapped characters are left untouched. Characters mapped to None\n\
6243are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244
6245static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006246unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247{
Tim Petersced69f82003-09-16 20:30:58 +00006248 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006250 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 "ignore");
6252}
6253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006254PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255"S.upper() -> unicode\n\
6256\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006257Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258
6259static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006260unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 return fixup(self, fixupper);
6263}
6264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006265PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266"S.zfill(width) -> unicode\n\
6267\n\
6268Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006269of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270
6271static PyObject *
6272unicode_zfill(PyUnicodeObject *self, PyObject *args)
6273{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006274 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 PyUnicodeObject *u;
6276
Martin v. Löwis18e16552006-02-15 17:27:45 +00006277 Py_ssize_t width;
6278 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 return NULL;
6280
6281 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006282 if (PyUnicode_CheckExact(self)) {
6283 Py_INCREF(self);
6284 return (PyObject*) self;
6285 }
6286 else
6287 return PyUnicode_FromUnicode(
6288 PyUnicode_AS_UNICODE(self),
6289 PyUnicode_GET_SIZE(self)
6290 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 }
6292
6293 fill = width - self->length;
6294
6295 u = pad(self, fill, 0, '0');
6296
Walter Dörwald068325e2002-04-15 13:36:47 +00006297 if (u == NULL)
6298 return NULL;
6299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 if (u->str[fill] == '+' || u->str[fill] == '-') {
6301 /* move sign to beginning of string */
6302 u->str[0] = u->str[fill];
6303 u->str[fill] = '0';
6304 }
6305
6306 return (PyObject*) u;
6307}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
#if 0
/* Debugging aid (compiled out): report the value of unicode_freelist_size
   as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006317PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006318"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006320Return True if S starts with the specified prefix, False otherwise.\n\
6321With optional start, test S beginning at that position.\n\
6322With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
6324static PyObject *
6325unicode_startswith(PyUnicodeObject *self,
6326 PyObject *args)
6327{
6328 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006329 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006330 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 PyObject *result;
6332
Guido van Rossumb8872e62000-05-09 14:14:27 +00006333 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6334 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 return NULL;
6336 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6337 (PyObject *)substring);
6338 if (substring == NULL)
6339 return NULL;
6340
Guido van Rossum77f6a652002-04-03 22:41:51 +00006341 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
6343 Py_DECREF(substring);
6344 return result;
6345}
6346
6347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006348PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006349"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006351Return True if S ends with the specified suffix, False otherwise.\n\
6352With optional start, test S beginning at that position.\n\
6353With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
6355static PyObject *
6356unicode_endswith(PyUnicodeObject *self,
6357 PyObject *args)
6358{
6359 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006360 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006361 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 PyObject *result;
6363
Guido van Rossumb8872e62000-05-09 14:14:27 +00006364 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6365 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 return NULL;
6367 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6368 (PyObject *)substring);
6369 if (substring == NULL)
6370 return NULL;
6371
Guido van Rossum77f6a652002-04-03 22:41:51 +00006372 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373
6374 Py_DECREF(substring);
6375 return result;
6376}
6377
6378
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006379
6380static PyObject *
6381unicode_getnewargs(PyUnicodeObject *v)
6382{
6383 return Py_BuildValue("(u#)", v->str, v->length);
6384}
6385
6386
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387static PyMethodDef unicode_methods[] = {
6388
6389 /* Order is according to common usage: often used methods should
6390 appear first, since lookup is done sequentially. */
6391
Georg Brandlecdc0a92006-03-30 12:19:07 +00006392 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006393 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6394 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006395 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006396 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6397 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6398 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6399 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6400 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6401 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6402 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6403 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6404 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6405 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006406 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006407 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006408/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6409 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6410 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6411 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006412 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006413 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006414 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006415 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6416 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6417 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6418 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6419 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6420 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6421 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6422 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6423 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6424 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6425 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6426 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6427 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6428 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006429 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006430#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006431 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432#endif
6433
6434#if 0
6435 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006436 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437#endif
6438
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006439 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 {NULL, NULL}
6441};
6442
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006443static PyObject *
6444unicode_mod(PyObject *v, PyObject *w)
6445{
6446 if (!PyUnicode_Check(v)) {
6447 Py_INCREF(Py_NotImplemented);
6448 return Py_NotImplemented;
6449 }
6450 return PyUnicode_Format(v, w);
6451}
6452
6453static PyNumberMethods unicode_as_number = {
6454 0, /*nb_add*/
6455 0, /*nb_subtract*/
6456 0, /*nb_multiply*/
6457 0, /*nb_divide*/
6458 unicode_mod, /*nb_remainder*/
6459};
6460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006463 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006464 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6465 (ssizeargfunc) unicode_getitem, /* sq_item */
6466 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 0, /* sq_ass_item */
6468 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006469 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470};
6471
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006472#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6473
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006474static PyObject*
6475unicode_subscript(PyUnicodeObject* self, PyObject* item)
6476{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006477 PyNumberMethods *nb = item->ob_type->tp_as_number;
6478 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6479 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006480 if (i == -1 && PyErr_Occurred())
6481 return NULL;
6482 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006483 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006484 return unicode_getitem(self, i);
6485 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006486 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006487 Py_UNICODE* source_buf;
6488 Py_UNICODE* result_buf;
6489 PyObject* result;
6490
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006491 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006492 &start, &stop, &step, &slicelength) < 0) {
6493 return NULL;
6494 }
6495
6496 if (slicelength <= 0) {
6497 return PyUnicode_FromUnicode(NULL, 0);
6498 } else {
6499 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006500 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6501 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006502
6503 if (result_buf == NULL)
6504 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006505
6506 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6507 result_buf[i] = source_buf[cur];
6508 }
Tim Petersced69f82003-09-16 20:30:58 +00006509
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006510 result = PyUnicode_FromUnicode(result_buf, slicelength);
6511 PyMem_FREE(result_buf);
6512 return result;
6513 }
6514 } else {
6515 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6516 return NULL;
6517 }
6518}
6519
6520static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006521 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006522 (binaryfunc)unicode_subscript, /* mp_subscript */
6523 (objobjargproc)0, /* mp_ass_subscript */
6524};
6525
Martin v. Löwis18e16552006-02-15 17:27:45 +00006526static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 const void **ptr)
6530{
6531 if (index != 0) {
6532 PyErr_SetString(PyExc_SystemError,
6533 "accessing non-existent unicode segment");
6534 return -1;
6535 }
6536 *ptr = (void *) self->str;
6537 return PyUnicode_GET_DATA_SIZE(self);
6538}
6539
Martin v. Löwis18e16552006-02-15 17:27:45 +00006540static Py_ssize_t
6541unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 const void **ptr)
6543{
6544 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006545 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 return -1;
6547}
6548
6549static int
6550unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006551 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552{
6553 if (lenp)
6554 *lenp = PyUnicode_GET_DATA_SIZE(self);
6555 return 1;
6556}
6557
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006558static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006560 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 const void **ptr)
6562{
6563 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006564
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 if (index != 0) {
6566 PyErr_SetString(PyExc_SystemError,
6567 "accessing non-existent unicode segment");
6568 return -1;
6569 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006570 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 if (str == NULL)
6572 return -1;
6573 *ptr = (void *) PyString_AS_STRING(str);
6574 return PyString_GET_SIZE(str);
6575}
6576
6577/* Helpers for PyUnicode_Format() */
6578
6579static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006580getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006582 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 if (argidx < arglen) {
6584 (*p_argidx)++;
6585 if (arglen < 0)
6586 return args;
6587 else
6588 return PyTuple_GetItem(args, argidx);
6589 }
6590 PyErr_SetString(PyExc_TypeError,
6591 "not enough arguments for format string");
6592 return NULL;
6593}
6594
/* Bit flags describing a format specifier.  F_ALT makes formatfloat()
   and formatint() emit the '#' modifier; the other names presumably map
   to the '-', '+', ' ' and '0' printf-style flags (confirm against the
   flag parser in PyUnicode_Format). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
6600
Martin v. Löwis18e16552006-02-15 17:27:45 +00006601static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006602strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006604 register Py_ssize_t i;
6605 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 for (i = len - 1; i >= 0; i--)
6607 buffer[i] = (Py_UNICODE) charbuffer[i];
6608
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 return len;
6610}
6611
Neal Norwitzfc76d632006-01-10 06:03:13 +00006612static int
6613doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6614{
Tim Peters15231542006-02-16 01:08:01 +00006615 Py_ssize_t result;
6616
Neal Norwitzfc76d632006-01-10 06:03:13 +00006617 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006618 result = strtounicode(buffer, (char *)buffer);
6619 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006620}
6621
6622static int
6623longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6624{
Tim Peters15231542006-02-16 01:08:01 +00006625 Py_ssize_t result;
6626
Neal Norwitzfc76d632006-01-10 06:03:13 +00006627 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006628 result = strtounicode(buffer, (char *)buffer);
6629 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006630}
6631
Guido van Rossum078151d2002-08-11 04:24:12 +00006632/* XXX To save some code duplication, formatfloat/long/int could have been
6633 shared with stringobject.c, converting from 8-bit to Unicode after the
6634 formatting is done. */
6635
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636static int
6637formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006638 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 int flags,
6640 int prec,
6641 int type,
6642 PyObject *v)
6643{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006644 /* fmt = '%#.' + `prec` + `type`
6645 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 char fmt[20];
6647 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006648
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 x = PyFloat_AsDouble(v);
6650 if (x == -1.0 && PyErr_Occurred())
6651 return -1;
6652 if (prec < 0)
6653 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6655 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006656 /* Worst case length calc to ensure no buffer overrun:
6657
6658 'g' formats:
6659 fmt = %#.<prec>g
6660 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6661 for any double rep.)
6662 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6663
6664 'f' formats:
6665 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6666 len = 1 + 50 + 1 + prec = 52 + prec
6667
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006668 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006669 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006670
6671 */
6672 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6673 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006674 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006675 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006676 return -1;
6677 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006678 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6679 (flags&F_ALT) ? "#" : "",
6680 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006681 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682}
6683
Tim Peters38fd5b62000-09-21 05:43:11 +00006684static PyObject*
6685formatlong(PyObject *val, int flags, int prec, int type)
6686{
6687 char *buf;
6688 int i, len;
6689 PyObject *str; /* temporary string object. */
6690 PyUnicodeObject *result;
6691
6692 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6693 if (!str)
6694 return NULL;
6695 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006696 if (!result) {
6697 Py_DECREF(str);
6698 return NULL;
6699 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006700 for (i = 0; i < len; i++)
6701 result->str[i] = buf[i];
6702 result->str[len] = 0;
6703 Py_DECREF(str);
6704 return (PyObject*)result;
6705}
6706
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707static int
6708formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006709 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 int flags,
6711 int prec,
6712 int type,
6713 PyObject *v)
6714{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006715 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006716 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6717 * + 1 + 1
6718 * = 24
6719 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006720 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006721 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 long x;
6723
6724 x = PyInt_AsLong(v);
6725 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006726 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006727 if (x < 0 && type == 'u') {
6728 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006729 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006730 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6731 sign = "-";
6732 else
6733 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006735 prec = 1;
6736
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006737 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6738 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006739 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006740 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006741 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006742 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006743 return -1;
6744 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006745
6746 if ((flags & F_ALT) &&
6747 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006748 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006749 * of issues that cause pain:
6750 * - when 0 is being converted, the C standard leaves off
6751 * the '0x' or '0X', which is inconsistent with other
6752 * %#x/%#X conversions and inconsistent with Python's
6753 * hex() function
6754 * - there are platforms that violate the standard and
6755 * convert 0 with the '0x' or '0X'
6756 * (Metrowerks, Compaq Tru64)
6757 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006758 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006759 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006760 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006761 * We can achieve the desired consistency by inserting our
6762 * own '0x' or '0X' prefix, and substituting %x/%X in place
6763 * of %#x/%#X.
6764 *
6765 * Note that this is the same approach as used in
6766 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006767 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006768 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6769 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006770 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006771 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006772 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6773 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006774 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006775 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006776 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006777 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006778 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006779 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780}
6781
6782static int
6783formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006784 size_t buflen,
6785 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006787 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006788 if (PyUnicode_Check(v)) {
6789 if (PyUnicode_GET_SIZE(v) != 1)
6790 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006794 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006795 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006796 goto onError;
6797 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799
6800 else {
6801 /* Integer input truncated to a character */
6802 long x;
6803 x = PyInt_AsLong(v);
6804 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006805 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006806#ifdef Py_UNICODE_WIDE
6807 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006808 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006809 "%c arg not in range(0x110000) "
6810 "(wide Python build)");
6811 return -1;
6812 }
6813#else
6814 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006815 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006816 "%c arg not in range(0x10000) "
6817 "(narrow Python build)");
6818 return -1;
6819 }
6820#endif
6821 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 }
6823 buf[1] = '\0';
6824 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006825
6826 onError:
6827 PyErr_SetString(PyExc_TypeError,
6828 "%c requires int or char");
6829 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830}
6831
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
6841
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842PyObject *PyUnicode_Format(PyObject *format,
6843 PyObject *args)
6844{
6845 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006846 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 int args_owned = 0;
6848 PyUnicodeObject *result = NULL;
6849 PyObject *dict = NULL;
6850 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006851
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 if (format == NULL || args == NULL) {
6853 PyErr_BadInternalCall();
6854 return NULL;
6855 }
6856 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006857 if (uformat == NULL)
6858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 fmt = PyUnicode_AS_UNICODE(uformat);
6860 fmtcnt = PyUnicode_GET_SIZE(uformat);
6861
6862 reslen = rescnt = fmtcnt + 100;
6863 result = _PyUnicode_New(reslen);
6864 if (result == NULL)
6865 goto onError;
6866 res = PyUnicode_AS_UNICODE(result);
6867
6868 if (PyTuple_Check(args)) {
6869 arglen = PyTuple_Size(args);
6870 argidx = 0;
6871 }
6872 else {
6873 arglen = -1;
6874 argidx = -2;
6875 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006876 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6877 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 dict = args;
6879
6880 while (--fmtcnt >= 0) {
6881 if (*fmt != '%') {
6882 if (--rescnt < 0) {
6883 rescnt = fmtcnt + 100;
6884 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006885 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006886 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6888 --rescnt;
6889 }
6890 *res++ = *fmt++;
6891 }
6892 else {
6893 /* Got a format specifier */
6894 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006895 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 Py_UNICODE c = '\0';
6898 Py_UNICODE fill;
6899 PyObject *v = NULL;
6900 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006901 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006903 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006904 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905
6906 fmt++;
6907 if (*fmt == '(') {
6908 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006909 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 PyObject *key;
6911 int pcount = 1;
6912
6913 if (dict == NULL) {
6914 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006915 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 goto onError;
6917 }
6918 ++fmt;
6919 --fmtcnt;
6920 keystart = fmt;
6921 /* Skip over balanced parentheses */
6922 while (pcount > 0 && --fmtcnt >= 0) {
6923 if (*fmt == ')')
6924 --pcount;
6925 else if (*fmt == '(')
6926 ++pcount;
6927 fmt++;
6928 }
6929 keylen = fmt - keystart - 1;
6930 if (fmtcnt < 0 || pcount > 0) {
6931 PyErr_SetString(PyExc_ValueError,
6932 "incomplete format key");
6933 goto onError;
6934 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006935#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006936 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 then looked up since Python uses strings to hold
6938 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006939 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 key = PyUnicode_EncodeUTF8(keystart,
6941 keylen,
6942 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006943#else
6944 key = PyUnicode_FromUnicode(keystart, keylen);
6945#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 if (key == NULL)
6947 goto onError;
6948 if (args_owned) {
6949 Py_DECREF(args);
6950 args_owned = 0;
6951 }
6952 args = PyObject_GetItem(dict, key);
6953 Py_DECREF(key);
6954 if (args == NULL) {
6955 goto onError;
6956 }
6957 args_owned = 1;
6958 arglen = -1;
6959 argidx = -2;
6960 }
6961 while (--fmtcnt >= 0) {
6962 switch (c = *fmt++) {
6963 case '-': flags |= F_LJUST; continue;
6964 case '+': flags |= F_SIGN; continue;
6965 case ' ': flags |= F_BLANK; continue;
6966 case '#': flags |= F_ALT; continue;
6967 case '0': flags |= F_ZERO; continue;
6968 }
6969 break;
6970 }
6971 if (c == '*') {
6972 v = getnextarg(args, arglen, &argidx);
6973 if (v == NULL)
6974 goto onError;
6975 if (!PyInt_Check(v)) {
6976 PyErr_SetString(PyExc_TypeError,
6977 "* wants int");
6978 goto onError;
6979 }
6980 width = PyInt_AsLong(v);
6981 if (width < 0) {
6982 flags |= F_LJUST;
6983 width = -width;
6984 }
6985 if (--fmtcnt >= 0)
6986 c = *fmt++;
6987 }
6988 else if (c >= '0' && c <= '9') {
6989 width = c - '0';
6990 while (--fmtcnt >= 0) {
6991 c = *fmt++;
6992 if (c < '0' || c > '9')
6993 break;
6994 if ((width*10) / 10 != width) {
6995 PyErr_SetString(PyExc_ValueError,
6996 "width too big");
6997 goto onError;
6998 }
6999 width = width*10 + (c - '0');
7000 }
7001 }
7002 if (c == '.') {
7003 prec = 0;
7004 if (--fmtcnt >= 0)
7005 c = *fmt++;
7006 if (c == '*') {
7007 v = getnextarg(args, arglen, &argidx);
7008 if (v == NULL)
7009 goto onError;
7010 if (!PyInt_Check(v)) {
7011 PyErr_SetString(PyExc_TypeError,
7012 "* wants int");
7013 goto onError;
7014 }
7015 prec = PyInt_AsLong(v);
7016 if (prec < 0)
7017 prec = 0;
7018 if (--fmtcnt >= 0)
7019 c = *fmt++;
7020 }
7021 else if (c >= '0' && c <= '9') {
7022 prec = c - '0';
7023 while (--fmtcnt >= 0) {
7024 c = Py_CHARMASK(*fmt++);
7025 if (c < '0' || c > '9')
7026 break;
7027 if ((prec*10) / 10 != prec) {
7028 PyErr_SetString(PyExc_ValueError,
7029 "prec too big");
7030 goto onError;
7031 }
7032 prec = prec*10 + (c - '0');
7033 }
7034 }
7035 } /* prec */
7036 if (fmtcnt >= 0) {
7037 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 if (--fmtcnt >= 0)
7039 c = *fmt++;
7040 }
7041 }
7042 if (fmtcnt < 0) {
7043 PyErr_SetString(PyExc_ValueError,
7044 "incomplete format");
7045 goto onError;
7046 }
7047 if (c != '%') {
7048 v = getnextarg(args, arglen, &argidx);
7049 if (v == NULL)
7050 goto onError;
7051 }
7052 sign = 0;
7053 fill = ' ';
7054 switch (c) {
7055
7056 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007057 pbuf = formatbuf;
7058 /* presume that buffer length is at least 1 */
7059 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 len = 1;
7061 break;
7062
7063 case 's':
7064 case 'r':
7065 if (PyUnicode_Check(v) && c == 's') {
7066 temp = v;
7067 Py_INCREF(temp);
7068 }
7069 else {
7070 PyObject *unicode;
7071 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007072 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 else
7074 temp = PyObject_Repr(v);
7075 if (temp == NULL)
7076 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007077 if (PyUnicode_Check(temp))
7078 /* nothing to do */;
7079 else if (PyString_Check(temp)) {
7080 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007081 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007083 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007085 Py_DECREF(temp);
7086 temp = unicode;
7087 if (temp == NULL)
7088 goto onError;
7089 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007090 else {
7091 Py_DECREF(temp);
7092 PyErr_SetString(PyExc_TypeError,
7093 "%s argument has non-string str()");
7094 goto onError;
7095 }
7096 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007097 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 len = PyUnicode_GET_SIZE(temp);
7099 if (prec >= 0 && len > prec)
7100 len = prec;
7101 break;
7102
7103 case 'i':
7104 case 'd':
7105 case 'u':
7106 case 'o':
7107 case 'x':
7108 case 'X':
7109 if (c == 'i')
7110 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007111 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007112 temp = formatlong(v, flags, prec, c);
7113 if (!temp)
7114 goto onError;
7115 pbuf = PyUnicode_AS_UNICODE(temp);
7116 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007117 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007119 else {
7120 pbuf = formatbuf;
7121 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7122 flags, prec, c, v);
7123 if (len < 0)
7124 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007125 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007126 }
7127 if (flags & F_ZERO)
7128 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 break;
7130
7131 case 'e':
7132 case 'E':
7133 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007134 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 case 'g':
7136 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007137 if (c == 'F')
7138 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007139 pbuf = formatbuf;
7140 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7141 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 if (len < 0)
7143 goto onError;
7144 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007145 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 fill = '0';
7147 break;
7148
7149 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007150 pbuf = formatbuf;
7151 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 if (len < 0)
7153 goto onError;
7154 break;
7155
7156 default:
7157 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007158 "unsupported format character '%c' (0x%x) "
7159 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007160 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007161 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007162 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 goto onError;
7164 }
7165 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007166 if (*pbuf == '-' || *pbuf == '+') {
7167 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 len--;
7169 }
7170 else if (flags & F_SIGN)
7171 sign = '+';
7172 else if (flags & F_BLANK)
7173 sign = ' ';
7174 else
7175 sign = 0;
7176 }
7177 if (width < len)
7178 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007179 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180 reslen -= rescnt;
7181 rescnt = width + fmtcnt + 100;
7182 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007183 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007184 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007185 PyErr_NoMemory();
7186 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007187 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007188 if (_PyUnicode_Resize(&result, reslen) < 0) {
7189 Py_XDECREF(temp);
7190 goto onError;
7191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 res = PyUnicode_AS_UNICODE(result)
7193 + reslen - rescnt;
7194 }
7195 if (sign) {
7196 if (fill != ' ')
7197 *res++ = sign;
7198 rescnt--;
7199 if (width > len)
7200 width--;
7201 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007202 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7203 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007204 assert(pbuf[1] == c);
7205 if (fill != ' ') {
7206 *res++ = *pbuf++;
7207 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007208 }
Tim Petersfff53252001-04-12 18:38:48 +00007209 rescnt -= 2;
7210 width -= 2;
7211 if (width < 0)
7212 width = 0;
7213 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 if (width > len && !(flags & F_LJUST)) {
7216 do {
7217 --rescnt;
7218 *res++ = fill;
7219 } while (--width > len);
7220 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007221 if (fill == ' ') {
7222 if (sign)
7223 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007224 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007225 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007226 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007227 *res++ = *pbuf++;
7228 *res++ = *pbuf++;
7229 }
7230 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007231 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 res += len;
7233 rescnt -= len;
7234 while (--width >= len) {
7235 --rescnt;
7236 *res++ = ' ';
7237 }
7238 if (dict && (argidx < arglen) && c != '%') {
7239 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007240 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007241 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 goto onError;
7243 }
7244 Py_XDECREF(temp);
7245 } /* '%' */
7246 } /* until end */
7247 if (argidx < arglen && !dict) {
7248 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007249 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 goto onError;
7251 }
7252
Thomas Woutersa96affe2006-03-12 00:29:36 +00007253 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 if (args_owned) {
7256 Py_DECREF(args);
7257 }
7258 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 return (PyObject *)result;
7260
7261 onError:
7262 Py_XDECREF(result);
7263 Py_DECREF(uformat);
7264 if (args_owned) {
7265 Py_DECREF(args);
7266 }
7267 return NULL;
7268}
7269
7270static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007271 (readbufferproc) unicode_buffer_getreadbuf,
7272 (writebufferproc) unicode_buffer_getwritebuf,
7273 (segcountproc) unicode_buffer_getsegcount,
7274 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275};
7276
Jeremy Hylton938ace62002-07-17 16:30:39 +00007277static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007278unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7279
Tim Peters6d6c1a32001-08-02 04:15:00 +00007280static PyObject *
7281unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7282{
7283 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007284 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007285 char *encoding = NULL;
7286 char *errors = NULL;
7287
Guido van Rossume023fe02001-08-30 03:12:59 +00007288 if (type != &PyUnicode_Type)
7289 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007290 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7291 kwlist, &x, &encoding, &errors))
7292 return NULL;
7293 if (x == NULL)
7294 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007295 if (encoding == NULL && errors == NULL)
7296 return PyObject_Unicode(x);
7297 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007298 return PyUnicode_FromEncodedObject(x, encoding, errors);
7299}
7300
Guido van Rossume023fe02001-08-30 03:12:59 +00007301static PyObject *
7302unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7303{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007304 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007305 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007306
7307 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7308 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7309 if (tmp == NULL)
7310 return NULL;
7311 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007312 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007313 if (pnew == NULL) {
7314 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007315 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007316 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007317 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7318 if (pnew->str == NULL) {
7319 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007320 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007321 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007322 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007323 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007324 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7325 pnew->length = n;
7326 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007327 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007328 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007329}
7330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007331PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007332"unicode(string [, encoding[, errors]]) -> object\n\
7333\n\
7334Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007335encoding defaults to the current default string encoding.\n\
7336errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007337
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338PyTypeObject PyUnicode_Type = {
7339 PyObject_HEAD_INIT(&PyType_Type)
7340 0, /* ob_size */
7341 "unicode", /* tp_name */
7342 sizeof(PyUnicodeObject), /* tp_size */
7343 0, /* tp_itemsize */
7344 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007345 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007347 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 0, /* tp_setattr */
7349 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007350 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007351 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007353 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 (hashfunc) unicode_hash, /* tp_hash*/
7355 0, /* tp_call*/
7356 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007357 PyObject_GenericGetAttr, /* tp_getattro */
7358 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007360 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7361 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007362 unicode_doc, /* tp_doc */
7363 0, /* tp_traverse */
7364 0, /* tp_clear */
7365 0, /* tp_richcompare */
7366 0, /* tp_weaklistoffset */
7367 0, /* tp_iter */
7368 0, /* tp_iternext */
7369 unicode_methods, /* tp_methods */
7370 0, /* tp_members */
7371 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007372 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007373 0, /* tp_dict */
7374 0, /* tp_descr_get */
7375 0, /* tp_descr_set */
7376 0, /* tp_dictoffset */
7377 0, /* tp_init */
7378 0, /* tp_alloc */
7379 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007380 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381};
7382
7383/* Initialize the Unicode implementation */
7384
Thomas Wouters78890102000-07-22 19:25:51 +00007385void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007387 int i;
7388
Fred Drakee4315f52000-05-09 19:53:39 +00007389 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007390 unicode_freelist = NULL;
7391 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007393 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007394 for (i = 0; i < 256; i++)
7395 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007396 if (PyType_Ready(&PyUnicode_Type) < 0)
7397 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398}
7399
7400/* Finalize the Unicode implementation */
7401
7402void
Thomas Wouters78890102000-07-22 19:25:51 +00007403_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007405 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007406 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007408 Py_XDECREF(unicode_empty);
7409 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007410
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007411 for (i = 0; i < 256; i++) {
7412 if (unicode_latin1[i]) {
7413 Py_DECREF(unicode_latin1[i]);
7414 unicode_latin1[i] = NULL;
7415 }
7416 }
7417
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007418 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 PyUnicodeObject *v = u;
7420 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007421 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007422 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007423 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007424 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007426 unicode_freelist = NULL;
7427 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007429
Anthony Baxterac6bd462006-04-13 02:06:09 +00007430#ifdef __cplusplus
7431}
7432#endif
7433
7434
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007435/*
7436Local variables:
7437c-basic-offset: 4
7438indent-tabs-mode: nil
7439End:
7440*/