blob: 60b8cd91cd641aa4b3d7908e418e731b8b9f5607 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
Martin v. Löwis5cb69362006-04-14 09:08:42 +000039#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000040#include "Python.h"
41
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
51#define MAX_UNICODE_FREELIST_SIZE 1024
52
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Barry Warsaw51ac5802000-03-20 16:36:48 +000059 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Anthony Baxterac6bd462006-04-13 02:06:09 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Free list for Unicode objects.  Deallocated PyUnicodeObject structs
   are chained here for reuse; the link to the next entry is stored in
   the first pointer-sized word of each free object. */
static PyUnicodeObject *unicode_freelist;
/* Number of objects currently chained on unicode_freelist. */
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by the character's ordinal;
   a NULL slot means the object has not been created yet. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
111
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000112Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000113PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000114{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000115#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000116 return 0x10FFFF;
117#else
118 /* This is actually an illegal character, so it should
119 not be passed to unichr. */
120 return 0xFFFF;
121#endif
122}
123
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124/* --- Unicode Object ----------------------------------------------------- */
125
126static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000127int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000128 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129{
130 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000131
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000132 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000133 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000136 /* Resizing shared object (unicode_empty or single character
137 objects) in-place is not allowed. Use PyUnicode_Resize()
138 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000139 if (unicode == unicode_empty ||
140 (unicode->length == 1 &&
141 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000143 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000144 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000145 return -1;
146 }
147
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000153 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000154 PyErr_NoMemory();
155 return -1;
156 }
157 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000158 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 return 0;
169}
170
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
178
179static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000180PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181{
182 register PyUnicodeObject *unicode;
183
Tim Petersced69f82003-09-16 20:30:58 +0000184 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 if (length == 0 && unicode_empty != NULL) {
186 Py_INCREF(unicode_empty);
187 return unicode_empty;
188 }
189
190 /* Unicode freelist & memory allocation */
191 if (unicode_freelist) {
192 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000193 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000196 /* Keep-Alive optimization: we only upsize the buffer,
197 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000198 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000199 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000200 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000204 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000206 }
207 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 }
209 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000210 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 if (unicode == NULL)
212 return NULL;
213 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
214 }
215
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000216 if (!unicode->str) {
217 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000218 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000219 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000220 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000221 * the caller fails before initializing str -- unicode_resize()
222 * reads str[0], and the Keep-Alive optimization can keep memory
223 * allocated for str alive across a call to unicode_dealloc(unicode).
224 * We don't want unicode_resize to read uninitialized memory in
225 * that case.
226 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000227 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000229 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000231 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000233
234 onError:
235 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000236 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238}
239
240static
Guido van Rossum9475a232001-10-05 20:51:39 +0000241void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000243 if (PyUnicode_CheckExact(unicode) &&
244 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000245 /* Keep-Alive optimization */
246 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 unicode->str = NULL;
249 unicode->length = 0;
250 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000251 if (unicode->defenc) {
252 Py_DECREF(unicode->defenc);
253 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 }
255 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 *(PyUnicodeObject **)unicode = unicode_freelist;
257 unicode_freelist = unicode;
258 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 }
260 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000262 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000263 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 }
265}
266
Martin v. Löwis18e16552006-02-15 17:27:45 +0000267int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268{
269 register PyUnicodeObject *v;
270
271 /* Argument checks */
272 if (unicode == NULL) {
273 PyErr_BadInternalCall();
274 return -1;
275 }
276 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000277 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000278 PyErr_BadInternalCall();
279 return -1;
280 }
281
282 /* Resizing unicode_empty and single character objects is not
283 possible since these are being shared. We simply return a fresh
284 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000285 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000286 (v == unicode_empty || v->length == 1)) {
287 PyUnicodeObject *w = _PyUnicode_New(length);
288 if (w == NULL)
289 return -1;
290 Py_UNICODE_COPY(w->str, v->str,
291 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000292 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000293 *unicode = (PyObject *)w;
294 return 0;
295 }
296
297 /* Note that we don't have to modify *unicode for unshared Unicode
298 objects, since we can modify them in-place. */
299 return unicode_resize(v, length);
300}
301
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of the caller's object variable to PyObject **, so callers can pass
   e.g. the address of a PyUnicodeObject * variable directly. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000307 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308{
309 PyUnicodeObject *unicode;
310
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000311 /* If the Unicode data is known at construction time, we can apply
312 some optimizations which share commonly used objects. */
313 if (u != NULL) {
314
315 /* Optimization for empty strings */
316 if (size == 0 && unicode_empty != NULL) {
317 Py_INCREF(unicode_empty);
318 return (PyObject *)unicode_empty;
319 }
320
321 /* Single character Unicode objects in the Latin-1 range are
322 shared when using this constructor */
323 if (size == 1 && *u < 256) {
324 unicode = unicode_latin1[*u];
325 if (!unicode) {
326 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000327 if (!unicode)
328 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000329 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 unicode_latin1[*u] = unicode;
331 }
332 Py_INCREF(unicode);
333 return (PyObject *)unicode;
334 }
335 }
Tim Petersced69f82003-09-16 20:30:58 +0000336
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 unicode = _PyUnicode_New(size);
338 if (!unicode)
339 return NULL;
340
341 /* Copy the Unicode data into the new object */
342 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000343 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
345 return (PyObject *)unicode;
346}
347
348#ifdef HAVE_WCHAR_H
349
350PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000351 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352{
353 PyUnicodeObject *unicode;
354
355 if (w == NULL) {
356 PyErr_BadInternalCall();
357 return NULL;
358 }
359
360 unicode = _PyUnicode_New(size);
361 if (!unicode)
362 return NULL;
363
364 /* Copy the wchar_t data into the new object */
365#ifdef HAVE_USABLE_WCHAR_T
366 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000367#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368 {
369 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000370 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000372 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 *u++ = *w++;
374 }
375#endif
376
377 return (PyObject *)unicode;
378}
379
Martin v. Löwis18e16552006-02-15 17:27:45 +0000380Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
381 wchar_t *w,
382 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383{
384 if (unicode == NULL) {
385 PyErr_BadInternalCall();
386 return -1;
387 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000388
389 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000391 size = PyUnicode_GET_SIZE(unicode) + 1;
392
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393#ifdef HAVE_USABLE_WCHAR_T
394 memcpy(w, unicode->str, size * sizeof(wchar_t));
395#else
396 {
397 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000398 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000400 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 *w++ = *u++;
402 }
403#endif
404
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000405 if (size > PyUnicode_GET_SIZE(unicode))
406 return PyUnicode_GET_SIZE(unicode);
407 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 return size;
409}
410
411#endif
412
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000413PyObject *PyUnicode_FromOrdinal(int ordinal)
414{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000415 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000416
417#ifdef Py_UNICODE_WIDE
418 if (ordinal < 0 || ordinal > 0x10ffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x110000) "
421 "(wide Python build)");
422 return NULL;
423 }
424#else
425 if (ordinal < 0 || ordinal > 0xffff) {
426 PyErr_SetString(PyExc_ValueError,
427 "unichr() arg not in range(0x10000) "
428 "(narrow Python build)");
429 return NULL;
430 }
431#endif
432
Hye-Shik Chang40574832004-04-06 07:24:51 +0000433 s[0] = (Py_UNICODE)ordinal;
434 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000435}
436
Guido van Rossumd57fd912000-03-10 22:53:23 +0000437PyObject *PyUnicode_FromObject(register PyObject *obj)
438{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000439 /* XXX Perhaps we should make this API an alias of
440 PyObject_Unicode() instead ?! */
441 if (PyUnicode_CheckExact(obj)) {
442 Py_INCREF(obj);
443 return obj;
444 }
445 if (PyUnicode_Check(obj)) {
446 /* For a Unicode subtype that's not a Unicode object,
447 return a true Unicode object with the same data. */
448 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
449 PyUnicode_GET_SIZE(obj));
450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000451 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
452}
453
454PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
455 const char *encoding,
456 const char *errors)
457{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000458 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000459 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000460 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462 if (obj == NULL) {
463 PyErr_BadInternalCall();
464 return NULL;
465 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000467#if 0
468 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000469 that no encodings is given and then redirect to
470 PyObject_Unicode() which then applies the additional logic for
471 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000472
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000473 NOTE: This API should really only be used for object which
474 represent *encoded* Unicode !
475
476 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 if (PyUnicode_Check(obj)) {
478 if (encoding) {
479 PyErr_SetString(PyExc_TypeError,
480 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000482 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000483 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000484 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000485#else
486 if (PyUnicode_Check(obj)) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
489 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491#endif
492
493 /* Coerce object */
494 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 s = PyString_AS_STRING(obj);
496 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000497 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000498 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
499 /* Overwrite the error message with something more useful in
500 case of a TypeError. */
501 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000502 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000503 "coercing to Unicode: need string or buffer, "
504 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 obj->ob_type->tp_name);
506 goto onError;
507 }
Tim Petersced69f82003-09-16 20:30:58 +0000508
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000509 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 if (len == 0) {
511 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513 }
Tim Petersced69f82003-09-16 20:30:58 +0000514 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000515 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 return v;
518
519 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521}
522
523PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000524 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000525 const char *encoding,
526 const char *errors)
527{
528 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000529
530 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000531 encoding = PyUnicode_GetDefaultEncoding();
532
533 /* Shortcuts for common default encodings */
534 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000536 else if (strcmp(encoding, "latin-1") == 0)
537 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000538#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
539 else if (strcmp(encoding, "mbcs") == 0)
540 return PyUnicode_DecodeMBCS(s, size, errors);
541#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000542 else if (strcmp(encoding, "ascii") == 0)
543 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544
545 /* Decode via the codec registry */
546 buffer = PyBuffer_FromMemory((void *)s, size);
547 if (buffer == NULL)
548 goto onError;
549 unicode = PyCodec_Decode(buffer, encoding, errors);
550 if (unicode == NULL)
551 goto onError;
552 if (!PyUnicode_Check(unicode)) {
553 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000554 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 unicode->ob_type->tp_name);
556 Py_DECREF(unicode);
557 goto onError;
558 }
559 Py_DECREF(buffer);
560 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 onError:
563 Py_XDECREF(buffer);
564 return NULL;
565}
566
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000567PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
568 const char *encoding,
569 const char *errors)
570{
571 PyObject *v;
572
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
576 }
577
578 if (encoding == NULL)
579 encoding = PyUnicode_GetDefaultEncoding();
580
581 /* Decode via the codec registry */
582 v = PyCodec_Decode(unicode, encoding, errors);
583 if (v == NULL)
584 goto onError;
585 return v;
586
587 onError:
588 return NULL;
589}
590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000592 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000593 const char *encoding,
594 const char *errors)
595{
596 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000597
Guido van Rossumd57fd912000-03-10 22:53:23 +0000598 unicode = PyUnicode_FromUnicode(s, size);
599 if (unicode == NULL)
600 return NULL;
601 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
602 Py_DECREF(unicode);
603 return v;
604}
605
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000606PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
607 const char *encoding,
608 const char *errors)
609{
610 PyObject *v;
611
612 if (!PyUnicode_Check(unicode)) {
613 PyErr_BadArgument();
614 goto onError;
615 }
616
617 if (encoding == NULL)
618 encoding = PyUnicode_GetDefaultEncoding();
619
620 /* Encode via the codec registry */
621 v = PyCodec_Encode(unicode, encoding, errors);
622 if (v == NULL)
623 goto onError;
624 return v;
625
626 onError:
627 return NULL;
628}
629
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
631 const char *encoding,
632 const char *errors)
633{
634 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000635
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636 if (!PyUnicode_Check(unicode)) {
637 PyErr_BadArgument();
638 goto onError;
639 }
Fred Drakee4315f52000-05-09 19:53:39 +0000640
Tim Petersced69f82003-09-16 20:30:58 +0000641 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000642 encoding = PyUnicode_GetDefaultEncoding();
643
644 /* Shortcuts for common default encodings */
645 if (errors == NULL) {
646 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000647 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000648 else if (strcmp(encoding, "latin-1") == 0)
649 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000650#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
651 else if (strcmp(encoding, "mbcs") == 0)
652 return PyUnicode_AsMBCSString(unicode);
653#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000654 else if (strcmp(encoding, "ascii") == 0)
655 return PyUnicode_AsASCIIString(unicode);
656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657
658 /* Encode via the codec registry */
659 v = PyCodec_Encode(unicode, encoding, errors);
660 if (v == NULL)
661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 if (!PyString_Check(v)) {
663 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000664 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 v->ob_type->tp_name);
666 Py_DECREF(v);
667 goto onError;
668 }
669 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000670
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 onError:
672 return NULL;
673}
674
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000675PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
676 const char *errors)
677{
678 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
679
680 if (v)
681 return v;
682 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
683 if (v && errors == NULL)
684 ((PyUnicodeObject *)unicode)->defenc = v;
685 return v;
686}
687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
689{
690 if (!PyUnicode_Check(unicode)) {
691 PyErr_BadArgument();
692 goto onError;
693 }
694 return PyUnicode_AS_UNICODE(unicode);
695
696 onError:
697 return NULL;
698}
699
Martin v. Löwis18e16552006-02-15 17:27:45 +0000700Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701{
702 if (!PyUnicode_Check(unicode)) {
703 PyErr_BadArgument();
704 goto onError;
705 }
706 return PyUnicode_GET_SIZE(unicode);
707
708 onError:
709 return -1;
710}
711
Thomas Wouters78890102000-07-22 19:25:51 +0000712const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000713{
714 return unicode_default_encoding;
715}
716
717int PyUnicode_SetDefaultEncoding(const char *encoding)
718{
719 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000720
Fred Drakee4315f52000-05-09 19:53:39 +0000721 /* Make sure the encoding is valid. As side effect, this also
722 loads the encoding into the codec registry cache. */
723 v = _PyCodec_Lookup(encoding);
724 if (v == NULL)
725 goto onError;
726 Py_DECREF(v);
727 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000728 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000729 sizeof(unicode_default_encoding));
730 return 0;
731
732 onError:
733 return -1;
734}
735
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000736/* error handling callback helper:
737 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000738 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000739 and adjust various state variables.
740 return 0 on success, -1 on error
741*/
742
743static
744int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
745 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000746 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
747 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000748{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000749 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750
751 PyObject *restuple = NULL;
752 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000753 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
754 Py_ssize_t requiredsize;
755 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000756 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000757 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000758 int res = -1;
759
760 if (*errorHandler == NULL) {
761 *errorHandler = PyCodec_LookupError(errors);
762 if (*errorHandler == NULL)
763 goto onError;
764 }
765
766 if (*exceptionObject == NULL) {
767 *exceptionObject = PyUnicodeDecodeError_Create(
768 encoding, input, insize, *startinpos, *endinpos, reason);
769 if (*exceptionObject == NULL)
770 goto onError;
771 }
772 else {
773 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
774 goto onError;
775 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
776 goto onError;
777 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
778 goto onError;
779 }
780
781 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
782 if (restuple == NULL)
783 goto onError;
784 if (!PyTuple_Check(restuple)) {
785 PyErr_Format(PyExc_TypeError, &argparse[4]);
786 goto onError;
787 }
788 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
789 goto onError;
790 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000791 newpos = insize+newpos;
792 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000793 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000794 goto onError;
795 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000796
797 /* need more space? (at least enough for what we
798 have+the replacement+the rest of the string (starting
799 at the new input position), so we won't have to check space
800 when there are no errors in the rest of the string) */
801 repptr = PyUnicode_AS_UNICODE(repunicode);
802 repsize = PyUnicode_GET_SIZE(repunicode);
803 requiredsize = *outpos + repsize + insize-newpos;
804 if (requiredsize > outsize) {
805 if (requiredsize<2*outsize)
806 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000807 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 goto onError;
809 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
810 }
811 *endinpos = newpos;
812 *inptr = input + newpos;
813 Py_UNICODE_COPY(*outptr, repptr, repsize);
814 *outptr += repsize;
815 *outpos += repsize;
816 /* we made it! */
817 res = 0;
818
819 onError:
820 Py_XDECREF(restuple);
821 return res;
822}
823
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000824/* --- UTF-7 Codec -------------------------------------------------------- */
825
826/* see RFC2152 for details */
827
/* Per-character classification for the UTF-7 codec, indexed by the
   7-bit ASCII code.  Each entry says whether the character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f (0x09, 0x0a and 0x0d are whitespace) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f:  space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1
};
846
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must be base64-encoded: it is outside the ASCII
   range, is marked special (1) in utf7_special, or belongs to the
   optional whitespace (2) / Set O (3) classes and the corresponding
   flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  Explicit ASCII
   range checks are used instead of isalnum(): c may be a Py_UNICODE
   value outside the range isalnum() is defined for, and isalnum() is
   locale dependent. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Map a base64 alphabet character back to its 6-bit value.  The result
   is only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to *out for as long as at least six bits are
   pending in ch; `bits` is reduced by six per character written.
   Wrapped in do/while(0) so that it behaves like a single statement. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit code units to *out for as long as at least sixteen bits
   are pending in ch.  Relies on the enclosing function providing an
   `errmsg` variable and a `utf7Error` label.
   NOTE(review): the range tested below (0xDC00..0xDFFF) is the low
   surrogate range although the comments talk about the high surrogate;
   behavior is kept as is - confirm the intent. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000890PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000891 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 const char *errors)
893{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000894 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000895 Py_ssize_t startinpos;
896 Py_ssize_t endinpos;
897 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 const char *e;
899 PyUnicodeObject *unicode;
900 Py_UNICODE *p;
901 const char *errmsg = "";
902 int inShift = 0;
903 unsigned int bitsleft = 0;
904 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000905 int surrogate = 0;
906 PyObject *errorHandler = NULL;
907 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908
909 unicode = _PyUnicode_New(size);
910 if (!unicode)
911 return NULL;
912 if (size == 0)
913 return (PyObject *)unicode;
914
915 p = unicode->str;
916 e = s + size;
917
918 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 Py_UNICODE ch;
920 restart:
921 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000922
923 if (inShift) {
924 if ((ch == '-') || !B64CHAR(ch)) {
925 inShift = 0;
926 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000927
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000928 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
929 if (bitsleft >= 6) {
930 /* The shift sequence has a partial character in it. If
931 bitsleft < 6 then we could just classify it as padding
932 but that is not the case here */
933
934 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000935 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000936 }
937 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000938 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000939 here so indicate the potential of a misencoded character. */
940
941 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
942 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
943 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000944 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 }
946
947 if (ch == '-') {
948 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000949 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 inShift = 1;
951 }
952 } else if (SPECIAL(ch,0,0)) {
953 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000954 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 } else {
956 *p++ = ch;
957 }
958 } else {
959 charsleft = (charsleft << 6) | UB64(ch);
960 bitsleft += 6;
961 s++;
962 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
963 }
964 }
965 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000967 s++;
968 if (s < e && *s == '-') {
969 s++;
970 *p++ = '+';
971 } else
972 {
973 inShift = 1;
974 bitsleft = 0;
975 }
976 }
977 else if (SPECIAL(ch,0,0)) {
978 errmsg = "unexpected special character";
979 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000980 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 }
982 else {
983 *p++ = ch;
984 s++;
985 }
986 continue;
987 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000988 outpos = p-PyUnicode_AS_UNICODE(unicode);
989 endinpos = s-starts;
990 if (unicode_decode_call_errorhandler(
991 errors, &errorHandler,
992 "utf7", errmsg,
993 starts, size, &startinpos, &endinpos, &exc, &s,
994 (PyObject **)&unicode, &outpos, &p))
995 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 }
997
998 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000999 outpos = p-PyUnicode_AS_UNICODE(unicode);
1000 endinpos = size;
1001 if (unicode_decode_call_errorhandler(
1002 errors, &errorHandler,
1003 "utf7", "unterminated shift sequence",
1004 starts, size, &startinpos, &endinpos, &exc, &s,
1005 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001007 if (s < e)
1008 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001009 }
1010
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001011 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 goto onError;
1013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001014 Py_XDECREF(errorHandler);
1015 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001016 return (PyObject *)unicode;
1017
1018onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001019 Py_XDECREF(errorHandler);
1020 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021 Py_DECREF(unicode);
1022 return NULL;
1023}
1024
1025
1026PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001027 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001028 int encodeSetO,
1029 int encodeWhiteSpace,
1030 const char *errors)
1031{
1032 PyObject *v;
1033 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001034 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001036 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001037 unsigned int bitsleft = 0;
1038 unsigned long charsleft = 0;
1039 char * out;
1040 char * start;
1041
1042 if (size == 0)
1043 return PyString_FromStringAndSize(NULL, 0);
1044
1045 v = PyString_FromStringAndSize(NULL, cbAllocated);
1046 if (v == NULL)
1047 return NULL;
1048
1049 start = out = PyString_AS_STRING(v);
1050 for (;i < size; ++i) {
1051 Py_UNICODE ch = s[i];
1052
1053 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001054 if (ch == '+') {
1055 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001056 *out++ = '-';
1057 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1058 charsleft = ch;
1059 bitsleft = 16;
1060 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001061 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001063 } else {
1064 *out++ = (char) ch;
1065 }
1066 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001067 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1068 *out++ = B64(charsleft << (6-bitsleft));
1069 charsleft = 0;
1070 bitsleft = 0;
1071 /* Characters not in the BASE64 set implicitly unshift the sequence
1072 so no '-' is required, except if the character is itself a '-' */
1073 if (B64CHAR(ch) || ch == '-') {
1074 *out++ = '-';
1075 }
1076 inShift = 0;
1077 *out++ = (char) ch;
1078 } else {
1079 bitsleft += 16;
1080 charsleft = (charsleft << 16) | ch;
1081 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1082
1083 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001084 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 or '-' then the shift sequence will be terminated implicitly and we
1086 don't have to insert a '-'. */
1087
1088 if (bitsleft == 0) {
1089 if (i + 1 < size) {
1090 Py_UNICODE ch2 = s[i+1];
1091
1092 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001093
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 } else if (B64CHAR(ch2) || ch2 == '-') {
1095 *out++ = '-';
1096 inShift = 0;
1097 } else {
1098 inShift = 0;
1099 }
1100
1101 }
1102 else {
1103 *out++ = '-';
1104 inShift = 0;
1105 }
1106 }
Tim Petersced69f82003-09-16 20:30:58 +00001107 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001109 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001110 if (bitsleft) {
1111 *out++= B64(charsleft << (6-bitsleft) );
1112 *out++ = '-';
1113 }
1114
Tim Peters5de98422002-04-27 18:44:32 +00001115 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001116 return v;
1117}
1118
1119#undef SPECIAL
1120#undef B64
1121#undef B64CHAR
1122#undef UB64
1123#undef ENCODE
1124#undef DECODE
1125
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126/* --- UTF-8 Codec -------------------------------------------------------- */
1127
/* Sequence length implied by the first byte of a UTF-8 sequence,
   indexed by that byte.  A zero entry marks a byte that is not a legal
   prefix byte.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1149
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001151 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152 const char *errors)
1153{
Walter Dörwald69652032004-09-07 20:24:22 +00001154 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1155}
1156
1157PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001158 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001159 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001160 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001161{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001162 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001164 Py_ssize_t startinpos;
1165 Py_ssize_t endinpos;
1166 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 const char *e;
1168 PyUnicodeObject *unicode;
1169 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001170 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001171 PyObject *errorHandler = NULL;
1172 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173
1174 /* Note: size will always be longer than the resulting Unicode
1175 character count */
1176 unicode = _PyUnicode_New(size);
1177 if (!unicode)
1178 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001179 if (size == 0) {
1180 if (consumed)
1181 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184
1185 /* Unpack UTF-8 encoded data */
1186 p = unicode->str;
1187 e = s + size;
1188
1189 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001190 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191
1192 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194 s++;
1195 continue;
1196 }
1197
1198 n = utf8_code_length[ch];
1199
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001201 if (consumed)
1202 break;
1203 else {
1204 errmsg = "unexpected end of data";
1205 startinpos = s-starts;
1206 endinpos = size;
1207 goto utf8Error;
1208 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210
1211 switch (n) {
1212
1213 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001215 startinpos = s-starts;
1216 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218
1219 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001220 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001221 startinpos = s-starts;
1222 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001223 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224
1225 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001226 if ((s[1] & 0xc0) != 0x80) {
1227 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 startinpos = s-starts;
1229 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 goto utf8Error;
1231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001234 startinpos = s-starts;
1235 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001236 errmsg = "illegal encoding";
1237 goto utf8Error;
1238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001240 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241 break;
1242
1243 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001244 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001245 (s[2] & 0xc0) != 0x80) {
1246 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001247 startinpos = s-starts;
1248 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001249 goto utf8Error;
1250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001252 if (ch < 0x0800) {
1253 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001254 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001255
1256 XXX For wide builds (UCS-4) we should probably try
1257 to recombine the surrogates into a single code
1258 unit.
1259 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001260 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261 startinpos = s-starts;
1262 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 goto utf8Error;
1264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001266 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001267 break;
1268
1269 case 4:
1270 if ((s[1] & 0xc0) != 0x80 ||
1271 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 (s[3] & 0xc0) != 0x80) {
1273 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 startinpos = s-starts;
1275 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001276 goto utf8Error;
1277 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001278 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1279 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1280 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001281 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001282 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001283 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001284 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001285 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 startinpos = s-starts;
1288 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001289 goto utf8Error;
1290 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001291#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001292 *p++ = (Py_UNICODE)ch;
1293#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001294 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001295
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001296 /* translate from 10000..10FFFF to 0..FFFF */
1297 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001298
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001299 /* high surrogate = top 10 bits added to D800 */
1300 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001301
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001303 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001304#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 break;
1306
1307 default:
1308 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001309 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001310 startinpos = s-starts;
1311 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 }
1314 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001316
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 outpos = p-PyUnicode_AS_UNICODE(unicode);
1319 if (unicode_decode_call_errorhandler(
1320 errors, &errorHandler,
1321 "utf8", errmsg,
1322 starts, size, &startinpos, &endinpos, &exc, &s,
1323 (PyObject **)&unicode, &outpos, &p))
1324 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325 }
Walter Dörwald69652032004-09-07 20:24:22 +00001326 if (consumed)
1327 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328
1329 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001330 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 goto onError;
1332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333 Py_XDECREF(errorHandler);
1334 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335 return (PyObject *)unicode;
1336
1337onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338 Py_XDECREF(errorHandler);
1339 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 Py_DECREF(unicode);
1341 return NULL;
1342}
1343
Tim Peters602f7402002-04-27 18:03:26 +00001344/* Allocation strategy: if the string is short, convert into a stack buffer
1345 and allocate exactly as much space needed at the end. Else allocate the
1346 maximum possible needed (4 result bytes per Unicode character), and return
1347 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001348*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001349PyObject *
1350PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001351 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001352 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353{
Tim Peters602f7402002-04-27 18:03:26 +00001354#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001355
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001357 PyObject *v; /* result string object */
1358 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001359 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001360 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001361 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 assert(s != NULL);
1364 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
Tim Peters602f7402002-04-27 18:03:26 +00001366 if (size <= MAX_SHORT_UNICHARS) {
1367 /* Write into the stack buffer; nallocated can't overflow.
1368 * At the end, we'll allocate exactly as much heap space as it
1369 * turns out we need.
1370 */
1371 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1372 v = NULL; /* will allocate after we're done */
1373 p = stackbuf;
1374 }
1375 else {
1376 /* Overallocate on the heap, and give the excess back at the end. */
1377 nallocated = size * 4;
1378 if (nallocated / 4 != size) /* overflow! */
1379 return PyErr_NoMemory();
1380 v = PyString_FromStringAndSize(NULL, nallocated);
1381 if (v == NULL)
1382 return NULL;
1383 p = PyString_AS_STRING(v);
1384 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001385
Tim Peters602f7402002-04-27 18:03:26 +00001386 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001387 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001388
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001389 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001390 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001392
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001394 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001395 *p++ = (char)(0xc0 | (ch >> 6));
1396 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001397 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001398 else {
Tim Peters602f7402002-04-27 18:03:26 +00001399 /* Encode UCS2 Unicode ordinals */
1400 if (ch < 0x10000) {
1401 /* Special case: check for high surrogate */
1402 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1403 Py_UCS4 ch2 = s[i];
1404 /* Check for low surrogate and combine the two to
1405 form a UCS4 value */
1406 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001407 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001408 i++;
1409 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001410 }
Tim Peters602f7402002-04-27 18:03:26 +00001411 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001413 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001414 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1415 *p++ = (char)(0x80 | (ch & 0x3f));
1416 continue;
1417 }
1418encodeUCS4:
1419 /* Encode UCS4 Unicode ordinals */
1420 *p++ = (char)(0xf0 | (ch >> 18));
1421 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1422 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1423 *p++ = (char)(0x80 | (ch & 0x3f));
1424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001426
Tim Peters602f7402002-04-27 18:03:26 +00001427 if (v == NULL) {
1428 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001429 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001430 assert(nneeded <= nallocated);
1431 v = PyString_FromStringAndSize(stackbuf, nneeded);
1432 }
1433 else {
1434 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001435 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001436 assert(nneeded <= nallocated);
1437 _PyString_Resize(&v, nneeded);
1438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001440
Tim Peters602f7402002-04-27 18:03:26 +00001441#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442}
1443
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1445{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 if (!PyUnicode_Check(unicode)) {
1447 PyErr_BadArgument();
1448 return NULL;
1449 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001450 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1451 PyUnicode_GET_SIZE(unicode),
1452 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453}
1454
1455/* --- UTF-16 Codec ------------------------------------------------------- */
1456
Tim Peters772747b2001-08-09 22:21:55 +00001457PyObject *
1458PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001459 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001460 const char *errors,
1461 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462{
Walter Dörwald69652032004-09-07 20:24:22 +00001463 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1464}
1465
1466PyObject *
1467PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001468 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001469 const char *errors,
1470 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001471 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001472{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001473 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001474 Py_ssize_t startinpos;
1475 Py_ssize_t endinpos;
1476 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 PyUnicodeObject *unicode;
1478 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001479 const unsigned char *q, *e;
1480 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001481 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001482 /* Offsets from q for retrieving byte pairs in the right order. */
1483#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1484 int ihi = 1, ilo = 0;
1485#else
1486 int ihi = 0, ilo = 1;
1487#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001488 PyObject *errorHandler = NULL;
1489 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490
1491 /* Note: size will always be longer than the resulting Unicode
1492 character count */
1493 unicode = _PyUnicode_New(size);
1494 if (!unicode)
1495 return NULL;
1496 if (size == 0)
1497 return (PyObject *)unicode;
1498
1499 /* Unpack UTF-16 encoded data */
1500 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001501 q = (unsigned char *)s;
1502 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503
1504 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001505 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001507 /* Check for BOM marks (U+FEFF) in the input and adjust current
1508 byte order setting accordingly. In native mode, the leading BOM
1509 mark is skipped, in all other modes, it is copied to the output
1510 stream as-is (giving a ZWNBSP character). */
1511 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001512 if (size >= 2) {
1513 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001514#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001515 if (bom == 0xFEFF) {
1516 q += 2;
1517 bo = -1;
1518 }
1519 else if (bom == 0xFFFE) {
1520 q += 2;
1521 bo = 1;
1522 }
Tim Petersced69f82003-09-16 20:30:58 +00001523#else
Walter Dörwald69652032004-09-07 20:24:22 +00001524 if (bom == 0xFEFF) {
1525 q += 2;
1526 bo = 1;
1527 }
1528 else if (bom == 0xFFFE) {
1529 q += 2;
1530 bo = -1;
1531 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001532#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001533 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535
Tim Peters772747b2001-08-09 22:21:55 +00001536 if (bo == -1) {
1537 /* force LE */
1538 ihi = 1;
1539 ilo = 0;
1540 }
1541 else if (bo == 1) {
1542 /* force BE */
1543 ihi = 0;
1544 ilo = 1;
1545 }
1546
1547 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001548 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001549 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001551 if (consumed)
1552 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 errmsg = "truncated data";
1554 startinpos = ((const char *)q)-starts;
1555 endinpos = ((const char *)e)-starts;
1556 goto utf16Error;
1557 /* The remaining input chars are ignored if the callback
1558 chooses to skip the input */
1559 }
1560 ch = (q[ihi] << 8) | q[ilo];
1561
Tim Peters772747b2001-08-09 22:21:55 +00001562 q += 2;
1563
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564 if (ch < 0xD800 || ch > 0xDFFF) {
1565 *p++ = ch;
1566 continue;
1567 }
1568
1569 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001570 if (q >= e) {
1571 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 startinpos = (((const char *)q)-2)-starts;
1573 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001574 goto utf16Error;
1575 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001576 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001577 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1578 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001579 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001580#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001581 *p++ = ch;
1582 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001583#else
1584 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001585#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001586 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001587 }
1588 else {
1589 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 startinpos = (((const char *)q)-4)-starts;
1591 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001592 goto utf16Error;
1593 }
1594
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001596 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001597 startinpos = (((const char *)q)-2)-starts;
1598 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001599 /* Fall through to report the error */
1600
1601 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 outpos = p-PyUnicode_AS_UNICODE(unicode);
1603 if (unicode_decode_call_errorhandler(
1604 errors, &errorHandler,
1605 "utf16", errmsg,
1606 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1607 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 }
1610
1611 if (byteorder)
1612 *byteorder = bo;
1613
Walter Dörwald69652032004-09-07 20:24:22 +00001614 if (consumed)
1615 *consumed = (const char *)q-starts;
1616
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001618 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619 goto onError;
1620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621 Py_XDECREF(errorHandler);
1622 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623 return (PyObject *)unicode;
1624
1625onError:
1626 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 Py_XDECREF(errorHandler);
1628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 return NULL;
1630}
1631
Tim Peters772747b2001-08-09 22:21:55 +00001632PyObject *
1633PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001634 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001635 const char *errors,
1636 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637{
1638 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001639 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001640#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001641 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001642#else
1643 const int pairs = 0;
1644#endif
Tim Peters772747b2001-08-09 22:21:55 +00001645 /* Offsets from p for storing byte pairs in the right order. */
1646#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1647 int ihi = 1, ilo = 0;
1648#else
1649 int ihi = 0, ilo = 1;
1650#endif
1651
1652#define STORECHAR(CH) \
1653 do { \
1654 p[ihi] = ((CH) >> 8) & 0xff; \
1655 p[ilo] = (CH) & 0xff; \
1656 p += 2; \
1657 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001659#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001660 for (i = pairs = 0; i < size; i++)
1661 if (s[i] >= 0x10000)
1662 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001663#endif
Tim Petersced69f82003-09-16 20:30:58 +00001664 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001665 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 if (v == NULL)
1667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001668
Tim Peters772747b2001-08-09 22:21:55 +00001669 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001671 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001672 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001673 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001674
1675 if (byteorder == -1) {
1676 /* force LE */
1677 ihi = 1;
1678 ilo = 0;
1679 }
1680 else if (byteorder == 1) {
1681 /* force BE */
1682 ihi = 0;
1683 ilo = 1;
1684 }
1685
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001686 while (size-- > 0) {
1687 Py_UNICODE ch = *s++;
1688 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001689#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001690 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001691 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1692 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#endif
Tim Peters772747b2001-08-09 22:21:55 +00001695 STORECHAR(ch);
1696 if (ch2)
1697 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001700#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701}
1702
1703PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1704{
1705 if (!PyUnicode_Check(unicode)) {
1706 PyErr_BadArgument();
1707 return NULL;
1708 }
1709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1710 PyUnicode_GET_SIZE(unicode),
1711 NULL,
1712 0);
1713}
1714
1715/* --- Unicode Escape Codec ----------------------------------------------- */
1716
Fredrik Lundh06d12682001-01-24 07:59:11 +00001717static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001718
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001720 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721 const char *errors)
1722{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001724 Py_ssize_t startinpos;
1725 Py_ssize_t endinpos;
1726 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001727 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001731 char* message;
1732 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733 PyObject *errorHandler = NULL;
1734 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001735
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 /* Escaped strings will always be longer than the resulting
1737 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 length after conversion to the true value.
1739 (but if the error callback returns a long replacement string
1740 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 v = _PyUnicode_New(size);
1742 if (v == NULL)
1743 goto onError;
1744 if (size == 0)
1745 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001746
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001749
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 while (s < end) {
1751 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001752 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 /* Non-escape characters are interpreted as Unicode ordinals */
1756 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 continue;
1759 }
1760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762 /* \ - Escapes */
1763 s++;
1764 switch (*s++) {
1765
1766 /* \x escapes */
1767 case '\n': break;
1768 case '\\': *p++ = '\\'; break;
1769 case '\'': *p++ = '\''; break;
1770 case '\"': *p++ = '\"'; break;
1771 case 'b': *p++ = '\b'; break;
1772 case 'f': *p++ = '\014'; break; /* FF */
1773 case 't': *p++ = '\t'; break;
1774 case 'n': *p++ = '\n'; break;
1775 case 'r': *p++ = '\r'; break;
1776 case 'v': *p++ = '\013'; break; /* VT */
1777 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1778
1779 /* \OOO (octal) escapes */
1780 case '0': case '1': case '2': case '3':
1781 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001782 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001784 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001786 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001788 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 break;
1790
Fredrik Lundhccc74732001-02-18 22:13:49 +00001791 /* hex escapes */
1792 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001794 digits = 2;
1795 message = "truncated \\xXX escape";
1796 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001800 digits = 4;
1801 message = "truncated \\uXXXX escape";
1802 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
Fredrik Lundhccc74732001-02-18 22:13:49 +00001804 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001805 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806 digits = 8;
1807 message = "truncated \\UXXXXXXXX escape";
1808 hexescape:
1809 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 outpos = p-PyUnicode_AS_UNICODE(v);
1811 if (s+digits>end) {
1812 endinpos = size;
1813 if (unicode_decode_call_errorhandler(
1814 errors, &errorHandler,
1815 "unicodeescape", "end of string in escape sequence",
1816 starts, size, &startinpos, &endinpos, &exc, &s,
1817 (PyObject **)&v, &outpos, &p))
1818 goto onError;
1819 goto nextByte;
1820 }
1821 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001823 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 endinpos = (s+i+1)-starts;
1825 if (unicode_decode_call_errorhandler(
1826 errors, &errorHandler,
1827 "unicodeescape", message,
1828 starts, size, &startinpos, &endinpos, &exc, &s,
1829 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001830 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001832 }
1833 chr = (chr<<4) & ~0xF;
1834 if (c >= '0' && c <= '9')
1835 chr += c - '0';
1836 else if (c >= 'a' && c <= 'f')
1837 chr += 10 + c - 'a';
1838 else
1839 chr += 10 + c - 'A';
1840 }
1841 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001842 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 /* _decoding_error will have already written into the
1844 target buffer. */
1845 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001847 /* when we get here, chr is a 32-bit unicode character */
1848 if (chr <= 0xffff)
1849 /* UCS-2 character */
1850 *p++ = (Py_UNICODE) chr;
1851 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001852 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001853 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001854#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001855 *p++ = chr;
1856#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 chr -= 0x10000L;
1858 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001859 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001860#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001861 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 endinpos = s-starts;
1863 outpos = p-PyUnicode_AS_UNICODE(v);
1864 if (unicode_decode_call_errorhandler(
1865 errors, &errorHandler,
1866 "unicodeescape", "illegal Unicode character",
1867 starts, size, &startinpos, &endinpos, &exc, &s,
1868 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001869 goto onError;
1870 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001871 break;
1872
1873 /* \N{name} */
1874 case 'N':
1875 message = "malformed \\N character escape";
1876 if (ucnhash_CAPI == NULL) {
1877 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001878 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 m = PyImport_ImportModule("unicodedata");
1880 if (m == NULL)
1881 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001882 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001883 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001884 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001885 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001887 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001888 if (ucnhash_CAPI == NULL)
1889 goto ucnhashError;
1890 }
1891 if (*s == '{') {
1892 const char *start = s+1;
1893 /* look for the closing brace */
1894 while (*s != '}' && s < end)
1895 s++;
1896 if (s > start && s < end && *s == '}') {
1897 /* found a name. look it up in the unicode database */
1898 message = "unknown Unicode character name";
1899 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001900 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001901 goto store;
1902 }
1903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 endinpos = s-starts;
1905 outpos = p-PyUnicode_AS_UNICODE(v);
1906 if (unicode_decode_call_errorhandler(
1907 errors, &errorHandler,
1908 "unicodeescape", message,
1909 starts, size, &startinpos, &endinpos, &exc, &s,
1910 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001912 break;
1913
1914 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001915 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001916 message = "\\ at end of string";
1917 s--;
1918 endinpos = s-starts;
1919 outpos = p-PyUnicode_AS_UNICODE(v);
1920 if (unicode_decode_call_errorhandler(
1921 errors, &errorHandler,
1922 "unicodeescape", message,
1923 starts, size, &startinpos, &endinpos, &exc, &s,
1924 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001925 goto onError;
1926 }
1927 else {
1928 *p++ = '\\';
1929 *p++ = (unsigned char)s[-1];
1930 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001933 nextByte:
1934 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001936 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001937 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001938 Py_XDECREF(errorHandler);
1939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001941
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001943 PyErr_SetString(
1944 PyExc_UnicodeError,
1945 "\\N escapes not supported (can't load unicodedata module)"
1946 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001947 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 Py_XDECREF(errorHandler);
1949 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001950 return NULL;
1951
Fredrik Lundhccc74732001-02-18 22:13:49 +00001952onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 Py_XDECREF(errorHandler);
1955 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 return NULL;
1957}
1958
1959/* Return a Unicode-Escape string version of the Unicode object.
1960
1961 If quotes is true, the string is enclosed in u"" or u'' quotes as
1962 appropriate.
1963
1964*/
1965
Barry Warsaw51ac5802000-03-20 16:36:48 +00001966static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001967 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00001968 Py_UNICODE ch);
1969
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970static
1971PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001972 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 int quotes)
1974{
1975 PyObject *repr;
1976 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001978 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979
1980 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1981 if (repr == NULL)
1982 return NULL;
1983
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001984 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985
1986 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001988 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 !findchar(s, size, '"')) ? '"' : '\'';
1990 }
1991 while (size-- > 0) {
1992 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001993
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001994 /* Escape quotes and backslashes */
1995 if ((quotes &&
1996 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 *p++ = '\\';
1998 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001999 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002000 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002001
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002002#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002003 /* Map 21-bit characters to '\U00xxxxxx' */
2004 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002005 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002006
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002007 /* Resize the string if necessary */
2008 if (offset + 12 > PyString_GET_SIZE(repr)) {
2009 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002010 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002011 p = PyString_AS_STRING(repr) + offset;
2012 }
2013
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002014 *p++ = '\\';
2015 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002016 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2021 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2022 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002023 *p++ = hexdigit[ch & 0x0000000F];
2024 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002025 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002026#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002027 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2028 else if (ch >= 0xD800 && ch < 0xDC00) {
2029 Py_UNICODE ch2;
2030 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002032 ch2 = *s++;
2033 size--;
2034 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2035 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2036 *p++ = '\\';
2037 *p++ = 'U';
2038 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2043 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2044 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2045 *p++ = hexdigit[ucs & 0x0000000F];
2046 continue;
2047 }
2048 /* Fall through: isolated surrogates are copied as-is */
2049 s--;
2050 size++;
2051 }
2052
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002054 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 *p++ = '\\';
2056 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002057 *p++ = hexdigit[(ch >> 12) & 0x000F];
2058 *p++ = hexdigit[(ch >> 8) & 0x000F];
2059 *p++ = hexdigit[(ch >> 4) & 0x000F];
2060 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002062
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002063 /* Map special whitespace to '\t', \n', '\r' */
2064 else if (ch == '\t') {
2065 *p++ = '\\';
2066 *p++ = 't';
2067 }
2068 else if (ch == '\n') {
2069 *p++ = '\\';
2070 *p++ = 'n';
2071 }
2072 else if (ch == '\r') {
2073 *p++ = '\\';
2074 *p++ = 'r';
2075 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002076
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002077 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002078 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002080 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002081 *p++ = hexdigit[(ch >> 4) & 0x000F];
2082 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002083 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002084
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 /* Copy everything else as-is */
2086 else
2087 *p++ = (char) ch;
2088 }
2089 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002090 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091
2092 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002093 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 return repr;
2095}
2096
2097PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002098 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099{
2100 return unicodeescape_string(s, size, 0);
2101}
2102
2103PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2104{
2105 if (!PyUnicode_Check(unicode)) {
2106 PyErr_BadArgument();
2107 return NULL;
2108 }
2109 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2110 PyUnicode_GET_SIZE(unicode));
2111}
2112
2113/* --- Raw Unicode Escape Codec ------------------------------------------- */
2114
2115PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002116 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 const char *errors)
2118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002120 Py_ssize_t startinpos;
2121 Py_ssize_t endinpos;
2122 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 const char *end;
2126 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 PyObject *errorHandler = NULL;
2128 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002129
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 /* Escaped strings will always be longer than the resulting
2131 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 length after conversion to the true value. (But decoding error
2133 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 v = _PyUnicode_New(size);
2135 if (v == NULL)
2136 goto onError;
2137 if (size == 0)
2138 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002139 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 end = s + size;
2141 while (s < end) {
2142 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002143 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002145 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146
2147 /* Non-escape characters are interpreted as Unicode ordinals */
2148 if (*s != '\\') {
2149 *p++ = (unsigned char)*s++;
2150 continue;
2151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153
2154 /* \u-escapes are only interpreted iff the number of leading
2155 backslashes if odd */
2156 bs = s;
2157 for (;s < end;) {
2158 if (*s != '\\')
2159 break;
2160 *p++ = (unsigned char)*s++;
2161 }
2162 if (((s - bs) & 1) == 0 ||
2163 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002164 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 continue;
2166 }
2167 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002168 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 s++;
2170
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002173 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 endinpos = s-starts;
2177 if (unicode_decode_call_errorhandler(
2178 errors, &errorHandler,
2179 "rawunicodeescape", "truncated \\uXXXX",
2180 starts, size, &startinpos, &endinpos, &exc, &s,
2181 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 }
2185 x = (x<<4) & ~0xF;
2186 if (c >= '0' && c <= '9')
2187 x += c - '0';
2188 else if (c >= 'a' && c <= 'f')
2189 x += 10 + c - 'a';
2190 else
2191 x += 10 + c - 'A';
2192 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002193#ifndef Py_UNICODE_WIDE
2194 if (x > 0x10000) {
2195 if (unicode_decode_call_errorhandler(
2196 errors, &errorHandler,
2197 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2198 starts, size, &startinpos, &endinpos, &exc, &s,
2199 (PyObject **)&v, &outpos, &p))
2200 goto onError;
2201 }
2202#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002203 *p++ = x;
2204 nextByte:
2205 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002207 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 Py_XDECREF(errorHandler);
2210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002212
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 onError:
2214 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 Py_XDECREF(errorHandler);
2216 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 return NULL;
2218}
2219
2220PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002221 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222{
2223 PyObject *repr;
2224 char *p;
2225 char *q;
2226
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002227 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002229#ifdef Py_UNICODE_WIDE
2230 repr = PyString_FromStringAndSize(NULL, 10 * size);
2231#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002233#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 if (repr == NULL)
2235 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002236 if (size == 0)
2237 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238
2239 p = q = PyString_AS_STRING(repr);
2240 while (size-- > 0) {
2241 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002242#ifdef Py_UNICODE_WIDE
2243 /* Map 32-bit characters to '\Uxxxxxxxx' */
2244 if (ch >= 0x10000) {
2245 *p++ = '\\';
2246 *p++ = 'U';
2247 *p++ = hexdigit[(ch >> 28) & 0xf];
2248 *p++ = hexdigit[(ch >> 24) & 0xf];
2249 *p++ = hexdigit[(ch >> 20) & 0xf];
2250 *p++ = hexdigit[(ch >> 16) & 0xf];
2251 *p++ = hexdigit[(ch >> 12) & 0xf];
2252 *p++ = hexdigit[(ch >> 8) & 0xf];
2253 *p++ = hexdigit[(ch >> 4) & 0xf];
2254 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 else
2257#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 /* Map 16-bit characters to '\uxxxx' */
2259 if (ch >= 256) {
2260 *p++ = '\\';
2261 *p++ = 'u';
2262 *p++ = hexdigit[(ch >> 12) & 0xf];
2263 *p++ = hexdigit[(ch >> 8) & 0xf];
2264 *p++ = hexdigit[(ch >> 4) & 0xf];
2265 *p++ = hexdigit[ch & 15];
2266 }
2267 /* Copy everything else as-is */
2268 else
2269 *p++ = (char) ch;
2270 }
2271 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002272 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 return repr;
2274}
2275
2276PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2277{
2278 if (!PyUnicode_Check(unicode)) {
2279 PyErr_BadArgument();
2280 return NULL;
2281 }
2282 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2283 PyUnicode_GET_SIZE(unicode));
2284}
2285
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002286/* --- Unicode Internal Codec ------------------------------------------- */
2287
2288PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002289 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002290 const char *errors)
2291{
2292 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002293 Py_ssize_t startinpos;
2294 Py_ssize_t endinpos;
2295 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002296 PyUnicodeObject *v;
2297 Py_UNICODE *p;
2298 const char *end;
2299 const char *reason;
2300 PyObject *errorHandler = NULL;
2301 PyObject *exc = NULL;
2302
Neal Norwitzd43069c2006-01-08 01:12:10 +00002303#ifdef Py_UNICODE_WIDE
2304 Py_UNICODE unimax = PyUnicode_GetMax();
2305#endif
2306
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002307 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2308 if (v == NULL)
2309 goto onError;
2310 if (PyUnicode_GetSize((PyObject *)v) == 0)
2311 return (PyObject *)v;
2312 p = PyUnicode_AS_UNICODE(v);
2313 end = s + size;
2314
2315 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002316 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002317 /* We have to sanity check the raw data, otherwise doom looms for
2318 some malformed UCS-4 data. */
2319 if (
2320 #ifdef Py_UNICODE_WIDE
2321 *p > unimax || *p < 0 ||
2322 #endif
2323 end-s < Py_UNICODE_SIZE
2324 )
2325 {
2326 startinpos = s - starts;
2327 if (end-s < Py_UNICODE_SIZE) {
2328 endinpos = end-starts;
2329 reason = "truncated input";
2330 }
2331 else {
2332 endinpos = s - starts + Py_UNICODE_SIZE;
2333 reason = "illegal code point (> 0x10FFFF)";
2334 }
2335 outpos = p - PyUnicode_AS_UNICODE(v);
2336 if (unicode_decode_call_errorhandler(
2337 errors, &errorHandler,
2338 "unicode_internal", reason,
2339 starts, size, &startinpos, &endinpos, &exc, &s,
2340 (PyObject **)&v, &outpos, &p)) {
2341 goto onError;
2342 }
2343 }
2344 else {
2345 p++;
2346 s += Py_UNICODE_SIZE;
2347 }
2348 }
2349
Martin v. Löwis412fb672006-04-13 06:34:32 +00002350 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002351 goto onError;
2352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
2354 return (PyObject *)v;
2355
2356 onError:
2357 Py_XDECREF(v);
2358 Py_XDECREF(errorHandler);
2359 Py_XDECREF(exc);
2360 return NULL;
2361}
2362
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363/* --- Latin-1 Codec ------------------------------------------------------ */
2364
2365PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002366 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 const char *errors)
2368{
2369 PyUnicodeObject *v;
2370 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002371
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002373 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002374 Py_UNICODE r = *(unsigned char*)s;
2375 return PyUnicode_FromUnicode(&r, 1);
2376 }
2377
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378 v = _PyUnicode_New(size);
2379 if (v == NULL)
2380 goto onError;
2381 if (size == 0)
2382 return (PyObject *)v;
2383 p = PyUnicode_AS_UNICODE(v);
2384 while (size-- > 0)
2385 *p++ = (unsigned char)*s++;
2386 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002387
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388 onError:
2389 Py_XDECREF(v);
2390 return NULL;
2391}
2392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393/* create or adjust a UnicodeEncodeError */
2394static void make_encode_exception(PyObject **exceptionObject,
2395 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002396 const Py_UNICODE *unicode, Py_ssize_t size,
2397 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002400 if (*exceptionObject == NULL) {
2401 *exceptionObject = PyUnicodeEncodeError_Create(
2402 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 }
2404 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002405 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2406 goto onError;
2407 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2408 goto onError;
2409 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2410 goto onError;
2411 return;
2412 onError:
2413 Py_DECREF(*exceptionObject);
2414 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 }
2416}
2417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002418/* raises a UnicodeEncodeError */
2419static void raise_encode_exception(PyObject **exceptionObject,
2420 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002421 const Py_UNICODE *unicode, Py_ssize_t size,
2422 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002423 const char *reason)
2424{
2425 make_encode_exception(exceptionObject,
2426 encoding, unicode, size, startpos, endpos, reason);
2427 if (*exceptionObject != NULL)
2428 PyCodec_StrictErrors(*exceptionObject);
2429}
2430
2431/* error handling callback helper:
2432 build arguments, call the callback and check the arguments,
2433 put the result into newpos and return the replacement string, which
2434 has to be freed by the caller */
2435static PyObject *unicode_encode_call_errorhandler(const char *errors,
2436 PyObject **errorHandler,
2437 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002438 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2439 Py_ssize_t startpos, Py_ssize_t endpos,
2440 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002442 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002443
2444 PyObject *restuple;
2445 PyObject *resunicode;
2446
2447 if (*errorHandler == NULL) {
2448 *errorHandler = PyCodec_LookupError(errors);
2449 if (*errorHandler == NULL)
2450 return NULL;
2451 }
2452
2453 make_encode_exception(exceptionObject,
2454 encoding, unicode, size, startpos, endpos, reason);
2455 if (*exceptionObject == NULL)
2456 return NULL;
2457
2458 restuple = PyObject_CallFunctionObjArgs(
2459 *errorHandler, *exceptionObject, NULL);
2460 if (restuple == NULL)
2461 return NULL;
2462 if (!PyTuple_Check(restuple)) {
2463 PyErr_Format(PyExc_TypeError, &argparse[4]);
2464 Py_DECREF(restuple);
2465 return NULL;
2466 }
2467 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2468 &resunicode, newpos)) {
2469 Py_DECREF(restuple);
2470 return NULL;
2471 }
2472 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002473 *newpos = size+*newpos;
2474 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002475 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002476 Py_DECREF(restuple);
2477 return NULL;
2478 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 Py_INCREF(resunicode);
2480 Py_DECREF(restuple);
2481 return resunicode;
2482}
2483
2484static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002485 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 const char *errors,
2487 int limit)
2488{
2489 /* output object */
2490 PyObject *res;
2491 /* pointers to the beginning and end+1 of input */
2492 const Py_UNICODE *startp = p;
2493 const Py_UNICODE *endp = p + size;
2494 /* pointer to the beginning of the unencodable characters */
2495 /* const Py_UNICODE *badp = NULL; */
2496 /* pointer into the output */
2497 char *str;
2498 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002499 Py_ssize_t respos = 0;
2500 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002501 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2502 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002503 PyObject *errorHandler = NULL;
2504 PyObject *exc = NULL;
2505 /* the following variable is used for caching string comparisons
2506 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2507 int known_errorHandler = -1;
2508
2509 /* allocate enough for a simple encoding without
2510 replacements, if we need more, we'll resize */
2511 res = PyString_FromStringAndSize(NULL, size);
2512 if (res == NULL)
2513 goto onError;
2514 if (size == 0)
2515 return res;
2516 str = PyString_AS_STRING(res);
2517 ressize = size;
2518
2519 while (p<endp) {
2520 Py_UNICODE c = *p;
2521
2522 /* can we encode this? */
2523 if (c<limit) {
2524 /* no overflow check, because we know that the space is enough */
2525 *str++ = (char)c;
2526 ++p;
2527 }
2528 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002529 Py_ssize_t unicodepos = p-startp;
2530 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002532 Py_ssize_t repsize;
2533 Py_ssize_t newpos;
2534 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 Py_UNICODE *uni2;
2536 /* startpos for collecting unencodable chars */
2537 const Py_UNICODE *collstart = p;
2538 const Py_UNICODE *collend = p;
2539 /* find all unecodable characters */
2540 while ((collend < endp) && ((*collend)>=limit))
2541 ++collend;
2542 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2543 if (known_errorHandler==-1) {
2544 if ((errors==NULL) || (!strcmp(errors, "strict")))
2545 known_errorHandler = 1;
2546 else if (!strcmp(errors, "replace"))
2547 known_errorHandler = 2;
2548 else if (!strcmp(errors, "ignore"))
2549 known_errorHandler = 3;
2550 else if (!strcmp(errors, "xmlcharrefreplace"))
2551 known_errorHandler = 4;
2552 else
2553 known_errorHandler = 0;
2554 }
2555 switch (known_errorHandler) {
2556 case 1: /* strict */
2557 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2558 goto onError;
2559 case 2: /* replace */
2560 while (collstart++<collend)
2561 *str++ = '?'; /* fall through */
2562 case 3: /* ignore */
2563 p = collend;
2564 break;
2565 case 4: /* xmlcharrefreplace */
2566 respos = str-PyString_AS_STRING(res);
2567 /* determine replacement size (temporarily (mis)uses p) */
2568 for (p = collstart, repsize = 0; p < collend; ++p) {
2569 if (*p<10)
2570 repsize += 2+1+1;
2571 else if (*p<100)
2572 repsize += 2+2+1;
2573 else if (*p<1000)
2574 repsize += 2+3+1;
2575 else if (*p<10000)
2576 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002577#ifndef Py_UNICODE_WIDE
2578 else
2579 repsize += 2+5+1;
2580#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581 else if (*p<100000)
2582 repsize += 2+5+1;
2583 else if (*p<1000000)
2584 repsize += 2+6+1;
2585 else
2586 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002587#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 }
2589 requiredsize = respos+repsize+(endp-collend);
2590 if (requiredsize > ressize) {
2591 if (requiredsize<2*ressize)
2592 requiredsize = 2*ressize;
2593 if (_PyString_Resize(&res, requiredsize))
2594 goto onError;
2595 str = PyString_AS_STRING(res) + respos;
2596 ressize = requiredsize;
2597 }
2598 /* generate replacement (temporarily (mis)uses p) */
2599 for (p = collstart; p < collend; ++p) {
2600 str += sprintf(str, "&#%d;", (int)*p);
2601 }
2602 p = collend;
2603 break;
2604 default:
2605 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2606 encoding, reason, startp, size, &exc,
2607 collstart-startp, collend-startp, &newpos);
2608 if (repunicode == NULL)
2609 goto onError;
2610 /* need more space? (at least enough for what we
2611 have+the replacement+the rest of the string, so
2612 we won't have to check space for encodable characters) */
2613 respos = str-PyString_AS_STRING(res);
2614 repsize = PyUnicode_GET_SIZE(repunicode);
2615 requiredsize = respos+repsize+(endp-collend);
2616 if (requiredsize > ressize) {
2617 if (requiredsize<2*ressize)
2618 requiredsize = 2*ressize;
2619 if (_PyString_Resize(&res, requiredsize)) {
2620 Py_DECREF(repunicode);
2621 goto onError;
2622 }
2623 str = PyString_AS_STRING(res) + respos;
2624 ressize = requiredsize;
2625 }
2626 /* check if there is anything unencodable in the replacement
2627 and copy it to the output */
2628 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2629 c = *uni2;
2630 if (c >= limit) {
2631 raise_encode_exception(&exc, encoding, startp, size,
2632 unicodepos, unicodepos+1, reason);
2633 Py_DECREF(repunicode);
2634 goto onError;
2635 }
2636 *str = (char)c;
2637 }
2638 p = startp + newpos;
2639 Py_DECREF(repunicode);
2640 }
2641 }
2642 }
2643 /* Resize if we allocated to much */
2644 respos = str-PyString_AS_STRING(res);
2645 if (respos<ressize)
2646 /* If this falls res will be NULL */
2647 _PyString_Resize(&res, respos);
2648 Py_XDECREF(errorHandler);
2649 Py_XDECREF(exc);
2650 return res;
2651
2652 onError:
2653 Py_XDECREF(res);
2654 Py_XDECREF(errorHandler);
2655 Py_XDECREF(exc);
2656 return NULL;
2657}
2658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002660 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 const char *errors)
2662{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664}
2665
2666PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2667{
2668 if (!PyUnicode_Check(unicode)) {
2669 PyErr_BadArgument();
2670 return NULL;
2671 }
2672 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2673 PyUnicode_GET_SIZE(unicode),
2674 NULL);
2675}
2676
2677/* --- 7-bit ASCII Codec -------------------------------------------------- */
2678
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002680 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 const char *errors)
2682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 PyUnicodeObject *v;
2685 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002686 Py_ssize_t startinpos;
2687 Py_ssize_t endinpos;
2688 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 const char *e;
2690 PyObject *errorHandler = NULL;
2691 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002692
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002694 if (size == 1 && *(unsigned char*)s < 128) {
2695 Py_UNICODE r = *(unsigned char*)s;
2696 return PyUnicode_FromUnicode(&r, 1);
2697 }
Tim Petersced69f82003-09-16 20:30:58 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 v = _PyUnicode_New(size);
2700 if (v == NULL)
2701 goto onError;
2702 if (size == 0)
2703 return (PyObject *)v;
2704 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 e = s + size;
2706 while (s < e) {
2707 register unsigned char c = (unsigned char)*s;
2708 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 ++s;
2711 }
2712 else {
2713 startinpos = s-starts;
2714 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002715 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 if (unicode_decode_call_errorhandler(
2717 errors, &errorHandler,
2718 "ascii", "ordinal not in range(128)",
2719 starts, size, &startinpos, &endinpos, &exc, &s,
2720 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002724 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002725 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002726 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 Py_XDECREF(errorHandler);
2728 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002730
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 onError:
2732 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 Py_XDECREF(errorHandler);
2734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 return NULL;
2736}
2737
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002739 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 const char *errors)
2741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743}
2744
2745PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2746{
2747 if (!PyUnicode_Check(unicode)) {
2748 PyErr_BadArgument();
2749 return NULL;
2750 }
2751 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2752 PyUnicode_GET_SIZE(unicode),
2753 NULL);
2754}
2755
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002756#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002757
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002758/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002759
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002760PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002761 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002762 const char *errors)
2763{
2764 PyUnicodeObject *v;
2765 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002767
2768 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002769 assert(size < INT_MAX);
2770 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002771 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002772 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2773
2774 v = _PyUnicode_New(usize);
2775 if (v == NULL)
2776 return NULL;
2777 if (usize == 0)
2778 return (PyObject *)v;
2779 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002780 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002781 Py_DECREF(v);
2782 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2783 }
2784
2785 return (PyObject *)v;
2786}
2787
2788PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002789 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002790 const char *errors)
2791{
2792 PyObject *repr;
2793 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002794 DWORD mbcssize;
2795
2796 /* If there are no characters, bail now! */
2797 if (size==0)
2798 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002799
2800 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002801 assert(size<INT_MAX);
2802 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002803 if (mbcssize==0)
2804 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2805
2806 repr = PyString_FromStringAndSize(NULL, mbcssize);
2807 if (repr == NULL)
2808 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002809 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002810 return repr;
2811
2812 /* Do the conversion */
2813 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002814 assert(size < INT_MAX);
2815 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002816 Py_DECREF(repr);
2817 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2818 }
2819 return repr;
2820}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002821
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002822PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2823{
2824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
2826 return NULL;
2827 }
2828 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2829 PyUnicode_GET_SIZE(unicode),
2830 NULL);
2831}
2832
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002833#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002834
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835/* --- Character Mapping Codec -------------------------------------------- */
2836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002838 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 PyObject *mapping,
2840 const char *errors)
2841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002843 Py_ssize_t startinpos;
2844 Py_ssize_t endinpos;
2845 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 PyUnicodeObject *v;
2848 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 PyObject *errorHandler = NULL;
2851 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002852 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002854
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 /* Default to Latin-1 */
2856 if (mapping == NULL)
2857 return PyUnicode_DecodeLatin1(s, size, errors);
2858
2859 v = _PyUnicode_New(size);
2860 if (v == NULL)
2861 goto onError;
2862 if (size == 0)
2863 return (PyObject *)v;
2864 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002866 if (PyUnicode_CheckExact(mapping)) {
2867 mapstring = PyUnicode_AS_UNICODE(mapping);
2868 maplen = PyUnicode_GET_SIZE(mapping);
2869 while (s < e) {
2870 unsigned char ch = *s;
2871 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002873 if (ch < maplen)
2874 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002876 if (x == 0xfffe) {
2877 /* undefined mapping */
2878 outpos = p-PyUnicode_AS_UNICODE(v);
2879 startinpos = s-starts;
2880 endinpos = startinpos+1;
2881 if (unicode_decode_call_errorhandler(
2882 errors, &errorHandler,
2883 "charmap", "character maps to <undefined>",
2884 starts, size, &startinpos, &endinpos, &exc, &s,
2885 (PyObject **)&v, &outpos, &p)) {
2886 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002887 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002888 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002889 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002890 *p++ = x;
2891 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002893 }
2894 else {
2895 while (s < e) {
2896 unsigned char ch = *s;
2897 PyObject *w, *x;
2898
2899 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2900 w = PyInt_FromLong((long)ch);
2901 if (w == NULL)
2902 goto onError;
2903 x = PyObject_GetItem(mapping, w);
2904 Py_DECREF(w);
2905 if (x == NULL) {
2906 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2907 /* No mapping found means: mapping is undefined. */
2908 PyErr_Clear();
2909 x = Py_None;
2910 Py_INCREF(x);
2911 } else
2912 goto onError;
2913 }
2914
2915 /* Apply mapping */
2916 if (PyInt_Check(x)) {
2917 long value = PyInt_AS_LONG(x);
2918 if (value < 0 || value > 65535) {
2919 PyErr_SetString(PyExc_TypeError,
2920 "character mapping must be in range(65536)");
2921 Py_DECREF(x);
2922 goto onError;
2923 }
2924 *p++ = (Py_UNICODE)value;
2925 }
2926 else if (x == Py_None) {
2927 /* undefined mapping */
2928 outpos = p-PyUnicode_AS_UNICODE(v);
2929 startinpos = s-starts;
2930 endinpos = startinpos+1;
2931 if (unicode_decode_call_errorhandler(
2932 errors, &errorHandler,
2933 "charmap", "character maps to <undefined>",
2934 starts, size, &startinpos, &endinpos, &exc, &s,
2935 (PyObject **)&v, &outpos, &p)) {
2936 Py_DECREF(x);
2937 goto onError;
2938 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002939 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002940 continue;
2941 }
2942 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002943 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002944
2945 if (targetsize == 1)
2946 /* 1-1 mapping */
2947 *p++ = *PyUnicode_AS_UNICODE(x);
2948
2949 else if (targetsize > 1) {
2950 /* 1-n mapping */
2951 if (targetsize > extrachars) {
2952 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
2954 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002955 (targetsize << 2);
2956 extrachars += needed;
2957 if (_PyUnicode_Resize(&v,
2958 PyUnicode_GET_SIZE(v) + needed) < 0) {
2959 Py_DECREF(x);
2960 goto onError;
2961 }
2962 p = PyUnicode_AS_UNICODE(v) + oldpos;
2963 }
2964 Py_UNICODE_COPY(p,
2965 PyUnicode_AS_UNICODE(x),
2966 targetsize);
2967 p += targetsize;
2968 extrachars -= targetsize;
2969 }
2970 /* 1-0 mapping: skip the character */
2971 }
2972 else {
2973 /* wrong return value */
2974 PyErr_SetString(PyExc_TypeError,
2975 "character mapping must return integer, None or unicode");
2976 Py_DECREF(x);
2977 goto onError;
2978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002980 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
2983 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 Py_XDECREF(errorHandler);
2987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002991 Py_XDECREF(errorHandler);
2992 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 Py_XDECREF(v);
2994 return NULL;
2995}
2996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997/* Lookup the character ch in the mapping. If the character
2998 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00002999 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002 PyObject *w = PyInt_FromLong((long)c);
3003 PyObject *x;
3004
3005 if (w == NULL)
3006 return NULL;
3007 x = PyObject_GetItem(mapping, w);
3008 Py_DECREF(w);
3009 if (x == NULL) {
3010 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3011 /* No mapping found means: mapping is undefined. */
3012 PyErr_Clear();
3013 x = Py_None;
3014 Py_INCREF(x);
3015 return x;
3016 } else
3017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003019 else if (x == Py_None)
3020 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 else if (PyInt_Check(x)) {
3022 long value = PyInt_AS_LONG(x);
3023 if (value < 0 || value > 255) {
3024 PyErr_SetString(PyExc_TypeError,
3025 "character mapping must be in range(256)");
3026 Py_DECREF(x);
3027 return NULL;
3028 }
3029 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 else if (PyString_Check(x))
3032 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 /* wrong return value */
3035 PyErr_SetString(PyExc_TypeError,
3036 "character mapping must return integer, None or str");
3037 Py_DECREF(x);
3038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
3040}
3041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042/* lookup the character, put the result in the output string and adjust
3043 various state variables. Reallocate the output string if not enough
3044 space is available. Return a new reference to the object that
3045 was put in the output buffer, or Py_None, if the mapping was undefined
3046 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003047 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048static
3049PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003050 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051{
3052 PyObject *rep = charmapencode_lookup(c, mapping);
3053
3054 if (rep==NULL)
3055 return NULL;
3056 else if (rep==Py_None)
3057 return rep;
3058 else {
3059 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 if (outsize<requiredsize) {
3064 /* exponentially overallocate to minimize reallocations */
3065 if (requiredsize < 2*outsize)
3066 requiredsize = 2*outsize;
3067 if (_PyString_Resize(outobj, requiredsize)) {
3068 Py_DECREF(rep);
3069 return NULL;
3070 }
3071 outstart = PyString_AS_STRING(*outobj);
3072 }
3073 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3074 }
3075 else {
3076 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003077 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3078 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 if (outsize<requiredsize) {
3080 /* exponentially overallocate to minimize reallocations */
3081 if (requiredsize < 2*outsize)
3082 requiredsize = 2*outsize;
3083 if (_PyString_Resize(outobj, requiredsize)) {
3084 Py_DECREF(rep);
3085 return NULL;
3086 }
3087 outstart = PyString_AS_STRING(*outobj);
3088 }
3089 memcpy(outstart + *outpos, repchars, repsize);
3090 *outpos += repsize;
3091 }
3092 }
3093 return rep;
3094}
3095
3096/* handle an error in PyUnicode_EncodeCharmap
3097 Return 0 on success, -1 on error */
3098static
3099int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003100 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003102 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003103 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104{
3105 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003106 Py_ssize_t repsize;
3107 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 Py_UNICODE *uni2;
3109 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003110 Py_ssize_t collstartpos = *inpos;
3111 Py_ssize_t collendpos = *inpos+1;
3112 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 char *encoding = "charmap";
3114 char *reason = "character maps to <undefined>";
3115
3116 PyObject *x;
3117 /* find all unencodable characters */
3118 while (collendpos < size) {
3119 x = charmapencode_lookup(p[collendpos], mapping);
3120 if (x==NULL)
3121 return -1;
3122 else if (x!=Py_None) {
3123 Py_DECREF(x);
3124 break;
3125 }
3126 Py_DECREF(x);
3127 ++collendpos;
3128 }
3129 /* cache callback name lookup
3130 * (if not done yet, i.e. it's the first error) */
3131 if (*known_errorHandler==-1) {
3132 if ((errors==NULL) || (!strcmp(errors, "strict")))
3133 *known_errorHandler = 1;
3134 else if (!strcmp(errors, "replace"))
3135 *known_errorHandler = 2;
3136 else if (!strcmp(errors, "ignore"))
3137 *known_errorHandler = 3;
3138 else if (!strcmp(errors, "xmlcharrefreplace"))
3139 *known_errorHandler = 4;
3140 else
3141 *known_errorHandler = 0;
3142 }
3143 switch (*known_errorHandler) {
3144 case 1: /* strict */
3145 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3146 return -1;
3147 case 2: /* replace */
3148 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3149 x = charmapencode_output('?', mapping, res, respos);
3150 if (x==NULL) {
3151 return -1;
3152 }
3153 else if (x==Py_None) {
3154 Py_DECREF(x);
3155 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3156 return -1;
3157 }
3158 Py_DECREF(x);
3159 }
3160 /* fall through */
3161 case 3: /* ignore */
3162 *inpos = collendpos;
3163 break;
3164 case 4: /* xmlcharrefreplace */
3165 /* generate replacement (temporarily (mis)uses p) */
3166 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3167 char buffer[2+29+1+1];
3168 char *cp;
3169 sprintf(buffer, "&#%d;", (int)p[collpos]);
3170 for (cp = buffer; *cp; ++cp) {
3171 x = charmapencode_output(*cp, mapping, res, respos);
3172 if (x==NULL)
3173 return -1;
3174 else if (x==Py_None) {
3175 Py_DECREF(x);
3176 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3177 return -1;
3178 }
3179 Py_DECREF(x);
3180 }
3181 }
3182 *inpos = collendpos;
3183 break;
3184 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003185 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 encoding, reason, p, size, exceptionObject,
3187 collstartpos, collendpos, &newpos);
3188 if (repunicode == NULL)
3189 return -1;
3190 /* generate replacement */
3191 repsize = PyUnicode_GET_SIZE(repunicode);
3192 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3193 x = charmapencode_output(*uni2, mapping, res, respos);
3194 if (x==NULL) {
3195 Py_DECREF(repunicode);
3196 return -1;
3197 }
3198 else if (x==Py_None) {
3199 Py_DECREF(repunicode);
3200 Py_DECREF(x);
3201 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3202 return -1;
3203 }
3204 Py_DECREF(x);
3205 }
3206 *inpos = newpos;
3207 Py_DECREF(repunicode);
3208 }
3209 return 0;
3210}
3211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003213 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 PyObject *mapping,
3215 const char *errors)
3216{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 /* output object */
3218 PyObject *res = NULL;
3219 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003220 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003222 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 PyObject *errorHandler = NULL;
3224 PyObject *exc = NULL;
3225 /* the following variable is used for caching string comparisons
3226 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3227 * 3=ignore, 4=xmlcharrefreplace */
3228 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229
3230 /* Default to Latin-1 */
3231 if (mapping == NULL)
3232 return PyUnicode_EncodeLatin1(p, size, errors);
3233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 /* allocate enough for a simple encoding without
3235 replacements, if we need more, we'll resize */
3236 res = PyString_FromStringAndSize(NULL, size);
3237 if (res == NULL)
3238 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003239 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 while (inpos<size) {
3243 /* try to encode it */
3244 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3245 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 if (x==Py_None) { /* unencodable character */
3248 if (charmap_encoding_error(p, size, &inpos, mapping,
3249 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003250 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003251 &res, &respos)) {
3252 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003253 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 else
3257 /* done with this character => adjust input position */
3258 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 Py_DECREF(x);
3260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 /* Resize if we allocated to much */
3263 if (respos<PyString_GET_SIZE(res)) {
3264 if (_PyString_Resize(&res, respos))
3265 goto onError;
3266 }
3267 Py_XDECREF(exc);
3268 Py_XDECREF(errorHandler);
3269 return res;
3270
3271 onError:
3272 Py_XDECREF(res);
3273 Py_XDECREF(exc);
3274 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 return NULL;
3276}
3277
3278PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3279 PyObject *mapping)
3280{
3281 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3282 PyErr_BadArgument();
3283 return NULL;
3284 }
3285 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3286 PyUnicode_GET_SIZE(unicode),
3287 mapping,
3288 NULL);
3289}
3290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291/* create or adjust a UnicodeTranslateError */
3292static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003293 const Py_UNICODE *unicode, Py_ssize_t size,
3294 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 if (*exceptionObject == NULL) {
3298 *exceptionObject = PyUnicodeTranslateError_Create(
3299 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
3301 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3303 goto onError;
3304 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3305 goto onError;
3306 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3307 goto onError;
3308 return;
3309 onError:
3310 Py_DECREF(*exceptionObject);
3311 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 }
3313}
3314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315/* raises a UnicodeTranslateError */
3316static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 const Py_UNICODE *unicode, Py_ssize_t size,
3318 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 const char *reason)
3320{
3321 make_translate_exception(exceptionObject,
3322 unicode, size, startpos, endpos, reason);
3323 if (*exceptionObject != NULL)
3324 PyCodec_StrictErrors(*exceptionObject);
3325}
3326
3327/* error handling callback helper:
3328 build arguments, call the callback and check the arguments,
3329 put the result into newpos and return the replacement string, which
3330 has to be freed by the caller */
3331static PyObject *unicode_translate_call_errorhandler(const char *errors,
3332 PyObject **errorHandler,
3333 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003334 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3335 Py_ssize_t startpos, Py_ssize_t endpos,
3336 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003338 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339
Martin v. Löwis412fb672006-04-13 06:34:32 +00003340 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 PyObject *restuple;
3342 PyObject *resunicode;
3343
3344 if (*errorHandler == NULL) {
3345 *errorHandler = PyCodec_LookupError(errors);
3346 if (*errorHandler == NULL)
3347 return NULL;
3348 }
3349
3350 make_translate_exception(exceptionObject,
3351 unicode, size, startpos, endpos, reason);
3352 if (*exceptionObject == NULL)
3353 return NULL;
3354
3355 restuple = PyObject_CallFunctionObjArgs(
3356 *errorHandler, *exceptionObject, NULL);
3357 if (restuple == NULL)
3358 return NULL;
3359 if (!PyTuple_Check(restuple)) {
3360 PyErr_Format(PyExc_TypeError, &argparse[4]);
3361 Py_DECREF(restuple);
3362 return NULL;
3363 }
3364 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003365 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 Py_DECREF(restuple);
3367 return NULL;
3368 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003369 if (i_newpos<0)
3370 *newpos = size+i_newpos;
3371 else
3372 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003373 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003374 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003375 Py_DECREF(restuple);
3376 return NULL;
3377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_INCREF(resunicode);
3379 Py_DECREF(restuple);
3380 return resunicode;
3381}
3382
3383/* Lookup the character ch in the mapping and put the result in result,
3384 which must be decrefed by the caller.
3385 Return 0 on success, -1 on error */
3386static
3387int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3388{
3389 PyObject *w = PyInt_FromLong((long)c);
3390 PyObject *x;
3391
3392 if (w == NULL)
3393 return -1;
3394 x = PyObject_GetItem(mapping, w);
3395 Py_DECREF(w);
3396 if (x == NULL) {
3397 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3398 /* No mapping found means: use 1:1 mapping. */
3399 PyErr_Clear();
3400 *result = NULL;
3401 return 0;
3402 } else
3403 return -1;
3404 }
3405 else if (x == Py_None) {
3406 *result = x;
3407 return 0;
3408 }
3409 else if (PyInt_Check(x)) {
3410 long value = PyInt_AS_LONG(x);
3411 long max = PyUnicode_GetMax();
3412 if (value < 0 || value > max) {
3413 PyErr_Format(PyExc_TypeError,
3414 "character mapping must be in range(0x%lx)", max+1);
3415 Py_DECREF(x);
3416 return -1;
3417 }
3418 *result = x;
3419 return 0;
3420 }
3421 else if (PyUnicode_Check(x)) {
3422 *result = x;
3423 return 0;
3424 }
3425 else {
3426 /* wrong return value */
3427 PyErr_SetString(PyExc_TypeError,
3428 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003429 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 return -1;
3431 }
3432}
3433/* ensure that *outobj is at least requiredsize characters long,
3434if not reallocate and adjust various state variables.
3435Return 0 on success, -1 on error */
3436static
Walter Dörwald4894c302003-10-24 14:25:28 +00003437int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003438 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003440 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003441 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003443 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003445 if (requiredsize < 2 * oldsize)
3446 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003447 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 return -1;
3449 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 }
3451 return 0;
3452}
3453/* lookup the character, put the result in the output string and adjust
3454 various state variables. Return a new reference to the object that
3455 was put in the output buffer in *result, or Py_None, if the mapping was
3456 undefined (in which case no character was written).
3457 The called must decref result.
3458 Return 0 on success, -1 on error. */
3459static
Walter Dörwald4894c302003-10-24 14:25:28 +00003460int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003461 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003462 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463{
Walter Dörwald4894c302003-10-24 14:25:28 +00003464 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 return -1;
3466 if (*res==NULL) {
3467 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003468 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 }
3470 else if (*res==Py_None)
3471 ;
3472 else if (PyInt_Check(*res)) {
3473 /* no overflow check, because we know that the space is enough */
3474 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3475 }
3476 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003477 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 if (repsize==1) {
3479 /* no overflow check, because we know that the space is enough */
3480 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3481 }
3482 else if (repsize!=0) {
3483 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003484 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003485 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003486 repsize - 1;
3487 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 return -1;
3489 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3490 *outp += repsize;
3491 }
3492 }
3493 else
3494 return -1;
3495 return 0;
3496}
3497
3498PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003499 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 PyObject *mapping,
3501 const char *errors)
3502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 /* output object */
3504 PyObject *res = NULL;
3505 /* pointers to the beginning and end+1 of input */
3506 const Py_UNICODE *startp = p;
3507 const Py_UNICODE *endp = p + size;
3508 /* pointer into the output */
3509 Py_UNICODE *str;
3510 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 char *reason = "character maps to <undefined>";
3513 PyObject *errorHandler = NULL;
3514 PyObject *exc = NULL;
3515 /* the following variable is used for caching string comparisons
3516 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3517 * 3=ignore, 4=xmlcharrefreplace */
3518 int known_errorHandler = -1;
3519
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 if (mapping == NULL) {
3521 PyErr_BadArgument();
3522 return NULL;
3523 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524
3525 /* allocate enough for a simple 1:1 translation without
3526 replacements, if we need more, we'll resize */
3527 res = PyUnicode_FromUnicode(NULL, size);
3528 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003529 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 return res;
3532 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 while (p<endp) {
3535 /* try to encode it */
3536 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003537 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 goto onError;
3540 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003541 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (x!=Py_None) /* it worked => adjust input pointer */
3543 ++p;
3544 else { /* untranslatable character */
3545 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t repsize;
3547 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 Py_UNICODE *uni2;
3549 /* startpos for collecting untranslatable chars */
3550 const Py_UNICODE *collstart = p;
3551 const Py_UNICODE *collend = p+1;
3552 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 /* find all untranslatable characters */
3555 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003556 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 goto onError;
3558 Py_XDECREF(x);
3559 if (x!=Py_None)
3560 break;
3561 ++collend;
3562 }
3563 /* cache callback name lookup
3564 * (if not done yet, i.e. it's the first error) */
3565 if (known_errorHandler==-1) {
3566 if ((errors==NULL) || (!strcmp(errors, "strict")))
3567 known_errorHandler = 1;
3568 else if (!strcmp(errors, "replace"))
3569 known_errorHandler = 2;
3570 else if (!strcmp(errors, "ignore"))
3571 known_errorHandler = 3;
3572 else if (!strcmp(errors, "xmlcharrefreplace"))
3573 known_errorHandler = 4;
3574 else
3575 known_errorHandler = 0;
3576 }
3577 switch (known_errorHandler) {
3578 case 1: /* strict */
3579 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3580 goto onError;
3581 case 2: /* replace */
3582 /* No need to check for space, this is a 1:1 replacement */
3583 for (coll = collstart; coll<collend; ++coll)
3584 *str++ = '?';
3585 /* fall through */
3586 case 3: /* ignore */
3587 p = collend;
3588 break;
3589 case 4: /* xmlcharrefreplace */
3590 /* generate replacement (temporarily (mis)uses p) */
3591 for (p = collstart; p < collend; ++p) {
3592 char buffer[2+29+1+1];
3593 char *cp;
3594 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003595 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3597 goto onError;
3598 for (cp = buffer; *cp; ++cp)
3599 *str++ = *cp;
3600 }
3601 p = collend;
3602 break;
3603 default:
3604 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3605 reason, startp, size, &exc,
3606 collstart-startp, collend-startp, &newpos);
3607 if (repunicode == NULL)
3608 goto onError;
3609 /* generate replacement */
3610 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003611 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3613 Py_DECREF(repunicode);
3614 goto onError;
3615 }
3616 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3617 *str++ = *uni2;
3618 p = startp + newpos;
3619 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 }
3621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 /* Resize if we allocated to much */
3624 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003625 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003626 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003627 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 }
3629 Py_XDECREF(exc);
3630 Py_XDECREF(errorHandler);
3631 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 onError:
3634 Py_XDECREF(res);
3635 Py_XDECREF(exc);
3636 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 return NULL;
3638}
3639
3640PyObject *PyUnicode_Translate(PyObject *str,
3641 PyObject *mapping,
3642 const char *errors)
3643{
3644 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 str = PyUnicode_FromObject(str);
3647 if (str == NULL)
3648 goto onError;
3649 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3650 PyUnicode_GET_SIZE(str),
3651 mapping,
3652 errors);
3653 Py_DECREF(str);
3654 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 onError:
3657 Py_XDECREF(str);
3658 return NULL;
3659}
Tim Petersced69f82003-09-16 20:30:58 +00003660
Guido van Rossum9e896b32000-04-05 20:11:21 +00003661/* --- Decimal Encoder ---------------------------------------------------- */
3662
3663int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003664 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003665 char *output,
3666 const char *errors)
3667{
3668 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 PyObject *errorHandler = NULL;
3670 PyObject *exc = NULL;
3671 const char *encoding = "decimal";
3672 const char *reason = "invalid decimal Unicode string";
3673 /* the following variable is used for caching string comparisons
3674 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3675 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003676
3677 if (output == NULL) {
3678 PyErr_BadArgument();
3679 return -1;
3680 }
3681
3682 p = s;
3683 end = s + length;
3684 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003686 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003688 Py_ssize_t repsize;
3689 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_UNICODE *uni2;
3691 Py_UNICODE *collstart;
3692 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003693
Guido van Rossum9e896b32000-04-05 20:11:21 +00003694 if (Py_UNICODE_ISSPACE(ch)) {
3695 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003697 continue;
3698 }
3699 decimal = Py_UNICODE_TODECIMAL(ch);
3700 if (decimal >= 0) {
3701 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003703 continue;
3704 }
Guido van Rossumba477042000-04-06 18:18:10 +00003705 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003706 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003708 continue;
3709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 /* All other characters are considered unencodable */
3711 collstart = p;
3712 collend = p+1;
3713 while (collend < end) {
3714 if ((0 < *collend && *collend < 256) ||
3715 !Py_UNICODE_ISSPACE(*collend) ||
3716 Py_UNICODE_TODECIMAL(*collend))
3717 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* cache callback name lookup
3720 * (if not done yet, i.e. it's the first error) */
3721 if (known_errorHandler==-1) {
3722 if ((errors==NULL) || (!strcmp(errors, "strict")))
3723 known_errorHandler = 1;
3724 else if (!strcmp(errors, "replace"))
3725 known_errorHandler = 2;
3726 else if (!strcmp(errors, "ignore"))
3727 known_errorHandler = 3;
3728 else if (!strcmp(errors, "xmlcharrefreplace"))
3729 known_errorHandler = 4;
3730 else
3731 known_errorHandler = 0;
3732 }
3733 switch (known_errorHandler) {
3734 case 1: /* strict */
3735 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3736 goto onError;
3737 case 2: /* replace */
3738 for (p = collstart; p < collend; ++p)
3739 *output++ = '?';
3740 /* fall through */
3741 case 3: /* ignore */
3742 p = collend;
3743 break;
3744 case 4: /* xmlcharrefreplace */
3745 /* generate replacement (temporarily (mis)uses p) */
3746 for (p = collstart; p < collend; ++p)
3747 output += sprintf(output, "&#%d;", (int)*p);
3748 p = collend;
3749 break;
3750 default:
3751 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3752 encoding, reason, s, length, &exc,
3753 collstart-s, collend-s, &newpos);
3754 if (repunicode == NULL)
3755 goto onError;
3756 /* generate replacement */
3757 repsize = PyUnicode_GET_SIZE(repunicode);
3758 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3759 Py_UNICODE ch = *uni2;
3760 if (Py_UNICODE_ISSPACE(ch))
3761 *output++ = ' ';
3762 else {
3763 decimal = Py_UNICODE_TODECIMAL(ch);
3764 if (decimal >= 0)
3765 *output++ = '0' + decimal;
3766 else if (0 < ch && ch < 256)
3767 *output++ = (char)ch;
3768 else {
3769 Py_DECREF(repunicode);
3770 raise_encode_exception(&exc, encoding,
3771 s, length, collstart-s, collend-s, reason);
3772 goto onError;
3773 }
3774 }
3775 }
3776 p = s + newpos;
3777 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003778 }
3779 }
3780 /* 0-terminate the output string */
3781 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 Py_XDECREF(exc);
3783 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003784 return 0;
3785
3786 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 Py_XDECREF(exc);
3788 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003789 return -1;
3790}
3791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792/* --- Helpers ------------------------------------------------------------ */
3793
Tim Petersced69f82003-09-16 20:30:58 +00003794static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003795Py_ssize_t count(PyUnicodeObject *self,
3796 Py_ssize_t start,
3797 Py_ssize_t end,
3798 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003800 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003802 if (start < 0)
3803 start += self->length;
3804 if (start < 0)
3805 start = 0;
3806 if (end > self->length)
3807 end = self->length;
3808 if (end < 0)
3809 end += self->length;
3810 if (end < 0)
3811 end = 0;
3812
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003813 if (substring->length == 0)
3814 return (end - start + 1);
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 end -= substring->length;
3817
3818 while (start <= end)
3819 if (Py_UNICODE_MATCH(self, start, substring)) {
3820 count++;
3821 start += substring->length;
3822 } else
3823 start++;
3824
3825 return count;
3826}
3827
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 Py_ssize_t start,
3831 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003833 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 str = PyUnicode_FromObject(str);
3836 if (str == NULL)
3837 return -1;
3838 substr = PyUnicode_FromObject(substr);
3839 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003840 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 return -1;
3842 }
Tim Petersced69f82003-09-16 20:30:58 +00003843
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 result = count((PyUnicodeObject *)str,
3845 start, end,
3846 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003847
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848 Py_DECREF(str);
3849 Py_DECREF(substr);
3850 return result;
3851}
3852
Tim Petersced69f82003-09-16 20:30:58 +00003853static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003856 Py_ssize_t start,
3857 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 int direction)
3859{
3860 if (start < 0)
3861 start += self->length;
3862 if (start < 0)
3863 start = 0;
3864
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 if (end > self->length)
3866 end = self->length;
3867 if (end < 0)
3868 end += self->length;
3869 if (end < 0)
3870 end = 0;
3871
Guido van Rossum76afbd92002-08-20 17:29:29 +00003872 if (substring->length == 0)
3873 return (direction > 0) ? start : end;
3874
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875 end -= substring->length;
3876
3877 if (direction < 0) {
3878 for (; end >= start; end--)
3879 if (Py_UNICODE_MATCH(self, end, substring))
3880 return end;
3881 } else {
3882 for (; start <= end; start++)
3883 if (Py_UNICODE_MATCH(self, start, substring))
3884 return start;
3885 }
3886
3887 return -1;
3888}
3889
Martin v. Löwis18e16552006-02-15 17:27:45 +00003890Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003892 Py_ssize_t start,
3893 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 int direction)
3895{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003897
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 str = PyUnicode_FromObject(str);
3899 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003900 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 substr = PyUnicode_FromObject(substr);
3902 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003903 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003904 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 }
Tim Petersced69f82003-09-16 20:30:58 +00003906
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 result = findstring((PyUnicodeObject *)str,
3908 (PyUnicodeObject *)substr,
3909 start, end, direction);
3910 Py_DECREF(str);
3911 Py_DECREF(substr);
3912 return result;
3913}
3914
Tim Petersced69f82003-09-16 20:30:58 +00003915static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916int tailmatch(PyUnicodeObject *self,
3917 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003918 Py_ssize_t start,
3919 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 int direction)
3921{
3922 if (start < 0)
3923 start += self->length;
3924 if (start < 0)
3925 start = 0;
3926
3927 if (substring->length == 0)
3928 return 1;
3929
3930 if (end > self->length)
3931 end = self->length;
3932 if (end < 0)
3933 end += self->length;
3934 if (end < 0)
3935 end = 0;
3936
3937 end -= substring->length;
3938 if (end < start)
3939 return 0;
3940
3941 if (direction > 0) {
3942 if (Py_UNICODE_MATCH(self, end, substring))
3943 return 1;
3944 } else {
3945 if (Py_UNICODE_MATCH(self, start, substring))
3946 return 1;
3947 }
3948
3949 return 0;
3950}
3951
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t start,
3955 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 int direction)
3957{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003958 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003959
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 str = PyUnicode_FromObject(str);
3961 if (str == NULL)
3962 return -1;
3963 substr = PyUnicode_FromObject(substr);
3964 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003965 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 return -1;
3967 }
Tim Petersced69f82003-09-16 20:30:58 +00003968
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 result = tailmatch((PyUnicodeObject *)str,
3970 (PyUnicodeObject *)substr,
3971 start, end, direction);
3972 Py_DECREF(str);
3973 Py_DECREF(substr);
3974 return result;
3975}
3976
Tim Petersced69f82003-09-16 20:30:58 +00003977static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003979 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 Py_UNICODE ch)
3981{
3982 /* like wcschr, but doesn't stop at NULL characters */
3983
3984 while (size-- > 0) {
3985 if (*s == ch)
3986 return s;
3987 s++;
3988 }
3989
3990 return NULL;
3991}
3992
3993/* Apply fixfct filter to the Unicode object self and return a
3994 reference to the modified object */
3995
Tim Petersced69f82003-09-16 20:30:58 +00003996static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997PyObject *fixup(PyUnicodeObject *self,
3998 int (*fixfct)(PyUnicodeObject *s))
3999{
4000
4001 PyUnicodeObject *u;
4002
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004003 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 if (u == NULL)
4005 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004006
4007 Py_UNICODE_COPY(u->str, self->str, self->length);
4008
Tim Peters7a29bd52001-09-12 03:03:31 +00004009 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 /* fixfct should return TRUE if it modified the buffer. If
4011 FALSE, return a reference to the original buffer instead
4012 (to save space, not time) */
4013 Py_INCREF(self);
4014 Py_DECREF(u);
4015 return (PyObject*) self;
4016 }
4017 return (PyObject*) u;
4018}
4019
Tim Petersced69f82003-09-16 20:30:58 +00004020static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021int fixupper(PyUnicodeObject *self)
4022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004023 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 Py_UNICODE *s = self->str;
4025 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004026
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 while (len-- > 0) {
4028 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004029
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 ch = Py_UNICODE_TOUPPER(*s);
4031 if (ch != *s) {
4032 status = 1;
4033 *s = ch;
4034 }
4035 s++;
4036 }
4037
4038 return status;
4039}
4040
Tim Petersced69f82003-09-16 20:30:58 +00004041static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042int fixlower(PyUnicodeObject *self)
4043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 Py_UNICODE *s = self->str;
4046 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004047
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 while (len-- > 0) {
4049 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 ch = Py_UNICODE_TOLOWER(*s);
4052 if (ch != *s) {
4053 status = 1;
4054 *s = ch;
4055 }
4056 s++;
4057 }
4058
4059 return status;
4060}
4061
Tim Petersced69f82003-09-16 20:30:58 +00004062static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063int fixswapcase(PyUnicodeObject *self)
4064{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 Py_UNICODE *s = self->str;
4067 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004068
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 while (len-- > 0) {
4070 if (Py_UNICODE_ISUPPER(*s)) {
4071 *s = Py_UNICODE_TOLOWER(*s);
4072 status = 1;
4073 } else if (Py_UNICODE_ISLOWER(*s)) {
4074 *s = Py_UNICODE_TOUPPER(*s);
4075 status = 1;
4076 }
4077 s++;
4078 }
4079
4080 return status;
4081}
4082
Tim Petersced69f82003-09-16 20:30:58 +00004083static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084int fixcapitalize(PyUnicodeObject *self)
4085{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004086 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004087 Py_UNICODE *s = self->str;
4088 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004089
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004090 if (len == 0)
4091 return 0;
4092 if (Py_UNICODE_ISLOWER(*s)) {
4093 *s = Py_UNICODE_TOUPPER(*s);
4094 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004096 s++;
4097 while (--len > 0) {
4098 if (Py_UNICODE_ISUPPER(*s)) {
4099 *s = Py_UNICODE_TOLOWER(*s);
4100 status = 1;
4101 }
4102 s++;
4103 }
4104 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105}
4106
4107static
4108int fixtitle(PyUnicodeObject *self)
4109{
4110 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4111 register Py_UNICODE *e;
4112 int previous_is_cased;
4113
4114 /* Shortcut for single character strings */
4115 if (PyUnicode_GET_SIZE(self) == 1) {
4116 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4117 if (*p != ch) {
4118 *p = ch;
4119 return 1;
4120 }
4121 else
4122 return 0;
4123 }
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 e = p + PyUnicode_GET_SIZE(self);
4126 previous_is_cased = 0;
4127 for (; p < e; p++) {
4128 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 if (previous_is_cased)
4131 *p = Py_UNICODE_TOLOWER(ch);
4132 else
4133 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004134
4135 if (Py_UNICODE_ISLOWER(ch) ||
4136 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 Py_UNICODE_ISTITLE(ch))
4138 previous_is_cased = 1;
4139 else
4140 previous_is_cased = 0;
4141 }
4142 return 1;
4143}
4144
Tim Peters8ce9f162004-08-27 01:49:32 +00004145PyObject *
4146PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147{
Tim Peters8ce9f162004-08-27 01:49:32 +00004148 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004149 const Py_UNICODE blank = ' ';
4150 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004151 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004152 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004153 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4154 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004155 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4156 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004158 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004159 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160
Tim Peters05eba1f2004-08-27 21:32:02 +00004161 fseq = PySequence_Fast(seq, "");
4162 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004163 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004164 }
4165
Tim Peters91879ab2004-08-27 22:35:44 +00004166 /* Grrrr. A codec may be invoked to convert str objects to
4167 * Unicode, and so it's possible to call back into Python code
4168 * during PyUnicode_FromObject(), and so it's possible for a sick
4169 * codec to change the size of fseq (if seq is a list). Therefore
4170 * we have to keep refetching the size -- can't assume seqlen
4171 * is invariant.
4172 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004173 seqlen = PySequence_Fast_GET_SIZE(fseq);
4174 /* If empty sequence, return u"". */
4175 if (seqlen == 0) {
4176 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4177 goto Done;
4178 }
4179 /* If singleton sequence with an exact Unicode, return that. */
4180 if (seqlen == 1) {
4181 item = PySequence_Fast_GET_ITEM(fseq, 0);
4182 if (PyUnicode_CheckExact(item)) {
4183 Py_INCREF(item);
4184 res = (PyUnicodeObject *)item;
4185 goto Done;
4186 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004187 }
4188
Tim Peters05eba1f2004-08-27 21:32:02 +00004189 /* At least two items to join, or one that isn't exact Unicode. */
4190 if (seqlen > 1) {
4191 /* Set up sep and seplen -- they're needed. */
4192 if (separator == NULL) {
4193 sep = &blank;
4194 seplen = 1;
4195 }
4196 else {
4197 internal_separator = PyUnicode_FromObject(separator);
4198 if (internal_separator == NULL)
4199 goto onError;
4200 sep = PyUnicode_AS_UNICODE(internal_separator);
4201 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004202 /* In case PyUnicode_FromObject() mutated seq. */
4203 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004204 }
4205 }
4206
4207 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004208 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004209 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004210 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004211 res_p = PyUnicode_AS_UNICODE(res);
4212 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004213
Tim Peters05eba1f2004-08-27 21:32:02 +00004214 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004215 Py_ssize_t itemlen;
4216 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004217
4218 item = PySequence_Fast_GET_ITEM(fseq, i);
4219 /* Convert item to Unicode. */
4220 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4221 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004222 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004223 " %.80s found",
4224 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004225 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004226 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004227 item = PyUnicode_FromObject(item);
4228 if (item == NULL)
4229 goto onError;
4230 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004231
Tim Peters91879ab2004-08-27 22:35:44 +00004232 /* In case PyUnicode_FromObject() mutated seq. */
4233 seqlen = PySequence_Fast_GET_SIZE(fseq);
4234
Tim Peters8ce9f162004-08-27 01:49:32 +00004235 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004237 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004238 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004239 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004240 if (i < seqlen - 1) {
4241 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004242 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004243 goto Overflow;
4244 }
4245 if (new_res_used > res_alloc) {
4246 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004247 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004248 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004249 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004250 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004251 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004252 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004253 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004255 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004256 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004258
4259 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004260 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004261 res_p += itemlen;
4262 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004263 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004264 res_p += seplen;
4265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004267 res_used = new_res_used;
4268 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004269
Tim Peters05eba1f2004-08-27 21:32:02 +00004270 /* Shrink res to match the used area; this probably can't fail,
4271 * but it's cheap to check.
4272 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004273 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004274 goto onError;
4275
4276 Done:
4277 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004278 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 return (PyObject *)res;
4280
Tim Peters8ce9f162004-08-27 01:49:32 +00004281 Overflow:
4282 PyErr_SetString(PyExc_OverflowError,
4283 "join() is too long for a Python string");
4284 Py_DECREF(item);
4285 /* fall through */
4286
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004288 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004289 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004290 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 return NULL;
4292}
4293
Tim Petersced69f82003-09-16 20:30:58 +00004294static
4295PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004296 Py_ssize_t left,
4297 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 Py_UNICODE fill)
4299{
4300 PyUnicodeObject *u;
4301
4302 if (left < 0)
4303 left = 0;
4304 if (right < 0)
4305 right = 0;
4306
Tim Peters7a29bd52001-09-12 03:03:31 +00004307 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308 Py_INCREF(self);
4309 return self;
4310 }
4311
4312 u = _PyUnicode_New(left + self->length + right);
4313 if (u) {
4314 if (left)
4315 Py_UNICODE_FILL(u->str, fill, left);
4316 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4317 if (right)
4318 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4319 }
4320
4321 return u;
4322}
4323
/* Append data[left:right] as a new unicode object to the list being
   built.  Relies on the enclosing function providing a PyObject *str
   scratch variable, a PyObject *list, and an onError label that is
   jumped to when the string cannot be created or appended.

   Wrapped in do { } while (0) so that the expansion is one statement
   and the caller's trailing semicolon is consumed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4334
/* Insert data[left:right] as a new unicode object at the front of the
   list being built.  Relies on the enclosing function providing a
   PyObject *str scratch variable, a PyObject *list, and an onError
   label that is jumped to when the string cannot be created or
   inserted.

   Wrapped in do { } while (0) so that the expansion is one statement
   and the caller's trailing semicolon is consumed safely. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4345
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346static
4347PyObject *split_whitespace(PyUnicodeObject *self,
4348 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004349 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351 register Py_ssize_t i;
4352 register Py_ssize_t j;
4353 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 PyObject *str;
4355
4356 for (i = j = 0; i < len; ) {
4357 /* find a token */
4358 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4359 i++;
4360 j = i;
4361 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4362 i++;
4363 if (j < i) {
4364 if (maxcount-- <= 0)
4365 break;
4366 SPLIT_APPEND(self->str, j, i);
4367 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4368 i++;
4369 j = i;
4370 }
4371 }
4372 if (j < len) {
4373 SPLIT_APPEND(self->str, j, len);
4374 }
4375 return list;
4376
4377 onError:
4378 Py_DECREF(list);
4379 return NULL;
4380}
4381
4382PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004383 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 register Py_ssize_t i;
4386 register Py_ssize_t j;
4387 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 PyObject *list;
4389 PyObject *str;
4390 Py_UNICODE *data;
4391
4392 string = PyUnicode_FromObject(string);
4393 if (string == NULL)
4394 return NULL;
4395 data = PyUnicode_AS_UNICODE(string);
4396 len = PyUnicode_GET_SIZE(string);
4397
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 list = PyList_New(0);
4399 if (!list)
4400 goto onError;
4401
4402 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004403 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004404
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 /* Find a line and append it */
4406 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4407 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408
4409 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004410 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 if (i < len) {
4412 if (data[i] == '\r' && i + 1 < len &&
4413 data[i+1] == '\n')
4414 i += 2;
4415 else
4416 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004417 if (keepends)
4418 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 }
Guido van Rossum86662912000-04-11 15:38:46 +00004420 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 j = i;
4422 }
4423 if (j < len) {
4424 SPLIT_APPEND(data, j, len);
4425 }
4426
4427 Py_DECREF(string);
4428 return list;
4429
4430 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004431 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 Py_DECREF(string);
4433 return NULL;
4434}
4435
Tim Petersced69f82003-09-16 20:30:58 +00004436static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437PyObject *split_char(PyUnicodeObject *self,
4438 PyObject *list,
4439 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004440 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 register Py_ssize_t i;
4443 register Py_ssize_t j;
4444 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 PyObject *str;
4446
4447 for (i = j = 0; i < len; ) {
4448 if (self->str[i] == ch) {
4449 if (maxcount-- <= 0)
4450 break;
4451 SPLIT_APPEND(self->str, j, i);
4452 i = j = i + 1;
4453 } else
4454 i++;
4455 }
4456 if (j <= len) {
4457 SPLIT_APPEND(self->str, j, len);
4458 }
4459 return list;
4460
4461 onError:
4462 Py_DECREF(list);
4463 return NULL;
4464}
4465
Tim Petersced69f82003-09-16 20:30:58 +00004466static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467PyObject *split_substring(PyUnicodeObject *self,
4468 PyObject *list,
4469 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004470 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004472 register Py_ssize_t i;
4473 register Py_ssize_t j;
4474 Py_ssize_t len = self->length;
4475 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 PyObject *str;
4477
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004478 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 if (Py_UNICODE_MATCH(self, i, substring)) {
4480 if (maxcount-- <= 0)
4481 break;
4482 SPLIT_APPEND(self->str, j, i);
4483 i = j = i + sublen;
4484 } else
4485 i++;
4486 }
4487 if (j <= len) {
4488 SPLIT_APPEND(self->str, j, len);
4489 }
4490 return list;
4491
4492 onError:
4493 Py_DECREF(list);
4494 return NULL;
4495}
4496
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004497static
4498PyObject *rsplit_whitespace(PyUnicodeObject *self,
4499 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 register Py_ssize_t i;
4503 register Py_ssize_t j;
4504 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004505 PyObject *str;
4506
4507 for (i = j = len - 1; i >= 0; ) {
4508 /* find a token */
4509 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4510 i--;
4511 j = i;
4512 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4513 i--;
4514 if (j > i) {
4515 if (maxcount-- <= 0)
4516 break;
4517 SPLIT_INSERT(self->str, i + 1, j + 1);
4518 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4519 i--;
4520 j = i;
4521 }
4522 }
4523 if (j >= 0) {
4524 SPLIT_INSERT(self->str, 0, j + 1);
4525 }
4526 return list;
4527
4528 onError:
4529 Py_DECREF(list);
4530 return NULL;
4531}
4532
4533static
4534PyObject *rsplit_char(PyUnicodeObject *self,
4535 PyObject *list,
4536 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004537 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004538{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004539 register Py_ssize_t i;
4540 register Py_ssize_t j;
4541 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004542 PyObject *str;
4543
4544 for (i = j = len - 1; i >= 0; ) {
4545 if (self->str[i] == ch) {
4546 if (maxcount-- <= 0)
4547 break;
4548 SPLIT_INSERT(self->str, i + 1, j + 1);
4549 j = i = i - 1;
4550 } else
4551 i--;
4552 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004553 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004554 SPLIT_INSERT(self->str, 0, j + 1);
4555 }
4556 return list;
4557
4558 onError:
4559 Py_DECREF(list);
4560 return NULL;
4561}
4562
4563static
4564PyObject *rsplit_substring(PyUnicodeObject *self,
4565 PyObject *list,
4566 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004568{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 register Py_ssize_t i;
4570 register Py_ssize_t j;
4571 Py_ssize_t len = self->length;
4572 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004573 PyObject *str;
4574
4575 for (i = len - sublen, j = len; i >= 0; ) {
4576 if (Py_UNICODE_MATCH(self, i, substring)) {
4577 if (maxcount-- <= 0)
4578 break;
4579 SPLIT_INSERT(self->str, i + sublen, j);
4580 j = i;
4581 i -= sublen;
4582 } else
4583 i--;
4584 }
4585 if (j >= 0) {
4586 SPLIT_INSERT(self->str, 0, j);
4587 }
4588 return list;
4589
4590 onError:
4591 Py_DECREF(list);
4592 return NULL;
4593}
4594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004596#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597
4598static
4599PyObject *split(PyUnicodeObject *self,
4600 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004601 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602{
4603 PyObject *list;
4604
4605 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004606 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607
4608 list = PyList_New(0);
4609 if (!list)
4610 return NULL;
4611
4612 if (substring == NULL)
4613 return split_whitespace(self,list,maxcount);
4614
4615 else if (substring->length == 1)
4616 return split_char(self,list,substring->str[0],maxcount);
4617
4618 else if (substring->length == 0) {
4619 Py_DECREF(list);
4620 PyErr_SetString(PyExc_ValueError, "empty separator");
4621 return NULL;
4622 }
4623 else
4624 return split_substring(self,list,substring,maxcount);
4625}
4626
Tim Petersced69f82003-09-16 20:30:58 +00004627static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004628PyObject *rsplit(PyUnicodeObject *self,
4629 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004630 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004631{
4632 PyObject *list;
4633
4634 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004635 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004636
4637 list = PyList_New(0);
4638 if (!list)
4639 return NULL;
4640
4641 if (substring == NULL)
4642 return rsplit_whitespace(self,list,maxcount);
4643
4644 else if (substring->length == 1)
4645 return rsplit_char(self,list,substring->str[0],maxcount);
4646
4647 else if (substring->length == 0) {
4648 Py_DECREF(list);
4649 PyErr_SetString(PyExc_ValueError, "empty separator");
4650 return NULL;
4651 }
4652 else
4653 return rsplit_substring(self,list,substring,maxcount);
4654}
4655
4656static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657PyObject *replace(PyUnicodeObject *self,
4658 PyUnicodeObject *str1,
4659 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661{
4662 PyUnicodeObject *u;
4663
4664 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004665 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666
4667 if (str1->length == 1 && str2->length == 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004668 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669
4670 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004671 if (!findchar(self->str, self->length, str1->str[0]) &&
4672 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 /* nothing to replace, return original string */
4674 Py_INCREF(self);
4675 u = self;
4676 } else {
4677 Py_UNICODE u1 = str1->str[0];
4678 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004679
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004681 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 self->length
4683 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004684 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004685 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004686 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 for (i = 0; i < u->length; i++)
4688 if (u->str[i] == u1) {
4689 if (--maxcount < 0)
4690 break;
4691 u->str[i] = u2;
4692 }
4693 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695
4696 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 Py_UNICODE *p;
4699
4700 /* replace strings */
4701 n = count(self, 0, self->length, str1);
4702 if (n > maxcount)
4703 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004704 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004706 if (PyUnicode_CheckExact(self)) {
4707 Py_INCREF(self);
4708 u = self;
4709 }
4710 else {
4711 u = (PyUnicodeObject *)
4712 PyUnicode_FromUnicode(self->str, self->length);
4713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 } else {
4715 u = _PyUnicode_New(
4716 self->length + n * (str2->length - str1->length));
4717 if (u) {
4718 i = 0;
4719 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004720 if (str1->length > 0) {
4721 while (i <= self->length - str1->length)
4722 if (Py_UNICODE_MATCH(self, i, str1)) {
4723 /* replace string segment */
4724 Py_UNICODE_COPY(p, str2->str, str2->length);
4725 p += str2->length;
4726 i += str1->length;
4727 if (--n <= 0) {
4728 /* copy remaining part */
4729 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4730 break;
4731 }
4732 } else
4733 *p++ = self->str[i++];
4734 } else {
4735 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 Py_UNICODE_COPY(p, str2->str, str2->length);
4737 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004738 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004741 }
4742 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 }
4745 }
4746 }
Tim Petersced69f82003-09-16 20:30:58 +00004747
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 return (PyObject *) u;
4749}
4750
4751/* --- Unicode Object Methods --------------------------------------------- */
4752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004753PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754"S.title() -> unicode\n\
4755\n\
4756Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004757characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
4759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004760unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 return fixup(self, fixtitle);
4763}
4764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004765PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766"S.capitalize() -> unicode\n\
4767\n\
4768Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004769have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770
4771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004772unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 return fixup(self, fixcapitalize);
4775}
4776
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the surrounding #if 0.

   Splits self with split(self, NULL, -1), replaces every list item by
   its fixup(..., fixcapitalize) result and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL)
            goto done;
        /* Drop the list's reference to the old word before the slot
           is overwritten with the new one. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4814
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004815/* Argument converter. Coerces to a single unicode character */
4816
4817static int
4818convert_uc(PyObject *obj, void *addr)
4819{
4820 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4821 PyObject *uniobj;
4822 Py_UNICODE *unistr;
4823
4824 uniobj = PyUnicode_FromObject(obj);
4825 if (uniobj == NULL) {
4826 PyErr_SetString(PyExc_TypeError,
4827 "The fill character cannot be converted to Unicode");
4828 return 0;
4829 }
4830 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4831 PyErr_SetString(PyExc_TypeError,
4832 "The fill character must be exactly one character long");
4833 Py_DECREF(uniobj);
4834 return 0;
4835 }
4836 unistr = PyUnicode_AS_UNICODE(uniobj);
4837 *fillcharloc = unistr[0];
4838 Py_DECREF(uniobj);
4839 return 1;
4840}
4841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004842PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004843"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004845Return S centered in a Unicode string of length width. Padding is\n\
4846done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847
4848static PyObject *
4849unicode_center(PyUnicodeObject *self, PyObject *args)
4850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 Py_ssize_t marg, left;
4852 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004853 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
Thomas Woutersde017742006-02-16 19:34:37 +00004855 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 return NULL;
4857
Tim Peters7a29bd52001-09-12 03:03:31 +00004858 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 Py_INCREF(self);
4860 return (PyObject*) self;
4861 }
4862
4863 marg = width - self->length;
4864 left = marg / 2 + (marg & width & 1);
4865
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004866 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867}
4868
Marc-André Lemburge5034372000-08-08 08:04:29 +00004869#if 0
4870
4871/* This code should go into some future Unicode collation support
4872 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004873 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004874
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004875/* speedy UTF-16 code point order comparison */
4876/* gleaned from: */
4877/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4878
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004879static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004880{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004881 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004882 0, 0, 0, 0, 0, 0, 0, 0,
4883 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004884 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004885};
4886
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887static int
4888unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4889{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004890 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004891
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 Py_UNICODE *s1 = str1->str;
4893 Py_UNICODE *s2 = str2->str;
4894
4895 len1 = str1->length;
4896 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004897
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004899 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004900
4901 c1 = *s1++;
4902 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004903
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004904 if (c1 > (1<<11) * 26)
4905 c1 += utf16Fixup[c1>>11];
4906 if (c2 > (1<<11) * 26)
4907 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004908 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004909
4910 if (c1 != c2)
4911 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004912
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004913 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 }
4915
4916 return (len1 < len2) ? -1 : (len1 != len2);
4917}
4918
Marc-André Lemburge5034372000-08-08 08:04:29 +00004919#else
4920
4921static int
4922unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4923{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004924 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004925
4926 Py_UNICODE *s1 = str1->str;
4927 Py_UNICODE *s2 = str2->str;
4928
4929 len1 = str1->length;
4930 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004931
Marc-André Lemburge5034372000-08-08 08:04:29 +00004932 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004933 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004934
Fredrik Lundh45714e92001-06-26 16:39:36 +00004935 c1 = *s1++;
4936 c2 = *s2++;
4937
4938 if (c1 != c2)
4939 return (c1 < c2) ? -1 : 1;
4940
Marc-André Lemburge5034372000-08-08 08:04:29 +00004941 len1--; len2--;
4942 }
4943
4944 return (len1 < len2) ? -1 : (len1 != len2);
4945}
4946
4947#endif
4948
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949int PyUnicode_Compare(PyObject *left,
4950 PyObject *right)
4951{
4952 PyUnicodeObject *u = NULL, *v = NULL;
4953 int result;
4954
4955 /* Coerce the two arguments */
4956 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4957 if (u == NULL)
4958 goto onError;
4959 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4960 if (v == NULL)
4961 goto onError;
4962
Thomas Wouters7e474022000-07-16 12:04:32 +00004963 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 if (v == u) {
4965 Py_DECREF(u);
4966 Py_DECREF(v);
4967 return 0;
4968 }
4969
4970 result = unicode_compare(u, v);
4971
4972 Py_DECREF(u);
4973 Py_DECREF(v);
4974 return result;
4975
4976onError:
4977 Py_XDECREF(u);
4978 Py_XDECREF(v);
4979 return -1;
4980}
4981
Guido van Rossum403d68b2000-03-13 15:55:09 +00004982int PyUnicode_Contains(PyObject *container,
4983 PyObject *element)
4984{
Fredrik Lundh833bf942006-05-23 10:12:21 +00004985 PyUnicodeObject *u, *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004986 int result;
4987 Py_ssize_t size;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004988
4989 /* Coerce the two arguments */
Fredrik Lundh833bf942006-05-23 10:12:21 +00004990 v = (PyUnicodeObject *) PyUnicode_FromObject(element);
4991 if (!v) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004992 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004993 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00004994 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004995 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00004996
4997 u = (PyUnicodeObject *) PyUnicode_FromObject(container);
4998 if (!u) {
4999 Py_DECREF(v);
5000 return -1;
5001 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005002
Barry Warsaw817918c2002-08-06 16:58:21 +00005003 size = PyUnicode_GET_SIZE(v);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005004 if (!size) {
5005 result = 1;
5006 goto done;
5007 }
Barry Warsaw817918c2002-08-06 16:58:21 +00005008
Guido van Rossum403d68b2000-03-13 15:55:09 +00005009 result = 0;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005010
Barry Warsaw817918c2002-08-06 16:58:21 +00005011 if (size == 1) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00005012 Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
5013 Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
5014 Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
5015 for (; ptr < end; ptr++) {
5016 if (*ptr == chr) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005017 result = 1;
5018 break;
5019 }
5020 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005021 } else {
5022 int start = 0;
5023 int end = PyUnicode_GET_SIZE(u) - size;
5024 for (; start <= end; start++)
5025 if (Py_UNICODE_MATCH(u, start, v)) {
5026 result = 1;
5027 break;
5028 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005029 }
5030
Fredrik Lundh833bf942006-05-23 10:12:21 +00005031done:
Guido van Rossum403d68b2000-03-13 15:55:09 +00005032 Py_DECREF(u);
5033 Py_DECREF(v);
5034 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005035}
5036
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037/* Concat to string or Unicode object giving a new Unicode object. */
5038
5039PyObject *PyUnicode_Concat(PyObject *left,
5040 PyObject *right)
5041{
5042 PyUnicodeObject *u = NULL, *v = NULL, *w;
5043
5044 /* Coerce the two arguments */
5045 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5046 if (u == NULL)
5047 goto onError;
5048 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5049 if (v == NULL)
5050 goto onError;
5051
5052 /* Shortcuts */
5053 if (v == unicode_empty) {
5054 Py_DECREF(v);
5055 return (PyObject *)u;
5056 }
5057 if (u == unicode_empty) {
5058 Py_DECREF(u);
5059 return (PyObject *)v;
5060 }
5061
5062 /* Concat the two Unicode strings */
5063 w = _PyUnicode_New(u->length + v->length);
5064 if (w == NULL)
5065 goto onError;
5066 Py_UNICODE_COPY(w->str, u->str, u->length);
5067 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5068
5069 Py_DECREF(u);
5070 Py_DECREF(v);
5071 return (PyObject *)w;
5072
5073onError:
5074 Py_XDECREF(u);
5075 Py_XDECREF(v);
5076 return NULL;
5077}
5078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005079PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080"S.count(sub[, start[, end]]) -> int\n\
5081\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005082Return the number of non-overlapping occurrences of substring sub in\n\
5083Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005084interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085
5086static PyObject *
5087unicode_count(PyUnicodeObject *self, PyObject *args)
5088{
5089 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005090 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005091 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 PyObject *result;
5093
Guido van Rossumb8872e62000-05-09 14:14:27 +00005094 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5095 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 return NULL;
5097
5098 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5099 (PyObject *)substring);
5100 if (substring == NULL)
5101 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005102
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 if (start < 0)
5104 start += self->length;
5105 if (start < 0)
5106 start = 0;
5107 if (end > self->length)
5108 end = self->length;
5109 if (end < 0)
5110 end += self->length;
5111 if (end < 0)
5112 end = 0;
5113
5114 result = PyInt_FromLong((long) count(self, start, end, substring));
5115
5116 Py_DECREF(substring);
5117 return result;
5118}
5119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005120PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005121"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005123Encodes S using the codec registered for encoding. encoding defaults\n\
5124to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005125handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5127'xmlcharrefreplace' as well as any other name registered with\n\
5128codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129
5130static PyObject *
5131unicode_encode(PyUnicodeObject *self, PyObject *args)
5132{
5133 char *encoding = NULL;
5134 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005135 PyObject *v;
5136
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5138 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005139 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005140 if (v == NULL)
5141 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005142 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5143 PyErr_Format(PyExc_TypeError,
5144 "encoder did not return a string/unicode object "
5145 "(type=%.400s)",
5146 v->ob_type->tp_name);
5147 Py_DECREF(v);
5148 return NULL;
5149 }
5150 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005151
5152 onError:
5153 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005154}
5155
5156PyDoc_STRVAR(decode__doc__,
5157"S.decode([encoding[,errors]]) -> string or unicode\n\
5158\n\
5159Decodes S using the codec registered for encoding. encoding defaults\n\
5160to the default encoding. errors may be given to set a different error\n\
5161handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5162a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5163as well as any other name registerd with codecs.register_error that is\n\
5164able to handle UnicodeDecodeErrors.");
5165
5166static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005167unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005168{
5169 char *encoding = NULL;
5170 char *errors = NULL;
5171 PyObject *v;
5172
5173 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5174 return NULL;
5175 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005176 if (v == NULL)
5177 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005178 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5179 PyErr_Format(PyExc_TypeError,
5180 "decoder did not return a string/unicode object "
5181 "(type=%.400s)",
5182 v->ob_type->tp_name);
5183 Py_DECREF(v);
5184 return NULL;
5185 }
5186 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005187
5188 onError:
5189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190}
5191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005192PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193"S.expandtabs([tabsize]) -> unicode\n\
5194\n\
5195Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005196If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
5198static PyObject*
5199unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5200{
5201 Py_UNICODE *e;
5202 Py_UNICODE *p;
5203 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 PyUnicodeObject *u;
5206 int tabsize = 8;
5207
5208 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5209 return NULL;
5210
Thomas Wouters7e474022000-07-16 12:04:32 +00005211 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 i = j = 0;
5213 e = self->str + self->length;
5214 for (p = self->str; p < e; p++)
5215 if (*p == '\t') {
5216 if (tabsize > 0)
5217 j += tabsize - (j % tabsize);
5218 }
5219 else {
5220 j++;
5221 if (*p == '\n' || *p == '\r') {
5222 i += j;
5223 j = 0;
5224 }
5225 }
5226
5227 /* Second pass: create output string and fill it */
5228 u = _PyUnicode_New(i + j);
5229 if (!u)
5230 return NULL;
5231
5232 j = 0;
5233 q = u->str;
5234
5235 for (p = self->str; p < e; p++)
5236 if (*p == '\t') {
5237 if (tabsize > 0) {
5238 i = tabsize - (j % tabsize);
5239 j += i;
5240 while (i--)
5241 *q++ = ' ';
5242 }
5243 }
5244 else {
5245 j++;
5246 *q++ = *p;
5247 if (*p == '\n' || *p == '\r')
5248 j = 0;
5249 }
5250
5251 return (PyObject*) u;
5252}
5253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005254PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255"S.find(sub [,start [,end]]) -> int\n\
5256\n\
5257Return the lowest index in S where substring sub is found,\n\
5258such that sub is contained within s[start,end]. Optional\n\
5259arguments start and end are interpreted as in slice notation.\n\
5260\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005261Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
5263static PyObject *
5264unicode_find(PyUnicodeObject *self, PyObject *args)
5265{
5266 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005268 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 PyObject *result;
5270
Guido van Rossumb8872e62000-05-09 14:14:27 +00005271 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5272 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 return NULL;
5274 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5275 (PyObject *)substring);
5276 if (substring == NULL)
5277 return NULL;
5278
Martin v. Löwis18e16552006-02-15 17:27:45 +00005279 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280
5281 Py_DECREF(substring);
5282 return result;
5283}
5284
5285static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005286unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287{
5288 if (index < 0 || index >= self->length) {
5289 PyErr_SetString(PyExc_IndexError, "string index out of range");
5290 return NULL;
5291 }
5292
5293 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5294}
5295
5296static long
5297unicode_hash(PyUnicodeObject *self)
5298{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005299 /* Since Unicode objects compare equal to their ASCII string
5300 counterparts, they should use the individual character values
5301 as basis for their hash value. This is needed to assure that
5302 strings and Unicode objects behave in the same way as
5303 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
Martin v. Löwis18e16552006-02-15 17:27:45 +00005305 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005306 register Py_UNICODE *p;
5307 register long x;
5308
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 if (self->hash != -1)
5310 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005311 len = PyUnicode_GET_SIZE(self);
5312 p = PyUnicode_AS_UNICODE(self);
5313 x = *p << 7;
5314 while (--len >= 0)
5315 x = (1000003*x) ^ *p++;
5316 x ^= PyUnicode_GET_SIZE(self);
5317 if (x == -1)
5318 x = -2;
5319 self->hash = x;
5320 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321}
5322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005323PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324"S.index(sub [,start [,end]]) -> int\n\
5325\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005326Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327
5328static PyObject *
5329unicode_index(PyUnicodeObject *self, PyObject *args)
5330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005333 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005334 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335
Guido van Rossumb8872e62000-05-09 14:14:27 +00005336 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5337 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005339
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5341 (PyObject *)substring);
5342 if (substring == NULL)
5343 return NULL;
5344
5345 result = findstring(self, substring, start, end, 1);
5346
5347 Py_DECREF(substring);
5348 if (result < 0) {
5349 PyErr_SetString(PyExc_ValueError, "substring not found");
5350 return NULL;
5351 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353}
5354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005355PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005356"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005358Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005359at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360
5361static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005362unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363{
5364 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5365 register const Py_UNICODE *e;
5366 int cased;
5367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 /* Shortcut for single character strings */
5369 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005370 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005372 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005373 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005374 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005375
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 e = p + PyUnicode_GET_SIZE(self);
5377 cased = 0;
5378 for (; p < e; p++) {
5379 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005380
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005382 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 else if (!cased && Py_UNICODE_ISLOWER(ch))
5384 cased = 1;
5385 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005386 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387}
5388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005389PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005390"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005392Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005393at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394
5395static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005396unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397{
5398 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5399 register const Py_UNICODE *e;
5400 int cased;
5401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 /* Shortcut for single character strings */
5403 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005404 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005406 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005407 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005408 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 e = p + PyUnicode_GET_SIZE(self);
5411 cased = 0;
5412 for (; p < e; p++) {
5413 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005414
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005416 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 else if (!cased && Py_UNICODE_ISUPPER(ch))
5418 cased = 1;
5419 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005420 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421}
5422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005423PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005424"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005426Return True if S is a titlecased string and there is at least one\n\
5427character in S, i.e. upper- and titlecase characters may only\n\
5428follow uncased characters and lowercase characters only cased ones.\n\
5429Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430
5431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005432unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433{
5434 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5435 register const Py_UNICODE *e;
5436 int cased, previous_is_cased;
5437
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 /* Shortcut for single character strings */
5439 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005440 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5441 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005443 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005444 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005446
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 e = p + PyUnicode_GET_SIZE(self);
5448 cased = 0;
5449 previous_is_cased = 0;
5450 for (; p < e; p++) {
5451 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5454 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005455 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 previous_is_cased = 1;
5457 cased = 1;
5458 }
5459 else if (Py_UNICODE_ISLOWER(ch)) {
5460 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005461 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 previous_is_cased = 1;
5463 cased = 1;
5464 }
5465 else
5466 previous_is_cased = 0;
5467 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005468 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469}
5470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005471PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005472"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005474Return True if all characters in S are whitespace\n\
5475and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
5477static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005478unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479{
5480 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5481 register const Py_UNICODE *e;
5482
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 /* Shortcut for single character strings */
5484 if (PyUnicode_GET_SIZE(self) == 1 &&
5485 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005486 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005488 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005489 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005490 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005491
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 e = p + PyUnicode_GET_SIZE(self);
5493 for (; p < e; p++) {
5494 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005495 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005497 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498}
5499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005500PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005501"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005502\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005503Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005504and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005505
5506static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005507unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005508{
5509 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5510 register const Py_UNICODE *e;
5511
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005512 /* Shortcut for single character strings */
5513 if (PyUnicode_GET_SIZE(self) == 1 &&
5514 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005515 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005516
5517 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005518 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005519 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005520
5521 e = p + PyUnicode_GET_SIZE(self);
5522 for (; p < e; p++) {
5523 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005524 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005525 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005526 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005527}
5528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005529PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005530"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005531\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005532Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005533and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005534
5535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005536unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005537{
5538 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5539 register const Py_UNICODE *e;
5540
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005541 /* Shortcut for single character strings */
5542 if (PyUnicode_GET_SIZE(self) == 1 &&
5543 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005544 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005545
5546 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005547 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005548 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005549
5550 e = p + PyUnicode_GET_SIZE(self);
5551 for (; p < e; p++) {
5552 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005554 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005555 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005556}
5557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005558PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005559"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005561Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005562False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
5564static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005565unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566{
5567 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5568 register const Py_UNICODE *e;
5569
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 /* Shortcut for single character strings */
5571 if (PyUnicode_GET_SIZE(self) == 1 &&
5572 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005573 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005575 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005576 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005577 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005578
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 e = p + PyUnicode_GET_SIZE(self);
5580 for (; p < e; p++) {
5581 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005582 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005584 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585}
5586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005587PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005588"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005590Return True if all characters in S are digits\n\
5591and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592
5593static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005594unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595{
5596 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5597 register const Py_UNICODE *e;
5598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 /* Shortcut for single character strings */
5600 if (PyUnicode_GET_SIZE(self) == 1 &&
5601 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005602 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005604 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005605 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005606 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005607
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 e = p + PyUnicode_GET_SIZE(self);
5609 for (; p < e; p++) {
5610 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005613 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614}
5615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005616PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005617"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005619Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005620False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621
5622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005623unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624{
5625 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5626 register const Py_UNICODE *e;
5627
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 /* Shortcut for single character strings */
5629 if (PyUnicode_GET_SIZE(self) == 1 &&
5630 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005633 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005634 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005635 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005636
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 e = p + PyUnicode_GET_SIZE(self);
5638 for (; p < e; p++) {
5639 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005640 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005642 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643}
5644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005645PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646"S.join(sequence) -> unicode\n\
5647\n\
5648Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005649sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
5651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005652unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005654 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655}
5656
Martin v. Löwis18e16552006-02-15 17:27:45 +00005657static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658unicode_length(PyUnicodeObject *self)
5659{
5660 return self->length;
5661}
5662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005663PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005664"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665\n\
5666Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005667done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668
5669static PyObject *
5670unicode_ljust(PyUnicodeObject *self, PyObject *args)
5671{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005672 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005673 Py_UNICODE fillchar = ' ';
5674
Martin v. Löwis412fb672006-04-13 06:34:32 +00005675 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 return NULL;
5677
Tim Peters7a29bd52001-09-12 03:03:31 +00005678 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 Py_INCREF(self);
5680 return (PyObject*) self;
5681 }
5682
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005683 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684}
5685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005686PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687"S.lower() -> unicode\n\
5688\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005689Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690
5691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005692unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return fixup(self, fixlower);
5695}
5696
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string past its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
5705
5706static const Py_UNICODE *
5707unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5708{
Tim Peters030a5ce2002-04-22 19:00:10 +00005709 size_t i;
5710 for (i = 0; i < n; ++i)
5711 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005712 return s+i;
5713 return NULL;
5714}
5715
5716/* externally visible for str.strip(unicode) */
5717PyObject *
5718_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5719{
5720 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005722 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005723 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5724 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005725
5726 i = 0;
5727 if (striptype != RIGHTSTRIP) {
5728 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5729 i++;
5730 }
5731 }
5732
5733 j = len;
5734 if (striptype != LEFTSTRIP) {
5735 do {
5736 j--;
5737 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5738 j++;
5739 }
5740
5741 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5742 Py_INCREF(self);
5743 return (PyObject*)self;
5744 }
5745 else
5746 return PyUnicode_FromUnicode(s+i, j-i);
5747}
5748
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
5750static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005751do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005753 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005755
5756 i = 0;
5757 if (striptype != RIGHTSTRIP) {
5758 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5759 i++;
5760 }
5761 }
5762
5763 j = len;
5764 if (striptype != LEFTSTRIP) {
5765 do {
5766 j--;
5767 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5768 j++;
5769 }
5770
5771 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5772 Py_INCREF(self);
5773 return (PyObject*)self;
5774 }
5775 else
5776 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777}
5778
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005779
5780static PyObject *
5781do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5782{
5783 PyObject *sep = NULL;
5784
5785 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5786 return NULL;
5787
5788 if (sep != NULL && sep != Py_None) {
5789 if (PyUnicode_Check(sep))
5790 return _PyUnicode_XStrip(self, striptype, sep);
5791 else if (PyString_Check(sep)) {
5792 PyObject *res;
5793 sep = PyUnicode_FromObject(sep);
5794 if (sep==NULL)
5795 return NULL;
5796 res = _PyUnicode_XStrip(self, striptype, sep);
5797 Py_DECREF(sep);
5798 return res;
5799 }
5800 else {
5801 PyErr_Format(PyExc_TypeError,
5802 "%s arg must be None, unicode or str",
5803 STRIPNAME(striptype));
5804 return NULL;
5805 }
5806 }
5807
5808 return do_strip(self, striptype);
5809}
5810
5811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005812PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005813"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005814\n\
5815Return a copy of the string S with leading and trailing\n\
5816whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005817If chars is given and not None, remove characters in chars instead.\n\
5818If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005819
5820static PyObject *
5821unicode_strip(PyUnicodeObject *self, PyObject *args)
5822{
5823 if (PyTuple_GET_SIZE(args) == 0)
5824 return do_strip(self, BOTHSTRIP); /* Common case */
5825 else
5826 return do_argstrip(self, BOTHSTRIP, args);
5827}
5828
5829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005830PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005831"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005832\n\
5833Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005834If chars is given and not None, remove characters in chars instead.\n\
5835If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005836
5837static PyObject *
5838unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5839{
5840 if (PyTuple_GET_SIZE(args) == 0)
5841 return do_strip(self, LEFTSTRIP); /* Common case */
5842 else
5843 return do_argstrip(self, LEFTSTRIP, args);
5844}
5845
5846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005847PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005848"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005849\n\
5850Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005851If chars is given and not None, remove characters in chars instead.\n\
5852If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005853
5854static PyObject *
5855unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5856{
5857 if (PyTuple_GET_SIZE(args) == 0)
5858 return do_strip(self, RIGHTSTRIP); /* Common case */
5859 else
5860 return do_argstrip(self, RIGHTSTRIP, args);
5861}
5862
5863
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005865unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866{
5867 PyUnicodeObject *u;
5868 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005869 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005870 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
5872 if (len < 0)
5873 len = 0;
5874
Tim Peters7a29bd52001-09-12 03:03:31 +00005875 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 /* no repeat, return original string */
5877 Py_INCREF(str);
5878 return (PyObject*) str;
5879 }
Tim Peters8f422462000-09-09 06:13:41 +00005880
5881 /* ensure # of chars needed doesn't overflow int and # of bytes
5882 * needed doesn't overflow size_t
5883 */
5884 nchars = len * str->length;
5885 if (len && nchars / len != str->length) {
5886 PyErr_SetString(PyExc_OverflowError,
5887 "repeated string is too long");
5888 return NULL;
5889 }
5890 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5891 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5892 PyErr_SetString(PyExc_OverflowError,
5893 "repeated string is too long");
5894 return NULL;
5895 }
5896 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 if (!u)
5898 return NULL;
5899
5900 p = u->str;
5901
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005902 if (str->length == 1 && len > 0) {
5903 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005904 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00005905 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005906 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005907 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005908 done = str->length;
5909 }
5910 while (done < nchars) {
5911 int n = (done <= nchars-done) ? done : nchars-done;
5912 Py_UNICODE_COPY(p+done, p, n);
5913 done += n;
5914 }
5915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
5917 return (PyObject*) u;
5918}
5919
5920PyObject *PyUnicode_Replace(PyObject *obj,
5921 PyObject *subobj,
5922 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
5925 PyObject *self;
5926 PyObject *str1;
5927 PyObject *str2;
5928 PyObject *result;
5929
5930 self = PyUnicode_FromObject(obj);
5931 if (self == NULL)
5932 return NULL;
5933 str1 = PyUnicode_FromObject(subobj);
5934 if (str1 == NULL) {
5935 Py_DECREF(self);
5936 return NULL;
5937 }
5938 str2 = PyUnicode_FromObject(replobj);
5939 if (str2 == NULL) {
5940 Py_DECREF(self);
5941 Py_DECREF(str1);
5942 return NULL;
5943 }
Tim Petersced69f82003-09-16 20:30:58 +00005944 result = replace((PyUnicodeObject *)self,
5945 (PyUnicodeObject *)str1,
5946 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 maxcount);
5948 Py_DECREF(self);
5949 Py_DECREF(str1);
5950 Py_DECREF(str2);
5951 return result;
5952}
5953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005954PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955"S.replace (old, new[, maxsplit]) -> unicode\n\
5956\n\
5957Return a copy of S with all occurrences of substring\n\
5958old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005959given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
5961static PyObject*
5962unicode_replace(PyUnicodeObject *self, PyObject *args)
5963{
5964 PyUnicodeObject *str1;
5965 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005966 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 PyObject *result;
5968
Martin v. Löwis18e16552006-02-15 17:27:45 +00005969 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 return NULL;
5971 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5972 if (str1 == NULL)
5973 return NULL;
5974 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005975 if (str2 == NULL) {
5976 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
5980 result = replace(self, str1, str2, maxcount);
5981
5982 Py_DECREF(str1);
5983 Py_DECREF(str2);
5984 return result;
5985}
5986
5987static
5988PyObject *unicode_repr(PyObject *unicode)
5989{
5990 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5991 PyUnicode_GET_SIZE(unicode),
5992 1);
5993}
5994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005995PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996"S.rfind(sub [,start [,end]]) -> int\n\
5997\n\
5998Return the highest index in S where substring sub is found,\n\
5999such that sub is contained within s[start,end]. Optional\n\
6000arguments start and end are interpreted as in slice notation.\n\
6001\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006002Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003
6004static PyObject *
6005unicode_rfind(PyUnicodeObject *self, PyObject *args)
6006{
6007 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006008 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006009 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 PyObject *result;
6011
Guido van Rossumb8872e62000-05-09 14:14:27 +00006012 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6013 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 return NULL;
6015 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6016 (PyObject *)substring);
6017 if (substring == NULL)
6018 return NULL;
6019
Martin v. Löwis18e16552006-02-15 17:27:45 +00006020 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
6022 Py_DECREF(substring);
6023 return result;
6024}
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027"S.rindex(sub [,start [,end]]) -> int\n\
6028\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006029Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
6031static PyObject *
6032unicode_rindex(PyUnicodeObject *self, PyObject *args)
6033{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006034 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006036 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006037 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
Guido van Rossumb8872e62000-05-09 14:14:27 +00006039 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6040 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return NULL;
6042 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6043 (PyObject *)substring);
6044 if (substring == NULL)
6045 return NULL;
6046
6047 result = findstring(self, substring, start, end, -1);
6048
6049 Py_DECREF(substring);
6050 if (result < 0) {
6051 PyErr_SetString(PyExc_ValueError, "substring not found");
6052 return NULL;
6053 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006054 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055}
6056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006057PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006058"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059\n\
6060Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006061done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062
6063static PyObject *
6064unicode_rjust(PyUnicodeObject *self, PyObject *args)
6065{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006066 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006067 Py_UNICODE fillchar = ' ';
6068
Martin v. Löwis412fb672006-04-13 06:34:32 +00006069 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 return NULL;
6071
Tim Peters7a29bd52001-09-12 03:03:31 +00006072 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 Py_INCREF(self);
6074 return (PyObject*) self;
6075 }
6076
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006077 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078}
6079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006081unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082{
6083 /* standard clamping */
6084 if (start < 0)
6085 start = 0;
6086 if (end < 0)
6087 end = 0;
6088 if (end > self->length)
6089 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006090 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 /* full slice, return original string */
6092 Py_INCREF(self);
6093 return (PyObject*) self;
6094 }
6095 if (start > end)
6096 start = end;
6097 /* copy slice */
6098 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6099 end - start);
6100}
6101
6102PyObject *PyUnicode_Split(PyObject *s,
6103 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006104 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
6106 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006107
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 s = PyUnicode_FromObject(s);
6109 if (s == NULL)
6110 return NULL;
6111 if (sep != NULL) {
6112 sep = PyUnicode_FromObject(sep);
6113 if (sep == NULL) {
6114 Py_DECREF(s);
6115 return NULL;
6116 }
6117 }
6118
6119 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6120
6121 Py_DECREF(s);
6122 Py_XDECREF(sep);
6123 return result;
6124}
6125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006126PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127"S.split([sep [,maxsplit]]) -> list of strings\n\
6128\n\
6129Return a list of the words in S, using sep as the\n\
6130delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006131splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006132any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
6134static PyObject*
6135unicode_split(PyUnicodeObject *self, PyObject *args)
6136{
6137 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006138 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Martin v. Löwis18e16552006-02-15 17:27:45 +00006140 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 return NULL;
6142
6143 if (substring == Py_None)
6144 return split(self, NULL, maxcount);
6145 else if (PyUnicode_Check(substring))
6146 return split(self, (PyUnicodeObject *)substring, maxcount);
6147 else
6148 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6149}
6150
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006151PyObject *PyUnicode_RSplit(PyObject *s,
6152 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006153 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006154{
6155 PyObject *result;
6156
6157 s = PyUnicode_FromObject(s);
6158 if (s == NULL)
6159 return NULL;
6160 if (sep != NULL) {
6161 sep = PyUnicode_FromObject(sep);
6162 if (sep == NULL) {
6163 Py_DECREF(s);
6164 return NULL;
6165 }
6166 }
6167
6168 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6169
6170 Py_DECREF(s);
6171 Py_XDECREF(sep);
6172 return result;
6173}
6174
6175PyDoc_STRVAR(rsplit__doc__,
6176"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6177\n\
6178Return a list of the words in S, using sep as the\n\
6179delimiter string, starting at the end of the string and\n\
6180working to the front. If maxsplit is given, at most maxsplit\n\
6181splits are done. If sep is not specified, any whitespace string\n\
6182is a separator.");
6183
6184static PyObject*
6185unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6186{
6187 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006188 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006189
Martin v. Löwis18e16552006-02-15 17:27:45 +00006190 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006191 return NULL;
6192
6193 if (substring == Py_None)
6194 return rsplit(self, NULL, maxcount);
6195 else if (PyUnicode_Check(substring))
6196 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6197 else
6198 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6199}
6200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006201PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006202"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203\n\
6204Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006205Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006206is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207
6208static PyObject*
6209unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6210{
Guido van Rossum86662912000-04-11 15:38:46 +00006211 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
Guido van Rossum86662912000-04-11 15:38:46 +00006213 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 return NULL;
6215
Guido van Rossum86662912000-04-11 15:38:46 +00006216 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217}
6218
6219static
6220PyObject *unicode_str(PyUnicodeObject *self)
6221{
Fred Drakee4315f52000-05-09 19:53:39 +00006222 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223}
6224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006225PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226"S.swapcase() -> unicode\n\
6227\n\
6228Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006229and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230
6231static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006232unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 return fixup(self, fixswapcase);
6235}
6236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006237PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238"S.translate(table) -> unicode\n\
6239\n\
6240Return a copy of the string S, where all characters have been mapped\n\
6241through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006242Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6243Unmapped characters are left untouched. Characters mapped to None\n\
6244are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245
6246static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006247unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248{
Tim Petersced69f82003-09-16 20:30:58 +00006249 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006251 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 "ignore");
6253}
6254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006255PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256"S.upper() -> unicode\n\
6257\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006258Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259
6260static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006261unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 return fixup(self, fixupper);
6264}
6265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006266PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267"S.zfill(width) -> unicode\n\
6268\n\
6269Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006270of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
6272static PyObject *
6273unicode_zfill(PyUnicodeObject *self, PyObject *args)
6274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006275 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 PyUnicodeObject *u;
6277
Martin v. Löwis18e16552006-02-15 17:27:45 +00006278 Py_ssize_t width;
6279 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 return NULL;
6281
6282 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006283 if (PyUnicode_CheckExact(self)) {
6284 Py_INCREF(self);
6285 return (PyObject*) self;
6286 }
6287 else
6288 return PyUnicode_FromUnicode(
6289 PyUnicode_AS_UNICODE(self),
6290 PyUnicode_GET_SIZE(self)
6291 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 }
6293
6294 fill = width - self->length;
6295
6296 u = pad(self, fill, 0, '0');
6297
Walter Dörwald068325e2002-04-15 13:36:47 +00006298 if (u == NULL)
6299 return NULL;
6300
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 if (u->str[fill] == '+' || u->str[fill] == '-') {
6302 /* move sign to beginning of string */
6303 u->str[0] = u->str[fill];
6304 u->str[fill] = '0';
6305 }
6306
6307 return (PyObject*) u;
6308}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309
6310#if 0
6311static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006312unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 return PyInt_FromLong(unicode_freelist_size);
6315}
6316#endif
6317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006318PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006319"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006321Return True if S starts with the specified prefix, False otherwise.\n\
6322With optional start, test S beginning at that position.\n\
6323With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324
6325static PyObject *
6326unicode_startswith(PyUnicodeObject *self,
6327 PyObject *args)
6328{
6329 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006330 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006331 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 PyObject *result;
6333
Guido van Rossumb8872e62000-05-09 14:14:27 +00006334 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6335 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 return NULL;
6337 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6338 (PyObject *)substring);
6339 if (substring == NULL)
6340 return NULL;
6341
Guido van Rossum77f6a652002-04-03 22:41:51 +00006342 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
6344 Py_DECREF(substring);
6345 return result;
6346}
6347
6348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006349PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006350"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006352Return True if S ends with the specified suffix, False otherwise.\n\
6353With optional start, test S beginning at that position.\n\
6354With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
6356static PyObject *
6357unicode_endswith(PyUnicodeObject *self,
6358 PyObject *args)
6359{
6360 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006361 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006362 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 PyObject *result;
6364
Guido van Rossumb8872e62000-05-09 14:14:27 +00006365 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6366 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 return NULL;
6368 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6369 (PyObject *)substring);
6370 if (substring == NULL)
6371 return NULL;
6372
Guido van Rossum77f6a652002-04-03 22:41:51 +00006373 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374
6375 Py_DECREF(substring);
6376 return result;
6377}
6378
6379
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006380
6381static PyObject *
6382unicode_getnewargs(PyUnicodeObject *v)
6383{
6384 return Py_BuildValue("(u#)", v->str, v->length);
6385}
6386
6387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388static PyMethodDef unicode_methods[] = {
6389
6390 /* Order is according to common usage: often used methods should
6391 appear first, since lookup is done sequentially. */
6392
Georg Brandlecdc0a92006-03-30 12:19:07 +00006393 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006394 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6395 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006396 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006397 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6398 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6399 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6400 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6401 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6402 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6403 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6404 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6405 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6406 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006407 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006408 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006409/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6410 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6411 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6412 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006413 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006414 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006415 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006416 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6417 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6418 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6419 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6420 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6421 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6422 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6423 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6424 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6425 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6426 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6427 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6428 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6429 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006430 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006431#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006432 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433#endif
6434
6435#if 0
6436 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006437 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438#endif
6439
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006440 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 {NULL, NULL}
6442};
6443
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006444static PyObject *
6445unicode_mod(PyObject *v, PyObject *w)
6446{
6447 if (!PyUnicode_Check(v)) {
6448 Py_INCREF(Py_NotImplemented);
6449 return Py_NotImplemented;
6450 }
6451 return PyUnicode_Format(v, w);
6452}
6453
6454static PyNumberMethods unicode_as_number = {
6455 0, /*nb_add*/
6456 0, /*nb_subtract*/
6457 0, /*nb_multiply*/
6458 0, /*nb_divide*/
6459 unicode_mod, /*nb_remainder*/
6460};
6461
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006463 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006464 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006465 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6466 (ssizeargfunc) unicode_getitem, /* sq_item */
6467 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 0, /* sq_ass_item */
6469 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006470 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471};
6472
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006473#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6474
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006475static PyObject*
6476unicode_subscript(PyUnicodeObject* self, PyObject* item)
6477{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006478 PyNumberMethods *nb = item->ob_type->tp_as_number;
6479 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6480 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006481 if (i == -1 && PyErr_Occurred())
6482 return NULL;
6483 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006484 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006485 return unicode_getitem(self, i);
6486 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006487 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006488 Py_UNICODE* source_buf;
6489 Py_UNICODE* result_buf;
6490 PyObject* result;
6491
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006492 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006493 &start, &stop, &step, &slicelength) < 0) {
6494 return NULL;
6495 }
6496
6497 if (slicelength <= 0) {
6498 return PyUnicode_FromUnicode(NULL, 0);
6499 } else {
6500 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006501 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6502 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006503
6504 if (result_buf == NULL)
6505 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006506
6507 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6508 result_buf[i] = source_buf[cur];
6509 }
Tim Petersced69f82003-09-16 20:30:58 +00006510
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006511 result = PyUnicode_FromUnicode(result_buf, slicelength);
6512 PyMem_FREE(result_buf);
6513 return result;
6514 }
6515 } else {
6516 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6517 return NULL;
6518 }
6519}
6520
6521static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006522 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006523 (binaryfunc)unicode_subscript, /* mp_subscript */
6524 (objobjargproc)0, /* mp_ass_subscript */
6525};
6526
Martin v. Löwis18e16552006-02-15 17:27:45 +00006527static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006529 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 const void **ptr)
6531{
6532 if (index != 0) {
6533 PyErr_SetString(PyExc_SystemError,
6534 "accessing non-existent unicode segment");
6535 return -1;
6536 }
6537 *ptr = (void *) self->str;
6538 return PyUnicode_GET_DATA_SIZE(self);
6539}
6540
Martin v. Löwis18e16552006-02-15 17:27:45 +00006541static Py_ssize_t
6542unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 const void **ptr)
6544{
6545 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006546 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 return -1;
6548}
6549
6550static int
6551unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006552 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553{
6554 if (lenp)
6555 *lenp = PyUnicode_GET_DATA_SIZE(self);
6556 return 1;
6557}
6558
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006559static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006561 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 const void **ptr)
6563{
6564 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006565
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 if (index != 0) {
6567 PyErr_SetString(PyExc_SystemError,
6568 "accessing non-existent unicode segment");
6569 return -1;
6570 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006571 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 if (str == NULL)
6573 return -1;
6574 *ptr = (void *) PyString_AS_STRING(str);
6575 return PyString_GET_SIZE(str);
6576}
6577
6578/* Helpers for PyUnicode_Format() */
6579
6580static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006581getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006583 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 if (argidx < arglen) {
6585 (*p_argidx)++;
6586 if (arglen < 0)
6587 return args;
6588 else
6589 return PyTuple_GetItem(args, argidx);
6590 }
6591 PyErr_SetString(PyExc_TypeError,
6592 "not enough arguments for format string");
6593 return NULL;
6594}
6595
6596#define F_LJUST (1<<0)
6597#define F_SIGN (1<<1)
6598#define F_BLANK (1<<2)
6599#define F_ALT (1<<3)
6600#define F_ZERO (1<<4)
6601
Martin v. Löwis18e16552006-02-15 17:27:45 +00006602static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006603strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006605 register Py_ssize_t i;
6606 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 for (i = len - 1; i >= 0; i--)
6608 buffer[i] = (Py_UNICODE) charbuffer[i];
6609
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 return len;
6611}
6612
Neal Norwitzfc76d632006-01-10 06:03:13 +00006613static int
6614doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6615{
Tim Peters15231542006-02-16 01:08:01 +00006616 Py_ssize_t result;
6617
Neal Norwitzfc76d632006-01-10 06:03:13 +00006618 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006619 result = strtounicode(buffer, (char *)buffer);
6620 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006621}
6622
6623static int
6624longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6625{
Tim Peters15231542006-02-16 01:08:01 +00006626 Py_ssize_t result;
6627
Neal Norwitzfc76d632006-01-10 06:03:13 +00006628 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006629 result = strtounicode(buffer, (char *)buffer);
6630 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006631}
6632
Guido van Rossum078151d2002-08-11 04:24:12 +00006633/* XXX To save some code duplication, formatfloat/long/int could have been
6634 shared with stringobject.c, converting from 8-bit to Unicode after the
6635 formatting is done. */
6636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637static int
6638formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006639 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 int flags,
6641 int prec,
6642 int type,
6643 PyObject *v)
6644{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006645 /* fmt = '%#.' + `prec` + `type`
6646 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 char fmt[20];
6648 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006649
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 x = PyFloat_AsDouble(v);
6651 if (x == -1.0 && PyErr_Occurred())
6652 return -1;
6653 if (prec < 0)
6654 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6656 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006657 /* Worst case length calc to ensure no buffer overrun:
6658
6659 'g' formats:
6660 fmt = %#.<prec>g
6661 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6662 for any double rep.)
6663 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6664
6665 'f' formats:
6666 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6667 len = 1 + 50 + 1 + prec = 52 + prec
6668
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006669 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006670 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006671
6672 */
6673 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6674 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006675 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006676 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006677 return -1;
6678 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006679 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6680 (flags&F_ALT) ? "#" : "",
6681 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006682 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683}
6684
Tim Peters38fd5b62000-09-21 05:43:11 +00006685static PyObject*
6686formatlong(PyObject *val, int flags, int prec, int type)
6687{
6688 char *buf;
6689 int i, len;
6690 PyObject *str; /* temporary string object. */
6691 PyUnicodeObject *result;
6692
6693 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6694 if (!str)
6695 return NULL;
6696 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006697 if (!result) {
6698 Py_DECREF(str);
6699 return NULL;
6700 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006701 for (i = 0; i < len; i++)
6702 result->str[i] = buf[i];
6703 result->str[len] = 0;
6704 Py_DECREF(str);
6705 return (PyObject*)result;
6706}
6707
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708static int
6709formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006710 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 int flags,
6712 int prec,
6713 int type,
6714 PyObject *v)
6715{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006716 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006717 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6718 * + 1 + 1
6719 * = 24
6720 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006721 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006722 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 long x;
6724
6725 x = PyInt_AsLong(v);
6726 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006727 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006728 if (x < 0 && type == 'u') {
6729 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006730 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006731 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6732 sign = "-";
6733 else
6734 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006736 prec = 1;
6737
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006738 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6739 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006740 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006741 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006742 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006743 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006744 return -1;
6745 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006746
6747 if ((flags & F_ALT) &&
6748 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006749 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006750 * of issues that cause pain:
6751 * - when 0 is being converted, the C standard leaves off
6752 * the '0x' or '0X', which is inconsistent with other
6753 * %#x/%#X conversions and inconsistent with Python's
6754 * hex() function
6755 * - there are platforms that violate the standard and
6756 * convert 0 with the '0x' or '0X'
6757 * (Metrowerks, Compaq Tru64)
6758 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006759 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006760 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006761 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006762 * We can achieve the desired consistency by inserting our
6763 * own '0x' or '0X' prefix, and substituting %x/%X in place
6764 * of %#x/%#X.
6765 *
6766 * Note that this is the same approach as used in
6767 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006768 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006769 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6770 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006771 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006772 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006773 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6774 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006775 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006776 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006777 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006778 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006779 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006780 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781}
6782
6783static int
6784formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006785 size_t buflen,
6786 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006788 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006789 if (PyUnicode_Check(v)) {
6790 if (PyUnicode_GET_SIZE(v) != 1)
6791 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006795 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006796 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006797 goto onError;
6798 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800
6801 else {
6802 /* Integer input truncated to a character */
6803 long x;
6804 x = PyInt_AsLong(v);
6805 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006806 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006807#ifdef Py_UNICODE_WIDE
6808 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006809 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006810 "%c arg not in range(0x110000) "
6811 "(wide Python build)");
6812 return -1;
6813 }
6814#else
6815 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006816 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006817 "%c arg not in range(0x10000) "
6818 "(narrow Python build)");
6819 return -1;
6820 }
6821#endif
6822 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 }
6824 buf[1] = '\0';
6825 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006826
6827 onError:
6828 PyErr_SetString(PyExc_TypeError,
6829 "%c requires int or char");
6830 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831}
6832
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006833/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6834
6835 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6836 chars are formatted. XXX This is a magic number. Each formatting
6837 routine does bounds checking to ensure no overflow, but a better
6838 solution may be to malloc a buffer of appropriate size for each
6839 format. For now, the current solution is sufficient.
6840*/
6841#define FORMATBUFLEN (size_t)120
6842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843PyObject *PyUnicode_Format(PyObject *format,
6844 PyObject *args)
6845{
6846 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006847 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 int args_owned = 0;
6849 PyUnicodeObject *result = NULL;
6850 PyObject *dict = NULL;
6851 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006852
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 if (format == NULL || args == NULL) {
6854 PyErr_BadInternalCall();
6855 return NULL;
6856 }
6857 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006858 if (uformat == NULL)
6859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 fmt = PyUnicode_AS_UNICODE(uformat);
6861 fmtcnt = PyUnicode_GET_SIZE(uformat);
6862
6863 reslen = rescnt = fmtcnt + 100;
6864 result = _PyUnicode_New(reslen);
6865 if (result == NULL)
6866 goto onError;
6867 res = PyUnicode_AS_UNICODE(result);
6868
6869 if (PyTuple_Check(args)) {
6870 arglen = PyTuple_Size(args);
6871 argidx = 0;
6872 }
6873 else {
6874 arglen = -1;
6875 argidx = -2;
6876 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006877 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6878 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 dict = args;
6880
6881 while (--fmtcnt >= 0) {
6882 if (*fmt != '%') {
6883 if (--rescnt < 0) {
6884 rescnt = fmtcnt + 100;
6885 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006886 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6889 --rescnt;
6890 }
6891 *res++ = *fmt++;
6892 }
6893 else {
6894 /* Got a format specifier */
6895 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006896 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 Py_UNICODE c = '\0';
6899 Py_UNICODE fill;
6900 PyObject *v = NULL;
6901 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006902 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006904 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006905 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906
6907 fmt++;
6908 if (*fmt == '(') {
6909 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006910 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 PyObject *key;
6912 int pcount = 1;
6913
6914 if (dict == NULL) {
6915 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006916 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 goto onError;
6918 }
6919 ++fmt;
6920 --fmtcnt;
6921 keystart = fmt;
6922 /* Skip over balanced parentheses */
6923 while (pcount > 0 && --fmtcnt >= 0) {
6924 if (*fmt == ')')
6925 --pcount;
6926 else if (*fmt == '(')
6927 ++pcount;
6928 fmt++;
6929 }
6930 keylen = fmt - keystart - 1;
6931 if (fmtcnt < 0 || pcount > 0) {
6932 PyErr_SetString(PyExc_ValueError,
6933 "incomplete format key");
6934 goto onError;
6935 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006936#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006937 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 then looked up since Python uses strings to hold
6939 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006940 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 key = PyUnicode_EncodeUTF8(keystart,
6942 keylen,
6943 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006944#else
6945 key = PyUnicode_FromUnicode(keystart, keylen);
6946#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (key == NULL)
6948 goto onError;
6949 if (args_owned) {
6950 Py_DECREF(args);
6951 args_owned = 0;
6952 }
6953 args = PyObject_GetItem(dict, key);
6954 Py_DECREF(key);
6955 if (args == NULL) {
6956 goto onError;
6957 }
6958 args_owned = 1;
6959 arglen = -1;
6960 argidx = -2;
6961 }
6962 while (--fmtcnt >= 0) {
6963 switch (c = *fmt++) {
6964 case '-': flags |= F_LJUST; continue;
6965 case '+': flags |= F_SIGN; continue;
6966 case ' ': flags |= F_BLANK; continue;
6967 case '#': flags |= F_ALT; continue;
6968 case '0': flags |= F_ZERO; continue;
6969 }
6970 break;
6971 }
6972 if (c == '*') {
6973 v = getnextarg(args, arglen, &argidx);
6974 if (v == NULL)
6975 goto onError;
6976 if (!PyInt_Check(v)) {
6977 PyErr_SetString(PyExc_TypeError,
6978 "* wants int");
6979 goto onError;
6980 }
6981 width = PyInt_AsLong(v);
6982 if (width < 0) {
6983 flags |= F_LJUST;
6984 width = -width;
6985 }
6986 if (--fmtcnt >= 0)
6987 c = *fmt++;
6988 }
6989 else if (c >= '0' && c <= '9') {
6990 width = c - '0';
6991 while (--fmtcnt >= 0) {
6992 c = *fmt++;
6993 if (c < '0' || c > '9')
6994 break;
6995 if ((width*10) / 10 != width) {
6996 PyErr_SetString(PyExc_ValueError,
6997 "width too big");
6998 goto onError;
6999 }
7000 width = width*10 + (c - '0');
7001 }
7002 }
7003 if (c == '.') {
7004 prec = 0;
7005 if (--fmtcnt >= 0)
7006 c = *fmt++;
7007 if (c == '*') {
7008 v = getnextarg(args, arglen, &argidx);
7009 if (v == NULL)
7010 goto onError;
7011 if (!PyInt_Check(v)) {
7012 PyErr_SetString(PyExc_TypeError,
7013 "* wants int");
7014 goto onError;
7015 }
7016 prec = PyInt_AsLong(v);
7017 if (prec < 0)
7018 prec = 0;
7019 if (--fmtcnt >= 0)
7020 c = *fmt++;
7021 }
7022 else if (c >= '0' && c <= '9') {
7023 prec = c - '0';
7024 while (--fmtcnt >= 0) {
7025 c = Py_CHARMASK(*fmt++);
7026 if (c < '0' || c > '9')
7027 break;
7028 if ((prec*10) / 10 != prec) {
7029 PyErr_SetString(PyExc_ValueError,
7030 "prec too big");
7031 goto onError;
7032 }
7033 prec = prec*10 + (c - '0');
7034 }
7035 }
7036 } /* prec */
7037 if (fmtcnt >= 0) {
7038 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 if (--fmtcnt >= 0)
7040 c = *fmt++;
7041 }
7042 }
7043 if (fmtcnt < 0) {
7044 PyErr_SetString(PyExc_ValueError,
7045 "incomplete format");
7046 goto onError;
7047 }
7048 if (c != '%') {
7049 v = getnextarg(args, arglen, &argidx);
7050 if (v == NULL)
7051 goto onError;
7052 }
7053 sign = 0;
7054 fill = ' ';
7055 switch (c) {
7056
7057 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007058 pbuf = formatbuf;
7059 /* presume that buffer length is at least 1 */
7060 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 len = 1;
7062 break;
7063
7064 case 's':
7065 case 'r':
7066 if (PyUnicode_Check(v) && c == 's') {
7067 temp = v;
7068 Py_INCREF(temp);
7069 }
7070 else {
7071 PyObject *unicode;
7072 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007073 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 else
7075 temp = PyObject_Repr(v);
7076 if (temp == NULL)
7077 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007078 if (PyUnicode_Check(temp))
7079 /* nothing to do */;
7080 else if (PyString_Check(temp)) {
7081 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007082 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007084 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007086 Py_DECREF(temp);
7087 temp = unicode;
7088 if (temp == NULL)
7089 goto onError;
7090 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007091 else {
7092 Py_DECREF(temp);
7093 PyErr_SetString(PyExc_TypeError,
7094 "%s argument has non-string str()");
7095 goto onError;
7096 }
7097 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007098 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 len = PyUnicode_GET_SIZE(temp);
7100 if (prec >= 0 && len > prec)
7101 len = prec;
7102 break;
7103
7104 case 'i':
7105 case 'd':
7106 case 'u':
7107 case 'o':
7108 case 'x':
7109 case 'X':
7110 if (c == 'i')
7111 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007112 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007113 temp = formatlong(v, flags, prec, c);
7114 if (!temp)
7115 goto onError;
7116 pbuf = PyUnicode_AS_UNICODE(temp);
7117 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007118 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007120 else {
7121 pbuf = formatbuf;
7122 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7123 flags, prec, c, v);
7124 if (len < 0)
7125 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007126 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007127 }
7128 if (flags & F_ZERO)
7129 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 break;
7131
7132 case 'e':
7133 case 'E':
7134 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007135 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 case 'g':
7137 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007138 if (c == 'F')
7139 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007140 pbuf = formatbuf;
7141 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7142 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 if (len < 0)
7144 goto onError;
7145 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007146 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 fill = '0';
7148 break;
7149
7150 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007151 pbuf = formatbuf;
7152 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 if (len < 0)
7154 goto onError;
7155 break;
7156
7157 default:
7158 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007159 "unsupported format character '%c' (0x%x) "
7160 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007161 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007162 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007163 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 goto onError;
7165 }
7166 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007167 if (*pbuf == '-' || *pbuf == '+') {
7168 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 len--;
7170 }
7171 else if (flags & F_SIGN)
7172 sign = '+';
7173 else if (flags & F_BLANK)
7174 sign = ' ';
7175 else
7176 sign = 0;
7177 }
7178 if (width < len)
7179 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007180 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 reslen -= rescnt;
7182 rescnt = width + fmtcnt + 100;
7183 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007184 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007185 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007186 PyErr_NoMemory();
7187 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007188 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007189 if (_PyUnicode_Resize(&result, reslen) < 0) {
7190 Py_XDECREF(temp);
7191 goto onError;
7192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 res = PyUnicode_AS_UNICODE(result)
7194 + reslen - rescnt;
7195 }
7196 if (sign) {
7197 if (fill != ' ')
7198 *res++ = sign;
7199 rescnt--;
7200 if (width > len)
7201 width--;
7202 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007203 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7204 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007205 assert(pbuf[1] == c);
7206 if (fill != ' ') {
7207 *res++ = *pbuf++;
7208 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007209 }
Tim Petersfff53252001-04-12 18:38:48 +00007210 rescnt -= 2;
7211 width -= 2;
7212 if (width < 0)
7213 width = 0;
7214 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 if (width > len && !(flags & F_LJUST)) {
7217 do {
7218 --rescnt;
7219 *res++ = fill;
7220 } while (--width > len);
7221 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007222 if (fill == ' ') {
7223 if (sign)
7224 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007225 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007226 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007227 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007228 *res++ = *pbuf++;
7229 *res++ = *pbuf++;
7230 }
7231 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007232 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 res += len;
7234 rescnt -= len;
7235 while (--width >= len) {
7236 --rescnt;
7237 *res++ = ' ';
7238 }
7239 if (dict && (argidx < arglen) && c != '%') {
7240 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007241 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007242 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 goto onError;
7244 }
7245 Py_XDECREF(temp);
7246 } /* '%' */
7247 } /* until end */
7248 if (argidx < arglen && !dict) {
7249 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007250 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 goto onError;
7252 }
7253
Thomas Woutersa96affe2006-03-12 00:29:36 +00007254 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7255 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 if (args_owned) {
7257 Py_DECREF(args);
7258 }
7259 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 return (PyObject *)result;
7261
7262 onError:
7263 Py_XDECREF(result);
7264 Py_DECREF(uformat);
7265 if (args_owned) {
7266 Py_DECREF(args);
7267 }
7268 return NULL;
7269}
7270
/* Buffer-interface slot table for the unicode type; PyUnicode_Type points
   at it through tp_as_buffer.  The entries are, in order, the read-buffer,
   write-buffer, segment-count and char-buffer handlers, as the casts on
   each line indicate. */
7271static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007272 (readbufferproc) unicode_buffer_getreadbuf,
7273 (writebufferproc) unicode_buffer_getwritebuf,
7274 (segcountproc) unicode_buffer_getsegcount,
7275 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276};
7277
Jeremy Hylton938ace62002-07-17 16:30:39 +00007278static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007279unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7280
Tim Peters6d6c1a32001-08-02 04:15:00 +00007281static PyObject *
7282unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7283{
7284 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007285 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007286 char *encoding = NULL;
7287 char *errors = NULL;
7288
Guido van Rossume023fe02001-08-30 03:12:59 +00007289 if (type != &PyUnicode_Type)
7290 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007291 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7292 kwlist, &x, &encoding, &errors))
7293 return NULL;
7294 if (x == NULL)
7295 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007296 if (encoding == NULL && errors == NULL)
7297 return PyObject_Unicode(x);
7298 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007299 return PyUnicode_FromEncodedObject(x, encoding, errors);
7300}
7301
Guido van Rossume023fe02001-08-30 03:12:59 +00007302static PyObject *
7303unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7304{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007305 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007306 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007307
7308 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7309 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7310 if (tmp == NULL)
7311 return NULL;
7312 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007313 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007314 if (pnew == NULL) {
7315 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007316 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007317 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007318 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7319 if (pnew->str == NULL) {
7320 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007321 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007322 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007323 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007324 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007325 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7326 pnew->length = n;
7327 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007328 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007329 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007330}
7331
/* Docstring of the unicode type; installed as tp_doc of PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007332PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007333"unicode(string [, encoding[, errors]]) -> object\n\
7334\n\
7335Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007336encoding defaults to the current default string encoding.\n\
7337errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007338
/* Type object for "unicode".  It derives from PyBaseString_Type, may be
   subclassed (Py_TPFLAGS_BASETYPE), and creates instances through
   unicode_new.  Slots set to 0 are not provided here.
   NOTE(review): tp_alloc is 0 although unicode_subtype_new() calls
   type->tp_alloc; presumably PyType_Ready() in _PyUnicode_Init() fills it
   in by inheritance -- confirm. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339PyTypeObject PyUnicode_Type = {
7340 PyObject_HEAD_INIT(&PyType_Type)
7341 0, /* ob_size */
7342 "unicode", /* tp_name */
7343 sizeof(PyUnicodeObject), /* tp_basicsize */
7344 0, /* tp_itemsize */
7345 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007346 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007348 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 0, /* tp_setattr */
7350 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007351 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007352 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007354 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 (hashfunc) unicode_hash, /* tp_hash*/
7356 0, /* tp_call*/
7357 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007358 PyObject_GenericGetAttr, /* tp_getattro */
7359 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007361 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7362 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007363 unicode_doc, /* tp_doc */
7364 0, /* tp_traverse */
7365 0, /* tp_clear */
7366 0, /* tp_richcompare */
7367 0, /* tp_weaklistoffset */
7368 0, /* tp_iter */
7369 0, /* tp_iternext */
7370 unicode_methods, /* tp_methods */
7371 0, /* tp_members */
7372 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007373 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007374 0, /* tp_dict */
7375 0, /* tp_descr_get */
7376 0, /* tp_descr_set */
7377 0, /* tp_dictoffset */
7378 0, /* tp_init */
7379 0, /* tp_alloc */
7380 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007381 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382};
7383
7384/* Initialize the Unicode implementation */
7385
Thomas Wouters78890102000-07-22 19:25:51 +00007386void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007388 int i;
7389
Fred Drakee4315f52000-05-09 19:53:39 +00007390 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007391 unicode_freelist = NULL;
7392 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007394 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007395 for (i = 0; i < 256; i++)
7396 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007397 if (PyType_Ready(&PyUnicode_Type) < 0)
7398 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399}
7400
7401/* Finalize the Unicode implementation */
7402
7403void
Thomas Wouters78890102000-07-22 19:25:51 +00007404_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007406 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007407 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007409 Py_XDECREF(unicode_empty);
7410 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007411
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007412 for (i = 0; i < 256; i++) {
7413 if (unicode_latin1[i]) {
7414 Py_DECREF(unicode_latin1[i]);
7415 unicode_latin1[i] = NULL;
7416 }
7417 }
7418
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007419 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 PyUnicodeObject *v = u;
7421 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007422 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007423 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007424 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007425 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007427 unicode_freelist = NULL;
7428 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007430
Anthony Baxterac6bd462006-04-13 02:06:09 +00007431#ifdef __cplusplus
7432}
7433#endif
7434
7435
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007436/*
7437Local variables:
7438c-basic-offset: 4
7439indent-tabs-mode: nil
7440End:
7441*/